Skip to content

improve performance with ties #74

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 12 commits into from
Aug 23, 2023
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
name = "Loess"
uuid = "4345ca2d-374a-55d4-8d30-97f9976e7612"
version = "0.6.1"
version = "0.6.2"

[deps]
Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
Expand Down
77 changes: 46 additions & 31 deletions src/kd.jl
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,6 @@ function KDTree(
) where T <: AbstractFloat

n, m = size(xs)
perm = collect(1:n)

bounds = Array{T}(undef, 2, m)
for j in 1:m
Expand All @@ -63,14 +62,13 @@ function KDTree(
push!(verts, T[vert...])
end

perm = collect(1:n)
root = build_kdtree(xs, perm, bounds, leaf_size_cutoff, leaf_diameter_cutoff, verts)

KDTree(convert(Matrix{T}, xs), collect(1:n), root, verts, bounds)
KDTree(convert(Matrix{T}, xs), perm, root, verts, bounds)
end




"""
diameter(bounds)

Expand All @@ -88,6 +86,32 @@ function diameter(bounds::Matrix)
euclidean(vec(bounds[1,:]), vec(bounds[2,:]))
end

"""
_select_j(xs::AbstractMatrix{T})

Select the column for sorting the rows xs based on the column with the largest spread.
"""
function _select_j(xs::AbstractMatrix{T}) where {T <: AbstractFloat}
size(xs, 2) == 1 && return 1

# split on the dimension with the largest spread
# maxspread, j = findmax(maximum(xs[perm, k]) - minimum(xs[perm, k]) for k in 1:m)
j = 1
maxspread = 0
@inbounds for k in axes(xs, 2)
xmin = Inf
xmax = -Inf
@inbounds for i in axes(xs, 1)
xmin = min(xmin, xs[i, k])
xmax = max(xmax, xs[i, k])
end
if xmax - xmin > maxspread
maxspread = xmax - xmin
j = k
end
end
return j
end

"""
build_kdtree(xs, perm, bounds, leaf_size_cutoff, leaf_diameter_cutoff, verts)
Expand Down Expand Up @@ -121,30 +145,22 @@ function build_kdtree(xs::AbstractMatrix{T},
Base.require_one_based_indexing(xs)
Base.require_one_based_indexing(perm)

j = _select_j(xs)
n, m = size(xs)
# performance testing showed that sorting everything at once was dramatically faster
# than repeated partial sorting with partialsort! when there are ties:
# https://github.com/JuliaStats/Loess.jl/pull/74
if !issorted(view(xs, perm, j))
@debug "received unsorted data, sorting"
sortperm!(perm, view(xs, :, j))
end
xjs = view(xs, perm, j)

if length(perm) <= leaf_size_cutoff || diameter(bounds) <= leaf_diameter_cutoff
@debug "Creating leaf node" length(perm) leaf_size_cutoff diameter(bounds) leaf_diameter_cutoff
return nothing
end

# split on the dimension with the largest spread
# maxspread, j = findmax(maximum(xs[perm, k]) - minimum(xs[perm, k]) for k in 1:m)
j = 1
maxspread = 0
for k in 1:m
xmin = Inf
xmax = -Inf
for i in perm
xmin = min(xmin, xs[i, k])
xmax = max(xmax, xs[i, k])
end
if xmax - xmin > maxspread
maxspread = xmax - xmin
j = k
end
end

# Find the "median" and partition
#
# The aim of the algorithm is to split the data recursively in two roughly equally sized
Expand All @@ -165,37 +181,36 @@ function build_kdtree(xs::AbstractMatrix{T},
#
# The details here are reversed engineered from the C/Fortran implementation wrapped
# by R and also distribtued on NETLIB.
mid = (length(perm) + 1) ÷ 2
@debug "Candidate median index and median value" mid xs[perm[mid], j]
mid = (length(xjs) + 1) ÷ 2
@debug "Candidate median index and median value" mid xjs[mid]

offset = 0
local mid1, mid2
while true
mid1 = mid + offset
mid2 = mid1 + 1
if mid1 < 1
@debug "mid1 is zero. All elements are identical. Creating vertex and then two leaves" mid1 length(perm) xs[perm[mid], j]
@debug "mid1 is zero. All elements are identical. Creating vertex and then two leaves" mid1 length(xjs) xjs[mid]
offset = mid1 = 0
mid2 = length(perm) + 1
mid2 = length(xjs) + 1
break
end
if mid2 > length(perm)
@debug "mid2 is out of bounds. Continuing with negative offset" mid2 length(perm) offset
if mid2 > length(xjs)
@debug "mid2 is out of bounds. Continuing with negative offset" mid2 length(xjs) offset
# This makes the offset 0, 1, -1, 2, -2, ...
offset = -offset + (offset <= 0)
continue
end
p12 = partialsort!(perm, mid1:mid2, by = i -> xs[i, j])
if xs[p12[1], j] == xs[p12[2], j]
@debug "tie! Adjusting offset" xs[p12[1], j] xs[p12[2], j] offset
if xjs[mid1] == xjs[mid2]
# @debug "tie! Adjusting offset" xs[p12[1], j] xs[p12[2], j] offset
# This makes the offset 0, 1, -1, 2, -2, ...
offset = -offset + (offset <= 0)
else
break
end
end
mid += offset
med = xs[perm[mid], j]
med = xjs[mid]
@debug "Accepted median index and median value" mid med

leftbounds = copy(bounds)
Expand Down
14 changes: 14 additions & 0 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,20 @@ end
@test_broken predict(model, x)[end] ≈ pred[end] atol=1e-5
end

@testset "lots of ties" begin
# adapted from https://github.com/JuliaStats/Loess.jl/pull/74#discussion_r1294303522
x = repeat([π/4*i for i in -20:20], inner=101)
y = sin.(x)

model = loess(x,y; span=0.2)
for i in -3:3
@test predict(model, i * π) ≈ 0 atol=1e-12
# not great tolerance but loess also struggles to capture the sine peaks
@test abs(predict(model, i * π + π / 2 )) ≈ 0.9 atol=0.1
end

end

@test_throws DimensionMismatch loess([1.0 2.0; 3.0 4.0], [1.0])

@testset "Issue 28" begin
Expand Down