Handle prediction at tree split points separately. Always use qr

andreasnoack · andreasnoack · commit df1bf95dc0a6 · 2020-12-14T21:54:23.000+01:00
factorization when solving the local system to avoid error when
the system is singular. Also fix an off-by-one error in the median
calculation in the KDTree implementation.
diff --git a/Project.toml b/Project.toml
@@ -4,11 +4,12 @@ version = "0.5.2"
 
 [deps]
 Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
+LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 
 [compat]
-julia = "0.7, 1"
 Distances = "0.7, 0.8, 0.9, 0.10"
+julia = "0.7, 1"
 
 [extras]
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
diff --git a/src/Loess.jl b/src/Loess.jl
@@ -2,7 +2,7 @@ module Loess
 
 import Distances.euclidean
 
-using Statistics
+using Statistics, LinearAlgebra
 
 export loess, predict
 
@@ -44,6 +44,9 @@ function loess(xs::AbstractMatrix{T}, ys::AbstractVector{T};
 
     n, m = size(xs)
     q = ceil(Int, (span * n))
+    if q < degree + 1
+        throw(ArgumentError("neighborhood size must be larger than degree+1=$(degree + 1) but was $q. Try increasing the value of span."))
+    end
 
     # TODO: We need to keep track of how we are normalizing so we can
     # correctly apply predict to unnormalized data. We should have a normalize
@@ -53,7 +56,6 @@ function loess(xs::AbstractMatrix{T}, ys::AbstractVector{T};
     end
 
     kdtree = KDTree(xs, 0.05 * span)
-    verts = Array{T}(undef, length(kdtree.verts), m)
 
     # map verticies to their index in the bs coefficient matrix
     verts = Dict{Vector{T}, Int}()
@@ -69,6 +71,7 @@ function loess(xs::AbstractMatrix{T}, ys::AbstractVector{T};
     # TODO: higher degree fitting
     us = Array{T}(undef, q, 1 + degree * m)
     vs = Array{T}(undef, q)
+
     for (vert, k) in verts
         # reset perm
         for i in 1:n
@@ -85,20 +88,22 @@ function loess(xs::AbstractMatrix{T}, ys::AbstractVector{T};
         dmax = maximum([ds[perm[i]] for i = 1:q])
 
         for i in 1:q
-            pi = perm[i]
-            w = tricubic(ds[pi] / dmax)
+            pᵢ = perm[i]
+            w = tricubic(ds[pᵢ] / dmax)
             us[i,1] = w
             for j in 1:m
-                x = xs[pi, j]
+                x = xs[pᵢ, j]
                 wxl = w
                 for l in 1:degree
                     wxl *= x
-                    us[i, 1 + (j-1)*degree + l] = wxl # w*x^l
+                    us[i, 1 + (j - 1)*degree + l] = wxl # w*x^l
                 end
             end
-            vs[i] = ys[pi] * w
+            vs[i] = ys[pᵢ] * w
         end
-        bs[k,:] = us \ vs
+
+        F = qr(us, Val(true))
+        bs[k,:] = F\vs
     end
 
     LoessModel{T}(xs, ys, bs, verts, kdtree)
@@ -149,11 +154,16 @@ function predict(model::LoessModel{T}, zs::AbstractVector{T}) where T <: Abstrac
     if m == 1
         @assert(length(adjacent_verts) == 2)
         z = zs[1]
-        u = (z - adjacent_verts[1][1]) /
-        (adjacent_verts[2][1] - adjacent_verts[1][1])
+        v₁, v₂ = adjacent_verts[1][1], adjacent_verts[2][1]
+
+        if z == v₁ || z == v₂
+            return evalpoly(zs, model.bs[model.verts[[z]],:])
+        end
+
+        u = (z - v₁)/(v₂ - v₁)
 
-        y1 = evalpoly(zs, model.bs[model.verts[[adjacent_verts[1][1]]],:])
-        y2 = evalpoly(zs, model.bs[model.verts[[adjacent_verts[2][1]]],:])
+        y1 = evalpoly(zs, model.bs[model.verts[[v₁]],:])
+        y2 = evalpoly(zs, model.bs[model.verts[[v₂]],:])
         return (1.0 - u) * y1 + u * y2
     else
         error("Multivariate blending not yet implemented")
diff --git a/src/kd.jl b/src/kd.jl
@@ -143,7 +143,7 @@ function build_kdtree(xs::AbstractMatrix{T},
 
     # find the median and partition
     if isodd(length(perm))
-        mid = length(perm) ÷ 2
+        mid = (length(perm) + 1) ÷ 2
         partialsort!(perm, mid, by=i -> xs[i, j])
         med = xs[perm[mid], j]
         mid1 = mid
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -32,3 +32,24 @@ let x = 1:10, y = sin.(1:10)
 end
 
 @test_throws DimensionMismatch loess([1.0 2.0; 3.0 4.0], [1.0])
+
+@testset "Issue 28" begin
+    @testset "Example 1" begin
+        x = [1.0, 2.0, 3.0, 4.0]
+        y = [1.0, 2.0, 3.0, 4.0]
+        @test_throws ArgumentError("neighborhood size must be larger than degree+1=3 but was 1. Try increasing the value of span.") loess(x, y, span = 0.25)
+        @test_throws ArgumentError("neighborhood size must be larger than degree+1=3 but was 2. Try increasing the value of span.") loess(x, y, span = 0.33)
+        @test predict(loess(x, y), x) ≈ x
+    end
+
+    @testset "Example 2" begin
+        x = [1.0, 1.0, 2.0, 3.0, 4.0, 4.0]
+        y = [1.0, 1.0, 2.0, 3.0, 4.0, 4.0]
+        @test_throws ArgumentError("neighborhood size must be larger than degree+1=3 but was 2. Try increasing the value of span.") loess(x, y, span = 0.33)
+        # For 0.4 and 0.5 these currently don't hit the middle values. I suspect
+        # the issue is related to the ties in x.
+        @test_broken predict(loess(x, y, span = 0.4), x) ≈ x
+        @test_broken predict(loess(x, y, span = 0.5), x) ≈ x
+        @test predict(loess(x, y, span = 0.6), x) ≈ x
+    end
+end