@@ -41,7 +41,6 @@ function KDTree(
41
41
) where T <: AbstractFloat
42
42
43
43
n, m = size (xs)
44
- perm = collect (1 : n)
45
44
46
45
bounds = Array {T} (undef, 2 , m)
47
46
for j in 1 : m
@@ -63,14 +62,13 @@ function KDTree(
63
62
push! (verts, T[vert... ])
64
63
end
65
64
65
+ perm = collect (1 : n)
66
66
root = build_kdtree (xs, perm, bounds, leaf_size_cutoff, leaf_diameter_cutoff, verts)
67
67
68
- KDTree (convert (Matrix{T}, xs), collect ( 1 : n) , root, verts, bounds)
68
+ KDTree (convert (Matrix{T}, xs), perm , root, verts, bounds)
69
69
end
70
70
71
71
72
-
73
-
74
72
"""
75
73
diameter(bounds)
76
74
@@ -88,6 +86,32 @@ function diameter(bounds::Matrix)
88
86
euclidean (vec (bounds[1 ,:]), vec (bounds[2 ,:]))
89
87
end
90
88
89
"""
    _select_j(xs::AbstractMatrix{T})

Return the index of the column of `xs` with the largest spread, i.e. the largest
difference between the maximum and the minimum taken over all rows. This is the
column used for sorting the rows of `xs`.

Ties are resolved in favour of the lowest column index. `1` is returned when `xs` has
a single column, when it has no rows, or when no column has a strictly positive
spread (a column containing `NaN` has a `NaN` spread and never compares as larger).
"""
function _select_j(xs::AbstractMatrix{T}) where {T <: AbstractFloat}
    size(xs, 2) == 1 && return 1

    # Accumulate in a single, fixed type so that `xmin`, `xmax` and `maxspread` never
    # change type inside the loops. Using at least Float64 keeps the spread of narrow
    # element types (Float16/Float32) from overflowing.
    S = promote_type(T, Float64)

    j = 1
    maxspread = zero(S)
    # Indices come from `axes`, so skipping bounds checks is safe here.
    @inbounds for k in axes(xs, 2)
        xmin = S(Inf)
        xmax = S(-Inf)
        for i in axes(xs, 1)
            x = xs[i, k]
            xmin = min(xmin, x)
            xmax = max(xmax, x)
        end
        spread = xmax - xmin
        # Strict comparison: the first column reaching the maximum spread wins.
        if spread > maxspread
            maxspread = spread
            j = k
        end
    end
    return j
end
91
115
92
116
"""
93
117
build_kdtree(xs, perm, bounds, leaf_size_cutoff, leaf_diameter_cutoff, verts)
@@ -121,30 +145,22 @@ function build_kdtree(xs::AbstractMatrix{T},
121
145
Base. require_one_based_indexing (xs)
122
146
Base. require_one_based_indexing (perm)
123
147
148
+ j = _select_j (xs)
124
149
n, m = size (xs)
150
+ # performance testing showed that sorting everything at once was dramatically faster
151
+ # than repeated partial sorting with partialsort! when there are ties:
152
+ # https://github.com/JuliaStats/Loess.jl/pull/74
153
+ if ! issorted (view (xs, perm, j))
154
+ @debug " received unsorted data, sorting"
155
+ sortperm! (perm, view (xs, :, j))
156
+ end
157
+ xjs = view (xs, perm, j)
125
158
126
159
if length (perm) <= leaf_size_cutoff || diameter (bounds) <= leaf_diameter_cutoff
127
160
@debug " Creating leaf node" length (perm) leaf_size_cutoff diameter (bounds) leaf_diameter_cutoff
128
161
return nothing
129
162
end
130
163
131
- # split on the dimension with the largest spread
132
- # maxspread, j = findmax(maximum(xs[perm, k]) - minimum(xs[perm, k]) for k in 1:m)
133
- j = 1
134
- maxspread = 0
135
- for k in 1 : m
136
- xmin = Inf
137
- xmax = - Inf
138
- for i in perm
139
- xmin = min (xmin, xs[i, k])
140
- xmax = max (xmax, xs[i, k])
141
- end
142
- if xmax - xmin > maxspread
143
- maxspread = xmax - xmin
144
- j = k
145
- end
146
- end
147
-
148
164
# Find the "median" and partition
149
165
#
150
166
# The aim of the algorithm is to split the data recursively in two roughly equally sized
@@ -165,37 +181,36 @@ function build_kdtree(xs::AbstractMatrix{T},
165
181
#
166
182
# The details here are reverse engineered from the C/Fortran implementation wrapped
167
183
# by R and also distributed on NETLIB.
168
- mid = (length (perm ) + 1 ) ÷ 2
169
- @debug " Candidate median index and median value" mid xs[perm[ mid], j ]
184
+ mid = (length (xjs ) + 1 ) ÷ 2
185
+ @debug " Candidate median index and median value" mid xjs[ mid]
170
186
171
187
offset = 0
172
188
local mid1, mid2
173
189
while true
174
190
mid1 = mid + offset
175
191
mid2 = mid1 + 1
176
192
if mid1 < 1
177
- @debug " mid1 is zero. All elements are identical. Creating vertex and then two leaves" mid1 length (perm) xs[perm[ mid], j ]
193
+ @debug " mid1 is zero. All elements are identical. Creating vertex and then two leaves" mid1 length (xjs) xjs[ mid]
178
194
offset = mid1 = 0
179
- mid2 = length (perm ) + 1
195
+ mid2 = length (xjs ) + 1
180
196
break
181
197
end
182
- if mid2 > length (perm )
183
- @debug " mid2 is out of bounds. Continuing with negative offset" mid2 length (perm ) offset
198
+ if mid2 > length (xjs )
199
+ @debug " mid2 is out of bounds. Continuing with negative offset" mid2 length (xjs ) offset
184
200
# This makes the offset 0, 1, -1, 2, -2, ...
185
201
offset = - offset + (offset <= 0 )
186
202
continue
187
203
end
188
- p12 = partialsort! (perm, mid1: mid2, by = i -> xs[i, j])
189
- if xs[p12[1 ], j] == xs[p12[2 ], j]
190
- @debug " tie! Adjusting offset" xs[p12[1 ], j] xs[p12[2 ], j] offset
204
+ if xjs[mid1] == xjs[mid2]
205
+ # @debug "tie! Adjusting offset" xs[p12[1], j] xs[p12[2], j] offset
191
206
# This makes the offset 0, 1, -1, 2, -2, ...
192
207
offset = - offset + (offset <= 0 )
193
208
else
194
209
break
195
210
end
196
211
end
197
212
mid += offset
198
- med = xs[perm[ mid], j ]
213
+ med = xjs[ mid]
199
214
@debug " Accepted median index and median value" mid med
200
215
201
216
leftbounds = copy (bounds)
0 commit comments