1111 get_points_array ,
1212 WSP2W ,
1313)
14-
1514import copy
1615from warnings import warn as Warn
1716from scipy .spatial import distance_matrix
@@ -84,7 +83,20 @@ class KNN(W):
8483 Notes
8584 -----
8685
87- Ties between neighbors of equal distance are arbitrarily broken.
86+ Ties between neighbors of equal distance are arbitrarily broken.
87+
88+ Further, if many points occupy the same spatial location (i.e. observations are
89+ coincident), then you may need to increase k for those observations to
90+ acquire neighbors at different spatial locations. For example, if five
91+ points are coincident, then their four nearest neighbors will all
92+ occupy the same spatial location; only the fifth nearest neighbor will
93+ result in those coincident points becoming connected to the graph as a
94+ whole.
95+
96+ Solutions to this problem include jittering the points (by adding
97+ a small random value to each observation's location) or adding
98+ higher-k neighbors only to the coincident points, using the
99+ weights.w_sets.w_union() function.
88100
89101 See Also
90102 --------
@@ -111,19 +123,30 @@ def __init__(
111123 self .data = self .kdtree .data
112124 self .k = k
113125 self .p = p
114- this_nnq = self .kdtree .query (self .data , k = k + 1 , p = p )
115126
116- to_weight = this_nnq [1 ]
127+ # these are both n x k+1
128+ distances , indices = self .kdtree .query (self .data , k = k + 1 , p = p )
129+ full_indices = np .arange (self .kdtree .n )
130+
131+ # if an element in the indices matrix is equal to the corresponding
132+ # index for that row, we want to mask that site from its neighbors
133+ not_self_mask = indices != full_indices .reshape (- 1 , 1 )
134+ # if there are *too many duplicates per site*, then we may get some
135+ # rows where the site index is not in the set of k+1 neighbors
136+ # So, we need to know where these sites are
137+ has_one_too_many = not_self_mask .sum (axis = 1 ) == (k + 1 )
138+ # if a site has k+1 neighbors, drop its k+1th neighbor
139+ not_self_mask [has_one_too_many , - 1 ] &= False
140+ not_self_indices = indices [not_self_mask ].reshape (self .kdtree .n , - 1 )
141+
142+ to_weight = not_self_indices
117143 if ids is None :
118- ids = list (range (to_weight .shape [0 ]))
119-
120- neighbors = {}
121- for i , row in enumerate (to_weight ):
122- row = row .tolist ()
123- row .remove (i )
124- row = [ids [j ] for j in row ]
125- focal = ids [i ]
126- neighbors [focal ] = row
144+ ids = list (full_indices )
145+ named_indices = not_self_indices
146+ else :
147+ named_indices = np .asarray (ids )[not_self_indices ]
148+ neighbors = {idx : list (indices ) for idx , indices in zip (ids , named_indices )}
149+
127150 W .__init__ (self , neighbors , id_order = ids , ** kwargs )
128151
129152 @classmethod
@@ -693,6 +716,7 @@ class DistanceBand(W):
693716 threshold : float
694717 distance band
695718 p : float
719+ DEPRECATED: use `distance_metric`
696720 Minkowski p-norm distance metric parameter:
697721 1<=p<=infinity
698722 2: Euclidean distance
@@ -709,6 +733,7 @@ class DistanceBand(W):
709733 values to use for keys of the neighbors and weights dicts
710734
711735 build_sp : boolean
736+ DEPRECATED
712737 True to build sparse distance matrix and false to build dense
713738 distance matrix; significant speed gains may be obtained
714739 depending on the sparsity of the distance_matrix and
@@ -766,12 +791,6 @@ class DistanceBand(W):
766791 >>> w.weights[0]
767792 [0.01, 0.007999999999999998]
768793
769- Notes
770- -----
771-
772- This was initially implemented running scipy 0.8.0dev (in epd 6.1).
773- earlier versions of scipy (0.7.0) have a logic bug in scipy/sparse/dok.py
774- so serge changed line 221 of that file on sal-dev to fix the logic bug.
775794
776795 """
777796
@@ -821,6 +840,7 @@ def __init__(
821840 else :
822841 self .data = data
823842 self .kdtree = None
843+
824844 self ._band ()
825845 neighbors , weights = self ._distance_to_W (ids )
826846 W .__init__ (
@@ -862,6 +882,7 @@ def from_array(cls, array, threshold, **kwargs):
862882
863883 @classmethod
864884 def from_dataframe (cls , df , threshold , geom_col = None , ids = None , ** kwargs ):
885+
865886 """
866887 Make DistanceBand weights from a dataframe.
867888
0 commit comments