1010
1111from ..distances import get_distance_matrix
1212
13+ try :
14+ from sklearn .cluster import KMeans
15+ from sklearn .utils .validation import check_array
16+ HAS_SKLEARN = True
17+ except ImportError :
18+ HAS_SKLEARN = False
19+
1320
1421def initialize_centroids (points : np .ndarray , k : int , random_state : int | None = None ) -> np .ndarray :
1522 """
@@ -55,6 +62,98 @@ def move_centroids(points: np.ndarray, closest: np.ndarray, centroids: np.ndarra
5562 return np .array (new_centroids )
5663
5764
class CustomKMeans(KMeans if HAS_SKLEARN else object):
    """
    K-means estimator that assigns points with a pluggable distance metric.

    When scikit-learn is importable the class derives from ``KMeans``;
    otherwise it derives from ``object`` and ``fit`` delegates to
    ``_kmeans_cluster_original``.

    Args:
        n_clusters: Number of clusters to form.
        distance_method: Metric name forwarded to ``get_distance_matrix``
            as ``method`` (e.g. "euclidean", "haversine", "osrm", "google").
        max_iter: Maximum number of assignment/update iterations.
        random_state: Seed forwarded to sklearn's ``KMeans`` or to the
            fallback implementation.
        **distance_kwargs: Extra keyword arguments forwarded to
            ``get_distance_matrix``.

    Attributes set by ``fit``:
        cluster_centers_: Final centroids.
        labels_: Cluster index assigned to each input row.
        n_iter_: Number of iterations performed.
        converged_: True when the centroids stopped moving (within
            tolerance) before the iteration budget ran out. This removes the
            ambiguity of ``n_iter_ == max_iter``, which can mean either
            "converged on the last pass" or "did not converge".
    """

    def __init__(self, n_clusters=8, distance_method="euclidean", max_iter=300, random_state=None, **distance_kwargs):
        if HAS_SKLEARN:
            # Let sklearn's KMeans set up its own state for the shared parameters.
            super().__init__(n_clusters=n_clusters, max_iter=max_iter, random_state=random_state)
        # Store every parameter unconditionally: without sklearn the base class
        # is ``object`` and nothing else would record max_iter / random_state,
        # which the fallback path needs.
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.random_state = random_state
        self.distance_method = distance_method
        self.distance_kwargs = distance_kwargs

    def _transform(self, X):
        """
        Return the distance matrix between ``X`` and the fitted centroids.

        Overrides sklearn's distance calculation so the configured metric is
        used instead of the Euclidean one.

        Raises:
            ImportError: If scikit-learn is not installed.
        """
        if not HAS_SKLEARN:
            raise ImportError("sklearn is required for CustomKMeans. Install with: pip install 'allocator[algorithms]'")

        return get_distance_matrix(
            X,
            self.cluster_centers_,
            method=self.distance_method,
            **self.distance_kwargs,
        )

    def _update_centroids(self, X, labels):
        """
        Recompute each centroid as the coordinate-wise mean of its members.

        Args:
            X: Data points, one row per point.
            labels: Cluster index assigned to each row of ``X``.

        Returns:
            Array of ``n_clusters`` centroids. A cluster with no members
            keeps its previous centroid from ``self.cluster_centers_``.
        """
        new_centroids = []
        for k in range(self.n_clusters):
            mask = labels == k
            if np.any(mask):
                # Plain arithmetic mean of the member coordinates, regardless
                # of which distance metric drove the assignment step.
                new_centroids.append(np.mean(X[mask], axis=0))
            else:
                # Empty cluster: keep the old centroid rather than dropping it.
                new_centroids.append(self.cluster_centers_[k])
        return np.array(new_centroids)

    def fit(self, X, y=None, sample_weight=None):
        """
        Fit k-means using the configured distance metric.

        Args:
            X: Data points, one row per point.
            y: Ignored; accepted for sklearn API compatibility.
            sample_weight: Accepted for sklearn API compatibility; it is not
                used by this implementation.

        Returns:
            self, with ``cluster_centers_``, ``labels_``, ``n_iter_`` and
            ``converged_`` populated.
        """
        if not HAS_SKLEARN:
            # No sklearn: delegate to the pure implementation in this module.
            return self._fit_custom_implementation(X)

        X = check_array(X, accept_sparse='csr', dtype=[np.float64, np.float32])

        # NOTE: this runs sklearn's complete (Euclidean) KMeans fit, not just
        # its initialisation. The resulting centers and labels are the
        # starting point for the custom-metric iterations below.
        super().fit(X)

        # Seed from sklearn's result so the attributes below are well defined
        # even if the loop body never executes.
        labels = self.labels_
        n_iter = 0
        converged = False

        for n_iter in range(1, self.max_iter + 1):
            # Assignment step: nearest centroid under the custom metric.
            distances = get_distance_matrix(
                X,
                self.cluster_centers_,
                method=self.distance_method,
                **self.distance_kwargs,
            )
            labels = np.argmin(distances, axis=1)

            # Update step, then compare against the previous centroids.
            new_centroids = self._update_centroids(X, labels)
            converged = bool(np.allclose(self.cluster_centers_, new_centroids, rtol=1e-4))
            self.cluster_centers_ = new_centroids
            if converged:
                break

        self.labels_ = labels
        self.n_iter_ = n_iter
        self.converged_ = converged
        return self

    def _fit_custom_implementation(self, X):
        """
        Fit via ``_kmeans_cluster_original`` when sklearn is not available.

        Forwards ``max_iter`` and ``random_state`` so the fallback honours the
        same configuration as the sklearn-backed path.

        Returns:
            self, with ``cluster_centers_``, ``labels_``, ``n_iter_`` and
            ``converged_`` populated from the helper's result dictionary.
        """
        result = _kmeans_cluster_original(
            # The helper uses its input as-is, so hand it a real array.
            np.asarray(X),
            self.n_clusters,
            distance_method=self.distance_method,
            max_iter=self.max_iter,
            random_state=self.random_state,
            **self.distance_kwargs,
        )
        self.cluster_centers_ = result["centroids"]
        self.labels_ = result["labels"]
        self.n_iter_ = result["iterations"]
        # Prefer the helper's own flag; fall back to the iteration-count
        # heuristic if the key is absent.
        self.converged_ = result.get("converged", self.n_iter_ < self.max_iter)
        return self
156+
58157def kmeans_cluster (
59158 data : pd .DataFrame | np .ndarray ,
60159 n_clusters : int ,
@@ -64,10 +163,55 @@ def kmeans_cluster(
64163 ** distance_kwargs ,
65164) -> dict :
66165 """
67- Pure K-means clustering implementation.
166+ K-means clustering with support for custom distance metrics.
167+
168+ This function provides a unified interface that uses sklearn when available
169+ and falls back to the original implementation otherwise.
170+ """
171+ # Convert DataFrame to numpy array if needed
172+ if isinstance (data , pd .DataFrame ):
173+ if "longitude" in data .columns and "latitude" in data .columns :
174+ X = data [["longitude" , "latitude" ]].values
175+ else :
176+ raise ValueError ("DataFrame must contain 'longitude' and 'latitude' columns" )
177+ else :
178+ X = np .asarray (data )
179+
180+ # Use sklearn-based implementation if available
181+ if HAS_SKLEARN and distance_method in ["euclidean" , "haversine" , "osrm" , "google" ]:
182+ kmeans = CustomKMeans (
183+ n_clusters = n_clusters ,
184+ distance_method = distance_method ,
185+ max_iter = max_iter ,
186+ random_state = random_state ,
187+ ** distance_kwargs
188+ )
189+ kmeans .fit (X )
190+
191+ return {
192+ "labels" : kmeans .labels_ ,
193+ "centroids" : kmeans .cluster_centers_ ,
194+ "iterations" : kmeans .n_iter_ ,
195+ "converged" : kmeans .n_iter_ < max_iter ,
196+ }
197+
198+ # Fall back to original implementation
199+ return _kmeans_cluster_original (X , n_clusters , distance_method , max_iter , random_state , ** distance_kwargs )
200+
201+
202+ def _kmeans_cluster_original (
203+ data : np .ndarray ,
204+ n_clusters : int ,
205+ distance_method : str = "euclidean" ,
206+ max_iter : int = 300 ,
207+ random_state : int | None = None ,
208+ ** distance_kwargs ,
209+ ) -> dict :
210+ """
211+ Original pure K-means clustering implementation (fallback).
68212
69213 Args:
70- data: Input data as DataFrame with longitude/latitude or numpy array [n, 2]
214+ data: Input data as numpy array [n, 2]
71215 n_clusters: Number of clusters
72216 distance_method: Distance calculation method
73217 max_iter: Maximum iterations
@@ -77,14 +221,7 @@ def kmeans_cluster(
77221 Returns:
78222 Dictionary with 'labels', 'centroids', 'iterations', 'converged'
79223 """
80- # Convert DataFrame to numpy array if needed
81- if isinstance (data , pd .DataFrame ):
82- if "longitude" in data .columns and "latitude" in data .columns :
83- X = data [["longitude" , "latitude" ]].values
84- else :
85- raise ValueError ("DataFrame must contain 'longitude' and 'latitude' columns" )
86- else :
87- X = np .asarray (data )
224+ X = data
88225
89226 # Initialize centroids
90227 centroids = initialize_centroids (X , n_clusters , random_state )
0 commit comments