Skip to content

Commit 9dc49fe

Browse files
committed
deptry approved, move to sciki-learn for k-means
1 parent bbd97cb commit 9dc49fe

14 files changed

Lines changed: 1308 additions & 456 deletions

CHANGELOG.md

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,86 @@
22

33
All notable changes to the allocator project are documented in this file.
44

5+
## [1.1.0] - 2024-12-08 🚀
6+
7+
### ✨ New Features
8+
9+
**Interactive Geographic Visualizations:**
10+
- Added `plot_clusters_interactive()` for interactive K-means clustering maps with folium
11+
- Added `plot_route_interactive()` for interactive TSP route visualization with real maps
12+
- Enhanced geographic context with OpenStreetMap tiles and zoom/pan capabilities
13+
- Support for polyline-encoded routes from OSRM and Google Maps APIs
14+
- Professional HTML output suitable for presentations and web sharing
15+
16+
**Enhanced Machine Learning Integration:**
17+
- Introduced `CustomKMeans` class extending sklearn's KMeans with custom distance metrics
18+
- Seamless fallback to pure Python implementation when sklearn unavailable
19+
- Optimized performance while maintaining compatibility with haversine, OSRM, and Google Maps distances
20+
- Improved convergence detection and reproducibility with random_state support
21+
22+
**Dependency Management Improvements:**
23+
- Reorganized optional dependencies into logical groups: `algorithms`, `geo`, `dev`, `test`, `docs`
24+
- Configured deptry for proper dependency validation with PEP 621 support
25+
- Enhanced optional dependency handling with clear error messages
26+
- Streamlined installation with `pip install 'allocator[geo]'` for mapping features
27+
28+
### 🔧 Code Quality & Performance
29+
30+
**Linting & Standards:**
31+
- Fixed all ruff linting errors across entire codebase (58+ issues resolved)
32+
- Enhanced code style consistency with proper whitespace handling
33+
- Added `strict=` parameters to `zip()` calls for safety
34+
- Improved variable naming and removed unused assignments
35+
36+
**Testing & Reliability:**
37+
- Maintained 100% test coverage with 72 passing tests
38+
- Enhanced K-means reproducibility testing for sklearn integration
39+
- Improved test robustness for label permutation handling
40+
- Validated compatibility across Python 3.11, 3.12, and 3.13
41+
42+
**Documentation & Examples:**
43+
- Added comprehensive interactive visualization demo script
44+
- Enhanced example scripts with proper error handling
45+
- Improved docstring quality and type annotations
46+
- Created professional HTML output examples for demos
47+
48+
### 🛠️ Technical Improvements
49+
50+
**Algorithm Optimizations:**
51+
- Hybrid sklearn/custom K-means approach for best of both worlds
52+
- Maintained geographic accuracy while leveraging sklearn optimizations
53+
- Enhanced distance matrix calculations with vectorized operations
54+
- Improved memory usage for large geographic datasets
55+
56+
**Infrastructure:**
57+
- Enhanced CI/CD pipeline with automated quality checks
58+
- Improved build process with uv and modern packaging
59+
- Better dependency conflict resolution
60+
- Streamlined release process with comprehensive testing
61+
62+
### 📦 Installation & Compatibility
63+
64+
**New Optional Groups:**
65+
```bash
66+
pip install 'allocator[algorithms]' # scikit-learn for ML algorithms
67+
pip install 'allocator[geo]' # folium + polyline for interactive maps
68+
pip install 'allocator[all]' # all optional features
69+
```
70+
71+
**Maintained Compatibility:**
72+
- All existing APIs remain unchanged
73+
- No breaking changes for current users
74+
- Backward compatible with v1.0.0 usage patterns
75+
76+
### 🐛 Bug Fixes
77+
78+
- Resolved dependency conflicts in development environment
79+
- Fixed inconsistent K-means results between implementations
80+
- Improved error handling for edge cases in clustering
81+
- Enhanced stability for large geographic datasets
82+
83+
---
84+
585
## [1.0.0] - 2024-10-09 🎉
686

787
### 🚀 Major Release - Complete Modernization

allocator/core/__init__.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,15 @@
11
"""Core algorithms for clustering and optimization."""
2+
3+
from .algorithms import (
4+
CustomKMeans,
5+
calculate_cluster_statistics,
6+
kmeans_cluster,
7+
sort_by_distance_assignment,
8+
)
9+
10+
__all__ = [
11+
"CustomKMeans",
12+
"calculate_cluster_statistics",
13+
"kmeans_cluster",
14+
"sort_by_distance_assignment",
15+
]

allocator/core/algorithms.py

Lines changed: 147 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,13 @@
1010

1111
from ..distances import get_distance_matrix
1212

13+
try:
14+
from sklearn.cluster import KMeans
15+
from sklearn.utils.validation import check_array
16+
HAS_SKLEARN = True
17+
except ImportError:
18+
HAS_SKLEARN = False
19+
1320

1421
def initialize_centroids(points: np.ndarray, k: int, random_state: int | None = None) -> np.ndarray:
1522
"""
@@ -55,6 +62,98 @@ def move_centroids(points: np.ndarray, closest: np.ndarray, centroids: np.ndarra
5562
return np.array(new_centroids)
5663

5764

65+
class CustomKMeans(KMeans if HAS_SKLEARN else object):
66+
"""
67+
Custom K-means implementation that supports geographic distance metrics.
68+
69+
This class extends sklearn's KMeans to work with custom distance functions
70+
including haversine, OSRM, and Google Maps API distances.
71+
"""
72+
73+
def __init__(self, n_clusters=8, distance_method="euclidean", max_iter=300, random_state=None, **distance_kwargs):
74+
if HAS_SKLEARN:
75+
# Initialize sklearn KMeans with all parameters
76+
super().__init__(n_clusters=n_clusters, max_iter=max_iter, random_state=random_state)
77+
self.distance_method = distance_method
78+
self.distance_kwargs = distance_kwargs
79+
self.n_clusters = n_clusters
80+
81+
def _transform(self, X):
82+
"""Override sklearn's distance calculation to use custom metrics."""
83+
if not HAS_SKLEARN:
84+
raise ImportError("sklearn is required for CustomKMeans. Install with: pip install 'allocator[algorithms]'")
85+
86+
# Use our custom distance factory instead of sklearn's euclidean
87+
distances = get_distance_matrix(X, self.cluster_centers_,
88+
method=self.distance_method,
89+
**self.distance_kwargs)
90+
return distances
91+
92+
def _update_centroids(self, X, labels):
93+
"""Update centroids using geographic mean for custom distances."""
94+
new_centroids = []
95+
for k in range(self.n_clusters):
96+
mask = labels == k
97+
if np.any(mask):
98+
# For geographic data, use simple mean of coordinates
99+
# This works well for most geographic clustering tasks
100+
cluster_points = X[mask]
101+
centroid = np.mean(cluster_points, axis=0)
102+
new_centroids.append(centroid)
103+
else:
104+
# Keep old centroid if cluster is empty
105+
new_centroids.append(self.cluster_centers_[k])
106+
return np.array(new_centroids)
107+
108+
def fit(self, X, y=None, sample_weight=None):
109+
"""Fit the k-means clustering with custom distance metric."""
110+
if not HAS_SKLEARN:
111+
# Fallback to original implementation if sklearn not available
112+
return self._fit_custom_implementation(X)
113+
114+
X = check_array(X, accept_sparse='csr', dtype=[np.float64, np.float32])
115+
116+
# Initialize using sklearn's initialization logic
117+
super().fit(X)
118+
119+
# Now run our custom iterations
120+
for iteration in range(self.max_iter):
121+
# Calculate distances using custom metric
122+
distances = get_distance_matrix(X, self.cluster_centers_,
123+
method=self.distance_method,
124+
**self.distance_kwargs)
125+
126+
# Assign points to nearest centroids
127+
labels = np.argmin(distances, axis=1)
128+
129+
# Update centroids
130+
new_centroids = self._update_centroids(X, labels)
131+
132+
# Check convergence
133+
if np.allclose(self.cluster_centers_, new_centroids, rtol=1e-4):
134+
self.cluster_centers_ = new_centroids
135+
self.labels_ = labels
136+
self.n_iter_ = iteration + 1
137+
break
138+
139+
self.cluster_centers_ = new_centroids
140+
else:
141+
self.labels_ = labels
142+
self.n_iter_ = self.max_iter
143+
144+
return self
145+
146+
def _fit_custom_implementation(self, X):
147+
"""Fallback to original implementation when sklearn is not available."""
148+
result = _kmeans_cluster_original(X, self.n_clusters,
149+
distance_method=self.distance_method,
150+
**self.distance_kwargs)
151+
self.cluster_centers_ = result["centroids"]
152+
self.labels_ = result["labels"]
153+
self.n_iter_ = result["iterations"]
154+
return self
155+
156+
58157
def kmeans_cluster(
59158
data: pd.DataFrame | np.ndarray,
60159
n_clusters: int,
@@ -64,10 +163,55 @@ def kmeans_cluster(
64163
**distance_kwargs,
65164
) -> dict:
66165
"""
67-
Pure K-means clustering implementation.
166+
K-means clustering with support for custom distance metrics.
167+
168+
This function provides a unified interface that uses sklearn when available
169+
and falls back to the original implementation otherwise.
170+
"""
171+
# Convert DataFrame to numpy array if needed
172+
if isinstance(data, pd.DataFrame):
173+
if "longitude" in data.columns and "latitude" in data.columns:
174+
X = data[["longitude", "latitude"]].values
175+
else:
176+
raise ValueError("DataFrame must contain 'longitude' and 'latitude' columns")
177+
else:
178+
X = np.asarray(data)
179+
180+
# Use sklearn-based implementation if available
181+
if HAS_SKLEARN and distance_method in ["euclidean", "haversine", "osrm", "google"]:
182+
kmeans = CustomKMeans(
183+
n_clusters=n_clusters,
184+
distance_method=distance_method,
185+
max_iter=max_iter,
186+
random_state=random_state,
187+
**distance_kwargs
188+
)
189+
kmeans.fit(X)
190+
191+
return {
192+
"labels": kmeans.labels_,
193+
"centroids": kmeans.cluster_centers_,
194+
"iterations": kmeans.n_iter_,
195+
"converged": kmeans.n_iter_ < max_iter,
196+
}
197+
198+
# Fall back to original implementation
199+
return _kmeans_cluster_original(X, n_clusters, distance_method, max_iter, random_state, **distance_kwargs)
200+
201+
202+
def _kmeans_cluster_original(
203+
data: np.ndarray,
204+
n_clusters: int,
205+
distance_method: str = "euclidean",
206+
max_iter: int = 300,
207+
random_state: int | None = None,
208+
**distance_kwargs,
209+
) -> dict:
210+
"""
211+
Original pure K-means clustering implementation (fallback).
68212
69213
Args:
70-
data: Input data as DataFrame with longitude/latitude or numpy array [n, 2]
214+
data: Input data as numpy array [n, 2]
71215
n_clusters: Number of clusters
72216
distance_method: Distance calculation method
73217
max_iter: Maximum iterations
@@ -77,14 +221,7 @@ def kmeans_cluster(
77221
Returns:
78222
Dictionary with 'labels', 'centroids', 'iterations', 'converged'
79223
"""
80-
# Convert DataFrame to numpy array if needed
81-
if isinstance(data, pd.DataFrame):
82-
if "longitude" in data.columns and "latitude" in data.columns:
83-
X = data[["longitude", "latitude"]].values
84-
else:
85-
raise ValueError("DataFrame must contain 'longitude' and 'latitude' columns")
86-
else:
87-
X = np.asarray(data)
224+
X = data
88225

89226
# Initialize centroids
90227
centroids = initialize_centroids(X, n_clusters, random_state)

allocator/viz/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,15 +3,19 @@
33
from .plotting import (
44
plot_assignments,
55
plot_clusters,
6+
plot_clusters_interactive,
67
plot_clusters_on_axis,
78
plot_comparison,
89
plot_route,
10+
plot_route_interactive,
911
)
1012

1113
__all__ = [
1214
"plot_assignments",
1315
"plot_clusters",
16+
"plot_clusters_interactive",
1417
"plot_clusters_on_axis",
1518
"plot_comparison",
1619
"plot_route",
20+
"plot_route_interactive",
1721
]

0 commit comments

Comments
 (0)