Skip to content

Commit 7a1f1ac

Browse files
committed
lint
1 parent 0019380 commit 7a1f1ac

10 files changed

Lines changed: 142 additions & 16 deletions

File tree

allocator/core/itinerary.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,6 @@
1010
- kmeans_tsp: K-means clustering with TSP optimization (count-driven)
1111
"""
1212

13-
from __future__ import annotations
14-
1513
import numpy as np
1614
from ortools.constraint_solver import pywrapcp, routing_enums_pb2
1715

allocator/core/routing.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,6 @@
22
Pure routing/TSP algorithm implementations.
33
"""
44

5-
from __future__ import annotations
6-
75
import googlemaps
86
import networkx as nx
97
import numpy as np

allocator/distances/euclidean.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,6 @@
22
Euclidean distance calculations for geographic coordinates.
33
"""
44

5-
from __future__ import annotations
6-
75
import numpy as np
86
import utm
97

allocator/distances/external_apis.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,6 @@
22
External API integrations for distance calculations (OSRM, Google Maps).
33
"""
44

5-
from __future__ import annotations
6-
75
import logging
86
import math
97
import time

allocator/distances/factory.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,6 @@
22
Factory module for distance calculations - main entry point.
33
"""
44

5-
from __future__ import annotations
6-
75
import numpy as np
86

97
from .euclidean import euclidean_distance_matrix

allocator/distances/haversine.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,6 @@
22
Haversine distance calculations for geographic coordinates.
33
"""
44

5-
from __future__ import annotations
6-
75
import numpy as np
86
from haversine import haversine
97

allocator/io/data_handler.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,6 @@
22
Modern data handling utilities with standardized formats.
33
"""
44

5-
from __future__ import annotations
6-
75
import json
86
import logging
97
from pathlib import Path

allocator/stats/__init__.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
"""
Statistical utilities for clustered/itinerary data analysis.

This module provides functions for:
- Design effect computation (variance inflation due to clustering)
- Cluster-robust standard error estimation
"""

# Re-export the package's public API from the implementation module so
# callers can write `from allocator.stats import compute_design_effect`.
from .design_effect import compute_cluster_robust_se, compute_design_effect

# Explicit public surface; anything else in this package is internal.
__all__ = [
    "compute_cluster_robust_se",
    "compute_design_effect",
]

allocator/stats/design_effect.py

Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,128 @@
1+
"""
2+
Design effect and cluster-robust standard error computations.
3+
4+
Design effects measure variance inflation due to clustering compared to
5+
simple random sampling. These functions are useful for:
6+
- Survey sampling with clustered designs
7+
- Evaluating how itinerary assignment affects variance estimation
8+
- Determining appropriate standard errors for clustered data
9+
"""
10+
11+
import numpy as np
12+
13+
14+
def compute_design_effect(
    outcomes: np.ndarray,
    cluster_ids: np.ndarray,
) -> float:
    """
    Compute design effect from clustered data.

    Design effect = Var(cluster) / Var(SRS)

    Values > 1 mean clustering inflates variance compared to simple random sampling.
    This is important when units within clusters are correlated.

    The intra-cluster correlation rho is backed out of the identity
    Var(cluster mean) = sigma^2 * (1 + (m - 1) * rho) / m, i.e.
    rho = (Var(means) / sigma^2 - 1/m) / (1 - 1/m), then plugged into
    Kish's formula Deff = 1 + (m - 1) * rho.

    Args:
        outcomes: Array of outcome values for each unit
        cluster_ids: Array of cluster assignments for each unit

    Returns:
        Design effect (ratio of cluster variance to SRS variance).
        Returns 1.0 for edge cases (single cluster, constant outcomes).

    Example:
        >>> import numpy as np
        >>> outcomes = np.array([1.0, 1.1, 5.0, 5.1, 9.0, 9.1])
        >>> cluster_ids = np.array([0, 0, 1, 1, 2, 2])
        >>> deff = compute_design_effect(outcomes, cluster_ids)
        >>> deff > 1.0  # High within-cluster correlation
        True
    """
    n = len(outcomes)
    if n <= 1:
        return 1.0

    # Constant outcomes: no variance to inflate. (var == 0 iff var/n == 0,
    # so this also covers the SRS-variance check.)
    total_var = np.var(outcomes, ddof=1)
    if total_var == 0:
        return 1.0

    unique_clusters = np.unique(cluster_ids)
    n_clusters = len(unique_clusters)
    if n_clusters <= 1:
        return 1.0

    cluster_means = np.array(
        [outcomes[cluster_ids == cid].mean() for cid in unique_clusters]
    )

    avg_cluster_size = n / n_clusters
    rho = 0.0
    if avg_cluster_size > 1:
        # BUG FIX: the rho formula below expects the *plain* variance of the
        # cluster means, not the size-weighted ANOVA mean-square
        # sum(n_j * (mean_j - grand)^2) / (k - 1), which is ~m times larger
        # and inflated rho (e.g. uncorrelated data gave Deff > 1).
        between_var = np.var(cluster_means, ddof=1)
        rho = (between_var / total_var - 1 / avg_cluster_size) / (
            1 - 1 / avg_cluster_size
        )
        # Clamp the sample estimate into the feasible [0, 1] range.
        rho = min(max(rho, 0.0), 1.0)

    deff = 1 + (avg_cluster_size - 1) * rho

    # Guard against tiny negative drift; Deff is at least 1 by construction.
    return max(float(deff), 1.0)
79+
80+
81+
def compute_cluster_robust_se(
    outcomes: np.ndarray,
    cluster_ids: np.ndarray,
) -> float:
    """
    Compute cluster-robust standard error of the mean.

    When observations within clusters are correlated, the naive standard error
    (assuming independence) underestimates the true sampling variability.
    This function computes a cluster-robust SE that accounts for within-cluster
    correlation.

    Args:
        outcomes: Array of outcome values
        cluster_ids: Array of cluster assignments

    Returns:
        Cluster-robust standard error of the mean.
        Returns naive SE for single cluster, 0.0 for single observation.

    Example:
        >>> import numpy as np
        >>> outcomes = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
        >>> cluster_ids = np.array([0, 0, 1, 1, 2, 2])
        >>> se = compute_cluster_robust_se(outcomes, cluster_ids)
        >>> se > 0
        True
    """
    n_obs = len(outcomes)
    # A single observation has no sampling variability to estimate.
    if n_obs <= 1:
        return 0.0

    labels = np.unique(cluster_ids)
    n_groups = len(labels)

    # With only one cluster there is no between-cluster variation to use;
    # fall back to the naive (iid) standard error of the mean.
    if n_groups <= 1:
        return np.std(outcomes, ddof=1) / np.sqrt(n_obs)

    # Treat each cluster mean as one independent draw: the SE of the overall
    # mean is the SE of the mean of the cluster means.
    group_means = np.array(
        [outcomes[cluster_ids == label].mean() for label in labels]
    )
    return np.sqrt(np.var(group_means, ddof=1) / n_groups)

allocator/utils.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,3 @@
1-
from __future__ import annotations
2-
31
import pandas as pd
42

53

0 commit comments

Comments
 (0)