|
"""
Design effect and cluster-robust standard error computations.

Design effects measure variance inflation due to clustering compared to
simple random sampling. These functions are useful for:
- Survey sampling with clustered designs
- Evaluating how itinerary assignment affects variance estimation
- Determining appropriate standard errors for clustered data
"""
| 10 | + |
import numpy as np
| 12 | + |
| 13 | + |
def compute_design_effect(
    outcomes: np.ndarray,
    cluster_ids: np.ndarray,
) -> float:
    """
    Compute the design effect from clustered data.

    Design effect = Var(cluster) / Var(SRS)

    Values > 1 mean clustering inflates variance compared to simple random
    sampling. This matters when units within clusters are correlated.

    Uses the Kish approximation ``deff = 1 + (m - 1) * rho`` where ``m`` is
    the average cluster size and ``rho`` is an ANOVA-style estimate of the
    intraclass correlation, clipped to [0, 1].

    Args:
        outcomes: Array of outcome values for each unit
        cluster_ids: Array of cluster assignments for each unit (same length
            as ``outcomes``)

    Returns:
        Design effect (ratio of cluster variance to SRS variance).
        Returns 1.0 for edge cases (single unit, single cluster,
        constant outcomes, all-singleton clusters).

    Example:
        >>> import numpy as np
        >>> outcomes = np.array([1.0, 1.1, 5.0, 5.1, 9.0, 9.1])
        >>> cluster_ids = np.array([0, 0, 1, 1, 2, 2])
        >>> deff = compute_design_effect(outcomes, cluster_ids)
        >>> deff > 1.0  # High within-cluster correlation
        True
    """
    n = len(outcomes)
    if n <= 1:
        return 1.0

    # Overall sample variance, computed once and reused for the ICC estimate.
    # (Previously np.var was evaluated twice and an otherwise-unused SRS
    # variance srs_var = total_var / n was derived solely for this zero
    # check; total_var == 0 is the same condition.)
    total_var = np.var(outcomes, ddof=1)
    if total_var == 0:
        # Constant outcomes: clustering cannot inflate a zero variance.
        return 1.0

    # return_counts gives cluster sizes directly, replacing the manual loop.
    unique_clusters, cluster_sizes = np.unique(cluster_ids, return_counts=True)
    n_clusters = len(unique_clusters)
    if n_clusters <= 1:
        return 1.0

    cluster_means = np.array(
        [outcomes[cluster_ids == cid].mean() for cid in unique_clusters]
    )

    grand_mean = outcomes.mean()
    # Size-weighted between-cluster mean square (ANOVA-style numerator).
    between_var = np.sum(cluster_sizes * (cluster_means - grand_mean) ** 2) / (n_clusters - 1)

    avg_cluster_size = n / n_clusters
    rho = 0.0
    # avg_cluster_size > 1 avoids division by zero when every cluster is a
    # singleton (no within-cluster correlation is estimable there).
    if avg_cluster_size > 1:
        rho = max(0, (between_var / total_var - 1 / avg_cluster_size) / (1 - 1 / avg_cluster_size))
        rho = min(rho, 1.0)

    # Kish design effect; the max() is a belt-and-braces clamp (rho >= 0
    # already guarantees deff >= 1).
    deff = 1 + (avg_cluster_size - 1) * rho

    return max(deff, 1.0)
| 79 | + |
| 80 | + |
def compute_cluster_robust_se(
    outcomes: np.ndarray,
    cluster_ids: np.ndarray,
) -> float:
    """
    Compute the cluster-robust standard error of the mean.

    Naive standard errors assume independent observations; when units within
    a cluster are correlated that assumption understates the true sampling
    variability. This estimator instead treats the cluster means as the
    effective observations: SE = sqrt(Var(cluster means) / n_clusters).

    Args:
        outcomes: Array of outcome values
        cluster_ids: Array of cluster assignments

    Returns:
        Cluster-robust standard error of the mean.
        Returns the naive SE for a single cluster, 0.0 for a single
        observation.

    Example:
        >>> import numpy as np
        >>> outcomes = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
        >>> cluster_ids = np.array([0, 0, 1, 1, 2, 2])
        >>> se = compute_cluster_robust_se(outcomes, cluster_ids)
        >>> se > 0
        True
    """
    n_obs = len(outcomes)
    if n_obs <= 1:
        return 0.0

    labels = np.unique(cluster_ids)
    n_clusters = len(labels)
    if n_clusters <= 1:
        # Only one cluster: nothing to be robust against, fall back to the
        # naive independent-sample SE.
        return np.std(outcomes, ddof=1) / np.sqrt(n_obs)

    # One mean per cluster; their spread drives the robust SE.
    means = np.array([outcomes[cluster_ids == label].mean() for label in labels])
    return np.sqrt(np.var(means, ddof=1) / n_clusters)
0 commit comments