|
"""
Design effect and cluster-robust standard error computations.

Design effects measure variance inflation due to clustering compared to
simple random sampling. These functions are useful for:
- Survey sampling with clustered designs
- Evaluating how itinerary assignment affects variance estimation
- Determining appropriate standard errors for clustered data
"""
| 10 | + |
import numpy as np
| 12 | + |
| 13 | + |
def compute_design_effect(
    outcomes: np.ndarray,
    cluster_ids: np.ndarray,
) -> float:
    """
    Compute the design effect from clustered data.

    Design effect = Var(cluster) / Var(SRS)

    Values > 1 mean clustering inflates variance compared to simple random
    sampling. This matters when units within clusters are correlated.

    Uses the Kish approximation ``deff = 1 + (m - 1) * rho`` where ``m`` is
    the average cluster size and ``rho`` is an ANOVA-style estimate of the
    intraclass correlation, clipped to [0, 1].

    Args:
        outcomes: Array of outcome values for each unit
        cluster_ids: Array of cluster assignments for each unit (same length
            as ``outcomes``)

    Returns:
        Design effect (ratio of cluster variance to SRS variance).
        Returns 1.0 for edge cases (single unit, single cluster,
        constant outcomes, all-singleton clusters).

    Example:
        >>> import numpy as np
        >>> outcomes = np.array([1.0, 1.1, 5.0, 5.1, 9.0, 9.1])
        >>> cluster_ids = np.array([0, 0, 1, 1, 2, 2])
        >>> deff = compute_design_effect(outcomes, cluster_ids)
        >>> deff > 1.0  # High within-cluster correlation
        True
    """
    n = len(outcomes)
    if n <= 1:
        return 1.0

    # Overall sample variance, computed once and reused for the ICC estimate.
    # (Previously np.var was evaluated twice and an otherwise-unused SRS
    # variance srs_var = total_var / n was derived solely for this zero
    # check; total_var == 0 is the same condition.)
    total_var = np.var(outcomes, ddof=1)
    if total_var == 0:
        # Constant outcomes: clustering cannot inflate a zero variance.
        return 1.0

    # return_counts gives cluster sizes directly, replacing the manual loop.
    unique_clusters, cluster_sizes = np.unique(cluster_ids, return_counts=True)
    n_clusters = len(unique_clusters)
    if n_clusters <= 1:
        return 1.0

    cluster_means = np.array(
        [outcomes[cluster_ids == cid].mean() for cid in unique_clusters]
    )

    grand_mean = outcomes.mean()
    # Size-weighted between-cluster mean square (ANOVA-style numerator).
    between_var = np.sum(cluster_sizes * (cluster_means - grand_mean) ** 2) / (n_clusters - 1)

    avg_cluster_size = n / n_clusters
    rho = 0.0
    # avg_cluster_size > 1 avoids division by zero when every cluster is a
    # singleton (no within-cluster correlation is estimable there).
    if avg_cluster_size > 1:
        rho = max(0, (between_var / total_var - 1 / avg_cluster_size) / (1 - 1 / avg_cluster_size))
        rho = min(rho, 1.0)

    # Kish design effect; the max() is a belt-and-braces clamp (rho >= 0
    # already guarantees deff >= 1).
    deff = 1 + (avg_cluster_size - 1) * rho

    return max(deff, 1.0)
| 79 | + |
| 80 | + |
def compute_cluster_robust_se(
    outcomes: np.ndarray,
    cluster_ids: np.ndarray,
) -> float:
    """
    Compute the cluster-robust standard error of the mean.

    Naive standard errors assume independent observations; when units within
    a cluster are correlated that assumption understates the true sampling
    variability. This estimator instead treats the cluster means as the
    effective observations: SE = sqrt(Var(cluster means) / n_clusters).

    Args:
        outcomes: Array of outcome values
        cluster_ids: Array of cluster assignments

    Returns:
        Cluster-robust standard error of the mean.
        Returns the naive SE for a single cluster, 0.0 for a single
        observation.

    Example:
        >>> import numpy as np
        >>> outcomes = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
        >>> cluster_ids = np.array([0, 0, 1, 1, 2, 2])
        >>> se = compute_cluster_robust_se(outcomes, cluster_ids)
        >>> se > 0
        True
    """
    n_obs = len(outcomes)
    if n_obs <= 1:
        return 0.0

    labels = np.unique(cluster_ids)
    n_clusters = len(labels)
    if n_clusters <= 1:
        # Only one cluster: nothing to be robust against, fall back to the
        # naive independent-sample SE.
        return np.std(outcomes, ddof=1) / np.sqrt(n_obs)

    # One mean per cluster; their spread drives the robust SE.
    means = np.array([outcomes[cluster_ids == label].mean() for label in labels])
    return np.sqrt(np.var(means, ddof=1) / n_clusters)
0 commit comments