Skip to content

Commit 71cfc8d

Browse files
authored
Enhance LinearSplineLogisticRegression and LossGradHess to support sample weights. Update fitting methods to incorporate sample weights for loss calculation, gradient, and Hessian. Introduce weighted quantile computation for knot fitting. Modify tests to validate new sample weight functionality in subsampling methods. (#14)
1 parent 4391e91 commit 71cfc8d

5 files changed

Lines changed: 411 additions & 59 deletions

File tree

src/splinator/estimators.py

Lines changed: 78 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,8 @@ class MinimizationMethod(Enum):
2424

2525

2626
class LossGradHess:
27-
def __init__(self, X, y, alpha, intercept):
28-
# type: (np.ndarray, np.ndarray, float, bool) -> None
27+
def __init__(self, X, y, alpha, intercept, sample_weight=None):
28+
# type: (np.ndarray, np.ndarray, float, bool, Optional[np.ndarray]) -> None
2929
"""
3030
In the generation of design matrix, if intercept option is True, the first column of design matrix is of 1's,
3131
which means that the first coefficient corresponds to the intercept term. This setup is a little different
@@ -37,13 +37,17 @@ def __init__(self, X, y, alpha, intercept):
3737
self.X = X
3838
self.alpha = alpha
3939
self.intercept = intercept
40+
if sample_weight is None:
41+
self.sample_weight = np.ones(len(y))
42+
else:
43+
self.sample_weight = np.asarray(sample_weight)
44+
self.weight_sum = np.sum(self.sample_weight)
4045

4146
def loss(self, coefs):
4247
# type: (np.ndarray) -> np.ndarray
4348
yz = self.y * np.dot(self.X, coefs)
4449
# P(label= 1 or -1 |X) = 1 / (1+exp(-yz))
45-
# Log Likelihood = Sum over log ( 1/(1 + exp(-yz)) )
46-
loss_val = -np.sum(log_expit(yz))
50+
loss_val = -np.sum(self.sample_weight * log_expit(yz))
4751
if self.intercept:
4852
loss_val += 0.5 * self.alpha * np.dot(coefs[1:], coefs[1:])
4953
else:
@@ -58,8 +62,7 @@ def grad(self, coefs):
5862

5963
# if y = 1, we want z to be close to 1; if y = -1, we want z to be close to 0.
6064
z0 = (z - 1) * self.y
61-
62-
grad = np.dot(self.X.T, z0)
65+
grad = np.dot(self.X.T, self.sample_weight * z0)
6366

6467
if self.intercept:
6568
grad[1:] += self.alpha * coefs[1:]
@@ -74,19 +77,14 @@ def hess(self, coefs):
7477
Compute the Hessian of the logistic loss.
7578
7679
The Hessian of logistic regression is: H = X.T @ diag(w) @ X + alpha * I
77-
where w = p * (1 - p) and p = sigmoid(X @ coefs).
80+
where w = sample_weight * p * (1 - p) and p = sigmoid(X @ coefs).
7881
7982
This is used by trust-constr for faster convergence (Newton-like steps).
8083
"""
8184
z = np.dot(self.X, coefs)
8285
p = expit(z)
83-
# Weights for the Hessian: p * (1 - p)
84-
# This is always positive, making the Hessian positive semi-definite
85-
weights = p * (1 - p)
86-
87-
# H = X.T @ diag(weights) @ X
88-
# Efficient computation: (X.T * weights) @ X
89-
H = np.dot(self.X.T * weights, self.X)
86+
hess_weights = self.sample_weight * p * (1 - p)
87+
H = np.dot(self.X.T * hess_weights, self.X)
9088

9189
# Add regularization term
9290
if self.intercept:
@@ -259,8 +257,8 @@ def __init__(
259257
self.random_state = random_state
260258
self.verbose = verbose
261259

262-
def _fit(self, X, y, initial_guess=None):
263-
# type: (pd.DataFrame, pd.Series, Optional[np.ndarray], bool) -> None
260+
def _fit(self, X, y, sample_weight=None, initial_guess=None):
261+
# type: (pd.DataFrame, pd.Series, Optional[np.ndarray], Optional[np.ndarray]) -> None
264262
constraint = [] # type: Union[Dict[str, Any], List, LinearConstraint]
265263
if self.monotonicity != Monotonicity.none.value:
266264
# This function returns G and h such that G * beta <= 0 is the constraint we want:
@@ -298,7 +296,7 @@ def _fit(self, X, y, initial_guess=None):
298296
else:
299297
x0 = initial_guess
300298

301-
lgh = LossGradHess(design_X, y, 1 / self.C, self.intercept)
299+
lgh = LossGradHess(design_X, y, 1 / self.C, self.intercept, sample_weight)
302300

303301
# Determine whether to use Hessian
304302
# - 'auto': use Hessian only for trust-constr with no monotonicity constraints (3-4x faster)
@@ -344,8 +342,8 @@ def get_additional_columns(self, X):
344342
additional_columns = np.delete(X, self.input_score_column_index, axis=1)
345343
return additional_columns
346344

347-
def _stratified_subsample(self, X, y, n_samples, n_strata=10):
348-
# type: (np.ndarray, np.ndarray, int, int) -> Tuple[np.ndarray, np.ndarray, np.ndarray]
345+
def _stratified_subsample(self, X, y, n_samples, sample_weight=None, n_strata=10):
346+
# type: (np.ndarray, np.ndarray, int, Optional[np.ndarray], int) -> Tuple[np.ndarray, np.ndarray, Optional[np.ndarray], np.ndarray]
349347
"""
350348
Create a stratified subsample based on quantiles of the input scores.
351349
@@ -358,13 +356,15 @@ def _stratified_subsample(self, X, y, n_samples, n_strata=10):
358356
y : array-like of shape (n_samples,)
359357
n_samples : int
360358
Target number of samples in the subsample
359+
sample_weight : array-like of shape (n_samples,), optional
361360
n_strata : int
362361
Number of strata (quantile bins) to use
363362
364363
Returns
365364
-------
366365
X_sub : array-like
367-
y_sub : array-like
366+
y_sub : array-like
367+
weight_sub : array-like or None
368368
indices : array-like
369369
Indices of selected samples
370370
"""
@@ -412,11 +412,12 @@ def _stratified_subsample(self, X, y, n_samples, n_strata=10):
412412
else:
413413
X_sub = X[selected_indices, :]
414414
y_sub = y[selected_indices]
415+
weight_sub = sample_weight[selected_indices] if sample_weight is not None else None
415416

416-
return X_sub, y_sub, selected_indices
417+
return X_sub, y_sub, weight_sub, selected_indices
417418

418-
def _random_subsample(self, X, y, n_samples):
419-
# type: (np.ndarray, np.ndarray, int) -> Tuple[np.ndarray, np.ndarray, np.ndarray]
419+
def _random_subsample(self, X, y, n_samples, sample_weight=None):
420+
# type: (np.ndarray, np.ndarray, int, Optional[np.ndarray]) -> Tuple[np.ndarray, np.ndarray, Optional[np.ndarray], np.ndarray]
420421
"""Simple random subsampling (original behavior)."""
421422
indices = self.random_state_.choice(
422423
np.arange(len(X)), n_samples, replace=False
@@ -425,7 +426,8 @@ def _random_subsample(self, X, y, n_samples):
425426
X_sub, y_sub = X.iloc[indices], y[indices]
426427
else:
427428
X_sub, y_sub = X[indices, :], y[indices]
428-
return X_sub, y_sub, indices
429+
weight_sub = sample_weight[indices] if sample_weight is not None else None
430+
return X_sub, y_sub, weight_sub, indices
429431

430432
def _check_convergence(self, coefs_old, coefs_new):
431433
# type: (np.ndarray, np.ndarray) -> bool
@@ -443,11 +445,28 @@ def _check_convergence(self, coefs_old, coefs_new):
443445

444446
return change < self.early_stopping_tol
445447

446-
def fit(self, X, y):
448+
def fit(self, X, y, sample_weight=None):
447449
# type: (pd.DataFrame, Union[np.ndarray, pd.Series], Optional[np.ndarray]) -> None
448450
"""
449451
Fit the linear spline logistic regression model.
450452
453+
Parameters
454+
----------
455+
X : array-like of shape (n_samples, n_features)
456+
Training data.
457+
y : array-like of shape (n_samples,)
458+
Target values (binary: 0 or 1).
459+
sample_weight : array-like of shape (n_samples,), optional
460+
Individual weights for each sample. If not provided, all samples
461+
are given equal weight.
462+
463+
Returns
464+
-------
465+
self : object
466+
Fitted estimator.
467+
468+
Notes
469+
-----
451470
Supports three fitting modes:
452471
1. Direct fitting (default): Fit on full data
453472
2. Two-stage fitting (legacy): Uses `two_stage_fitting_initial_size`
@@ -478,10 +497,21 @@ def fit(self, X, y):
478497
)
479498
y = y[:, 0]
480499

500+
# Validate sample_weight
501+
if sample_weight is not None:
502+
sample_weight = np.asarray(sample_weight)
503+
if sample_weight.shape[0] != X.shape[0]:
504+
raise ValueError(
505+
f"sample_weight has {sample_weight.shape[0]} samples, "
506+
f"but X has {X.shape[0]} samples"
507+
)
508+
if np.any(sample_weight < 0):
509+
raise ValueError("sample_weight must be non-negative")
510+
481511
if self.n_knots and self.knots is None:
482512
# only n_knots given so we create knots
483513
self.n_knots_ = min([self.n_knots, X.shape[0]])
484-
self.knots_ = _fit_knots(self.get_input_scores(X), self.n_knots_)
514+
self.knots_ = _fit_knots(self.get_input_scores(X), self.n_knots_, sample_weight)
485515
elif self.knots is not None and self.n_knots is None:
486516
# knots are given so we just take them
487517
self.knots_ = np.array(self.knots)
@@ -496,13 +526,13 @@ def fit(self, X, y):
496526
# Determine fitting mode
497527
if self.progressive_fitting_fractions is not None:
498528
# Mode 3: Progressive fitting (recommended)
499-
self._progressive_fit(X, y)
529+
self._progressive_fit(X, y, sample_weight)
500530
elif self.two_stage_fitting_initial_size is not None:
501531
# Mode 2: Legacy two-stage fitting
502-
self._two_stage_fit(X, y)
532+
self._two_stage_fit(X, y, sample_weight)
503533
else:
504534
# Mode 1: Direct fitting on full data
505-
self._fit(X, y, initial_guess=None)
535+
self._fit(X, y, sample_weight=sample_weight, initial_guess=None)
506536
self.fitting_history_.append({
507537
'stage': 0,
508538
'n_samples': n_samples,
@@ -513,8 +543,8 @@ def fit(self, X, y):
513543

514544
return self
515545

516-
def _two_stage_fit(self, X, y):
517-
# type: (np.ndarray, np.ndarray) -> None
546+
def _two_stage_fit(self, X, y, sample_weight=None):
547+
# type: (np.ndarray, np.ndarray, Optional[np.ndarray]) -> None
518548
"""Legacy two-stage fitting for backward compatibility."""
519549
n_samples = X.shape[0]
520550

@@ -532,15 +562,15 @@ def _two_stage_fit(self, X, y):
532562

533563
# Stage 1: Fit on subsample
534564
if self.stratified_sampling:
535-
X_sub, y_sub, _ = self._stratified_subsample(
536-
X, y, self.two_stage_fitting_initial_size
565+
X_sub, y_sub, weight_sub, _ = self._stratified_subsample(
566+
X, y, self.two_stage_fitting_initial_size, sample_weight
537567
)
538568
else:
539-
X_sub, y_sub, _ = self._random_subsample(
540-
X, y, self.two_stage_fitting_initial_size
569+
X_sub, y_sub, weight_sub, _ = self._random_subsample(
570+
X, y, self.two_stage_fitting_initial_size, sample_weight
541571
)
542572

543-
self._fit(X_sub, y_sub, initial_guess=None)
573+
self._fit(X_sub, y_sub, sample_weight=weight_sub, initial_guess=None)
544574
self.fitting_history_.append({
545575
'stage': 0,
546576
'n_samples': self.two_stage_fitting_initial_size,
@@ -555,7 +585,7 @@ def _two_stage_fit(self, X, y):
555585

556586
# Stage 2: Fit on full data with warm start
557587
coefs_stage1 = self.coefficients_.copy()
558-
self._fit(X, y, initial_guess=coefs_stage1)
588+
self._fit(X, y, sample_weight=sample_weight, initial_guess=coefs_stage1)
559589
self.fitting_history_.append({
560590
'stage': 1,
561591
'n_samples': n_samples,
@@ -567,8 +597,8 @@ def _two_stage_fit(self, X, y):
567597
if self.verbose:
568598
print(f"Stage 2: {n_samples} samples, {self.result_.nit} iterations")
569599

570-
def _progressive_fit(self, X, y):
571-
# type: (np.ndarray, np.ndarray) -> None
600+
def _progressive_fit(self, X, y, sample_weight=None):
601+
# type: (np.ndarray, np.ndarray, Optional[np.ndarray]) -> None
572602
"""
573603
Progressive fitting with gradual sample increase.
574604
@@ -590,12 +620,12 @@ def _progressive_fit(self, X, y):
590620

591621
for stage, frac in enumerate(fractions):
592622
n_stage_samples = int(n_samples * frac)
593-
n_stage_samples = max(n_stage_samples, self.knots_.shape[0] + 10) # Ensure enough samples
623+
n_stage_samples = max(n_stage_samples, self.knots_.shape[0] + 10)
594624
n_stage_samples = min(n_stage_samples, n_samples)
595625

596626
if frac >= 1.0:
597627
# Use full data for final stage
598-
X_stage, y_stage = X, y
628+
X_stage, y_stage, weight_stage = X, y, sample_weight
599629
else:
600630
# Warn if subsample too small for knots
601631
samples_per_knot = n_stage_samples / (self.knots_.shape[0] + 1)
@@ -604,15 +634,15 @@ def _progressive_fit(self, X, y):
604634

605635
# Subsample for intermediate stages
606636
if self.stratified_sampling:
607-
X_stage, y_stage, _ = self._stratified_subsample(
608-
X, y, n_stage_samples
637+
X_stage, y_stage, weight_stage, _ = self._stratified_subsample(
638+
X, y, n_stage_samples, sample_weight
609639
)
610640
else:
611-
X_stage, y_stage, _ = self._random_subsample(
612-
X, y, n_stage_samples
641+
X_stage, y_stage, weight_stage, _ = self._random_subsample(
642+
X, y, n_stage_samples, sample_weight
613643
)
614644

615-
self._fit(X_stage, y_stage, initial_guess=prev_coefs)
645+
self._fit(X_stage, y_stage, sample_weight=weight_stage, initial_guess=prev_coefs)
616646

617647
self.fitting_history_.append({
618648
'stage': stage,
@@ -631,9 +661,9 @@ def _progressive_fit(self, X, y):
631661
if frac < 1.0 and self._check_convergence(prev_coefs, self.coefficients_):
632662
if self.verbose:
633663
print(f"Early stopping: coefficients converged at stage {stage + 1}")
634-
# Still fit on full data for final refinement, but with fewer iterations expected
664+
# Still fit on full data for final refinement
635665
prev_coefs = self.coefficients_.copy()
636-
self._fit(X, y, initial_guess=prev_coefs)
666+
self._fit(X, y, sample_weight=sample_weight, initial_guess=prev_coefs)
637667
self.fitting_history_.append({
638668
'stage': stage + 1,
639669
'n_samples': n_samples,

src/splinator/monotonic_spline.py

Lines changed: 60 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -125,10 +125,65 @@ def _get_design_matrix(
125125
).astype(float)
126126

127127

128-
def _fit_knots(X, num_knots):
129-
# type: (np.ndarray, int) -> np.ndarray
128+
def _weighted_quantile(values, quantiles, sample_weight=None):
129+
# type: (np.ndarray, np.ndarray, Optional[np.ndarray]) -> np.ndarray
130130
"""
131-
Generates knots by finding `num_knots` quantiles of the given input distribution
131+
Compute weighted quantiles.
132+
133+
Parameters
134+
----------
135+
values : array-like
136+
Values to compute quantiles for.
137+
quantiles : array-like
138+
Quantiles to compute, in [0, 1].
139+
sample_weight : array-like, optional
140+
Weights for each value.
141+
142+
Returns
143+
-------
144+
result : ndarray
145+
Quantile values.
146+
"""
147+
values = np.asarray(values)
148+
quantiles = np.asarray(quantiles)
149+
150+
if sample_weight is None:
151+
return np.quantile(values, quantiles)
152+
153+
sample_weight = np.asarray(sample_weight)
154+
sorted_indices = np.argsort(values)
155+
sorted_values = values[sorted_indices]
156+
sorted_weights = sample_weight[sorted_indices]
157+
cumsum = np.cumsum(sorted_weights)
158+
cumsum_normalized = cumsum / cumsum[-1]
159+
result = np.zeros(len(quantiles))
160+
for i, q in enumerate(quantiles):
161+
idx = np.searchsorted(cumsum_normalized, q)
162+
if idx >= len(sorted_values):
163+
idx = len(sorted_values) - 1
164+
result[i] = sorted_values[idx]
165+
166+
return result
167+
168+
169+
def _fit_knots(X, num_knots, sample_weight=None):
170+
# type: (np.ndarray, int, Optional[np.ndarray]) -> np.ndarray
171+
"""
172+
Generates knots by finding `num_knots` quantiles of the given input distribution.
173+
174+
Parameters
175+
----------
176+
X : ndarray
177+
1-D array of input values.
178+
num_knots : int
179+
Number of knots to generate.
180+
sample_weight : ndarray, optional
181+
Sample weights for weighted quantile computation.
182+
183+
Returns
184+
-------
185+
knots : ndarray
186+
Knot positions at evenly-spaced quantiles.
132187
"""
133188
if len(X.shape) != 1:
134189
raise ValueError("X must be a vector; has shape {}".format(X.shape))
@@ -139,5 +194,6 @@ def _fit_knots(X, num_knots):
139194
)
140195

141196
percentiles = np.linspace(0, 100, num_knots, endpoint=False)[1:]
197+
quantiles = percentiles / 100.0
142198

143-
return np.percentile(X, percentiles)
199+
return _weighted_quantile(X, quantiles, sample_weight)

tests/test_progressive_fitting.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -128,12 +128,13 @@ def test_stratified_sampling_method(self):
128128
model.random_state_ = np.random.RandomState(42)
129129
model.input_score_column_index = 0
130130

131-
X_sub, y_sub, indices = model._stratified_subsample(
131+
X_sub, y_sub, weight_sub, indices = model._stratified_subsample(
132132
self.X, self.y, n_samples=100, n_strata=10
133133
)
134134

135135
self.assertEqual(len(X_sub), 100)
136136
self.assertEqual(len(y_sub), 100)
137+
self.assertIsNone(weight_sub)
137138

138139
# Check coverage
139140
original_range = np.ptp(self.X[:, 0])

0 commit comments

Comments
 (0)