Merge branch 'main' of https://github.com/Leona-LYT/ReHLine-python

Leona-LYT · Leona-LYT · commit fc7fa595094c · 2026-04-09T20:33:31.000+08:00
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -30,10 +30,10 @@ jobs:
           python-version: ${{ matrix.python-version }}
 
       - name: Install package and test dependencies
-        run: pip install ".[test]" pytest-cov
+        run: pip install -e ".[test]" pytest-cov
 
       - name: Run tests
-        run: pytest -v --tb=short --cov=rehline --cov-report=xml --cov-report=term
+        run: pytest -v --tb=short --cov-report=xml
 
       - name: Upload coverage to Codecov
         if: success()
diff --git a/.gitignore b/.gitignore
@@ -14,6 +14,11 @@ rehline.egg-info/
 *.log
 env/
 
+# Coverage reports
+.coverage
+coverage.xml
+htmlcov/
+
 # Files during package building
 eigen-3.4.0/
 eigen-5.0.1/
diff --git a/pyproject.toml b/pyproject.toml
@@ -34,18 +34,37 @@ py-modules = ["build"]
 build = "cp*"
 
 [project.optional-dependencies]
-test = ["pytest >= 7.0", "pandas >= 1.5.0"]
+test = ["pytest >= 7.0", "pandas >= 1.5.0", "pytest-cov"]
 
 [tool.pytest.ini_options]
 testpaths = ["tests"]
 python_files = ["test_*.py"]
 python_classes = ["Test*"]
 python_functions = ["test_*"]
+addopts = "--cov=rehline --cov-report=term-missing"
 
 [build-system]
 requires = ["requests ~= 2.31.0", "pybind11 ~= 3.0.0", "setuptools >= 80.0.0", "wheel >= 0.42.0", "setuptools-scm >= 8.0"]
 build-backend = "setuptools.build_meta"
 
+# --- Coverage ---
+[tool.coverage.run]
+source = ["rehline"]
+omit = ["rehline/_internal*"]
+relative_files = true
+
+[tool.coverage.paths]
+source = [
+    "rehline/",
+    "*/site-packages/rehline/",
+]
+
+[tool.coverage.report]
+exclude_lines = [
+    "pragma: no cover",
+    "if TYPE_CHECKING:",
+]
+
 # --- Ruff (linter + formatter) ---
 [tool.ruff]
 target-version = "py310"
diff --git a/rehline/_mf_class.py b/rehline/_mf_class.py
@@ -3,7 +3,6 @@
 import warnings
 
 import numpy as np
-import pandas as pd
 from sklearn.base import BaseEstimator
 from sklearn.exceptions import ConvergenceWarning
 from sklearn.utils.validation import _check_sample_weight
@@ -199,42 +198,22 @@ def __init__(
         tol_CD=1e-4,
         verbose=0,
     ):
-        # check input
-        errors = []
-        checks = [
-            (0 < rho < 1, "rho must be between 0 and 1"),
-            (C > 0, "C must be positive"),
-            (tol_CD > 0, "tol_CD must be positive"),
-            (tol > 0, "tol must be positive"),
-        ]
-        for condition, error_msg in checks:
-            if not condition:
-                errors.append(error_msg)
-        if errors:
-            raise ValueError("; ".join(errors))
-
         # parameter initialization
-        ## -----------------------------basic perameters-----------------------------
+        ## -----------------------------basic parameters-----------------------------
         self.n_users = n_users
         self.n_items = n_items
         self.loss = loss
         self.constraint_user = constraint_user if constraint_user is not None else []
         self.constraint_item = constraint_item if constraint_item is not None else []
         self.biased = biased
-        ## -----------------------------hyper perameters-----------------------------
+        ## -----------------------------hyper parameters-----------------------------
         self.rank = rank
         self.C = C
         self.rho = rho
-        ## --------------------------coefficient perameters--------------------------
+        ## -------------------------initialization parameters------------------------
         self.init_mean = init_mean
         self.init_sd = init_sd
         self.random_state = random_state
-        if self.random_state:
-            np.random.seed(random_state)
-        self.P = np.random.normal(loc=init_mean, scale=init_sd, size=(n_users, rank))
-        self.Q = np.random.normal(loc=init_mean, scale=init_sd, size=(n_items, rank))
-        self.bu = np.zeros(n_users) if self.biased else None
-        self.bi = np.zeros(n_items) if self.biased else None
         ## ----------------------------fitting parameters----------------------------
         self.max_iter_CD = max_iter_CD
         self.tol_CD = tol_CD
@@ -266,17 +245,62 @@ def fit(self, X, y, sample_weight=None):
             An instance of the estimator.
 
         """
+        # check input
+        ## parameter validation
+        errors = []
+        checks = [
+            (0 < self.rho < 1, "rho must be between 0 and 1"),
+            (self.C > 0, "C must be positive"),
+            (self.tol_CD > 0, "tol_CD must be positive"),
+            (self.tol > 0, "tol must be positive"),
+        ]
+        for condition, error_msg in checks:
+            if not condition:
+                errors.append(error_msg)
+        if errors:
+            raise ValueError("; ".join(errors))
+        
+        ## data validation
+        X = np.asarray(X)
+        y = np.asarray(y)
+        if X.ndim != 2 or X.shape[1] != 2:
+            raise ValueError("X must have shape (n_ratings, 2)")
+        if X.shape[0] != len(y):
+            raise ValueError("X and y must have the same number of samples")
+        user_ids = X[:, 0].astype(int)
+        item_ids = X[:, 1].astype(int)
+        if np.any(user_ids < 0) or np.any(user_ids >= self.n_users):
+            raise ValueError("User IDs must be in [0, n_users)")
+        if np.any(item_ids < 0) or np.any(item_ids >= self.n_items):
+            raise ValueError("Item IDs must be in [0, n_items)")
+        
         # Preparation
-        self.n_ratings = len(y)
-        self.history = np.nan * np.zeros((self.max_iter_CD + 1, 2))
+        ## number of training observations
+        self.n_ratings = len(y) 
+        ## convergence trace 
+        self.history = np.full((self.max_iter_CD + 1, 2), np.nan)
+        ## sample weights
         self.sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)
-
-        X_df = pd.DataFrame(X, columns=["user", "item"])
-        uidx_map = X_df.groupby("user").indices
-        iidx_map = X_df.groupby("item").indices
-        self.Iu = [uidx_map.get(u, np.array([], dtype=int)) for u in range(self.n_users)]
-        self.Ui = [iidx_map.get(i, np.array([], dtype=int)) for i in range(self.n_items)]
-
+        ## random number generator
+        rng = np.random.default_rng(self.random_state) 
+
+        ## indices to locate interactions given a user or item id
+        ### user side: Iu[u] = row indices of interactions by user u
+        sort_idx_users = np.argsort(X[:, 0], kind='stable')
+        sorted_users = X[sort_idx_users, 0]
+        counts = np.unique(sorted_users, return_counts=True)[1]
+        self.Iu = [np.array([], dtype=int) for _ in range(self.n_users)]
+        for u, idxs in zip(sorted_users[np.cumsum(counts) - counts], np.split(sort_idx_users, np.cumsum(counts)[:-1])):
+            self.Iu[u] = idxs
+        ### item side: Ui[i] = row indices of interactions that involve item i
+        sort_idx_items = np.argsort(X[:, 1], kind='stable')
+        sorted_items = X[sort_idx_items, 1]
+        counts = np.unique(sorted_items, return_counts=True)[1]
+        self.Ui = [np.array([], dtype=int) for _ in range(self.n_items)]
+        for i, idxs in zip(sorted_items[np.cumsum(counts) - counts], np.split(sort_idx_items, np.cumsum(counts)[:-1])):
+            self.Ui[i] = idxs
+
+        ## effective C when updating user/item blocks (to match rehline formulation: C * PLQ_loss + 0.5 * l_2)
         C_user = self.C * self.n_users / (self.rho) / 2
         C_item = self.C * self.n_items / (1 - self.rho) / 2
 
@@ -289,6 +313,12 @@ def fit(self, X, y, sample_weight=None):
                 )
             )
 
+        # Model Initialization
+        self.P = rng.normal(loc=self.init_mean, scale=self.init_sd, size=(self.n_users, self.rank))
+        self.Q = rng.normal(loc=self.init_mean, scale=self.init_sd, size=(self.n_items, self.rank))
+        self.bu = np.zeros(self.n_users) if self.biased else None
+        self.bi = np.zeros(self.n_items) if self.biased else None
+
         # CD algorithm
         self.history[0] = self.obj(X, y)
         for iter_idx in range(self.max_iter_CD):
@@ -435,7 +465,7 @@ def fit(self, X, y, sample_weight=None):
                 obj = f"{self.history[iter_idx + 1][1]:.6f}"
                 print(f"{iter_idx + 1:<12} {mean_loss:<20} {obj:<20}")
 
-            if obj_diff < self.tol_CD:
+            if abs(obj_diff) < self.tol_CD:
                 break
 
         return self
@@ -496,9 +526,10 @@ def obj(self, X, y):
             item_penalty = np.sum(self.Q**2) * (1 - self.rho) / self.n_items
             penalty = user_penalty + item_penalty
 
-        y_pred = self.decision_function(X)
-        U, V, Tau, S, T = _make_loss_rehline_param(loss=self.loss, X=X, y=y)
+        X_dummy = np.ones((len(y), 1)) # not used in loss computation, only shape matters for loss param construction
+        U, V, Tau, S, T = _make_loss_rehline_param(loss=self.loss, X=X_dummy, y=y)
         loss = ReHLoss(U, V, S, T, Tau)
+        y_pred = self.decision_function(X)
         loss_term = loss(y_pred)
 
         return loss_term, self.C * loss_term + penalty
diff --git a/tests/test_mf.py b/tests/test_mf.py
@@ -110,6 +110,169 @@ def test_mf_hinge_classification_fits(mf_data):
     assert accuracy > 0.5, f"Hinge-loss MF accuracy ({accuracy:.3f}) should be > 0.5"
 
 
+def test_mf_data_validation_errors():
+    """Test data validation raises appropriate errors."""
+    # Test X with wrong shape (not 2 columns)
+    with pytest.raises(ValueError, match="X must have shape"):
+        model = plqMF_Ridge(n_users=10, n_items=10, loss={"name": "mae"})
+        model.fit(np.array([[0, 0, 0]]), np.array([1.0]))
+
+    # Test X and y mismatch
+    with pytest.raises(ValueError, match="X and y must have the same number"):
+        model = plqMF_Ridge(n_users=10, n_items=10, loss={"name": "mae"})
+        model.fit(np.array([[0, 0]]), np.array([1.0, 2.0]))
+
+    # Test invalid user ID (negative)
+    with pytest.raises(ValueError, match="User IDs must be in"):
+        model = plqMF_Ridge(n_users=10, n_items=10, loss={"name": "mae"})
+        model.fit(np.array([[-1, 0]]), np.array([1.0]))
+
+    # Test invalid user ID (>= n_users)
+    with pytest.raises(ValueError, match="User IDs must be in"):
+        model = plqMF_Ridge(n_users=10, n_items=10, loss={"name": "mae"})
+        model.fit(np.array([[10, 0]]), np.array([1.0]))
+
+    # Test invalid item ID (negative)
+    with pytest.raises(ValueError, match="Item IDs must be in"):
+        model = plqMF_Ridge(n_users=10, n_items=10, loss={"name": "mae"})
+        model.fit(np.array([[0, -1]]), np.array([1.0]))
+
+    # Test invalid item ID (>= n_items)
+    with pytest.raises(ValueError, match="Item IDs must be in"):
+        model = plqMF_Ridge(n_users=10, n_items=10, loss={"name": "mae"})
+        model.fit(np.array([[0, 10]]), np.array([1.0]))
+
+
+def test_mf_cold_start_users_items():
+    """Test cold start handling: users/items with no interactions."""
+    # Create data where user 0 and item 0 have no interactions
+    # n_users=3, n_items=3, but only users 1,2 and items 1,2 interact
+    X = np.array([[1, 1], [1, 2], [2, 1], [2, 2]])
+    y = np.array([3.0, 4.0, 2.0, 5.0])
+
+    model = plqMF_Ridge(
+        n_users=3,
+        n_items=3,
+        loss={"name": "mae"},
+        rank=2,
+        C=0.1,
+        max_iter=1000,
+        tol=0.01,
+    )
+    model.fit(X, y)
+
+    # Cold start user (user 0) should have zero factors and bias
+    assert np.allclose(model.P[0, :], 0.0)
+    assert model.bu[0] == 0.0
+
+    # Cold start item (item 0) should have zero factors and bias
+    assert np.allclose(model.Q[0, :], 0.0)
+    assert model.bi[0] == 0.0
+
+
+def test_mf_biased_false():
+    """Test plqMF_Ridge with biased=False (no bias terms)."""
+    n_users, n_items = 20, 30
+    X = np.array([[0, 0], [0, 1], [1, 0], [1, 1], [2, 2], [3, 3]])
+    y = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
+
+    model = plqMF_Ridge(
+        n_users=n_users,
+        n_items=n_items,
+        loss={"name": "mae"},
+        biased=False,
+        rank=3,
+        C=0.1,
+        max_iter=1000,
+        tol=0.01,
+    )
+    model.fit(X, y)
+
+    # bu and bi should be None when biased=False
+    assert model.bu is None
+    assert model.bi is None
+
+    # decision_function should work without biases
+    scores = model.decision_function(X)
+    assert scores.shape == (len(X),)
+
+    # obj should work without biases
+    loss_term, obj_val = model.obj(X, y)
+    assert np.isfinite(loss_term)
+    assert np.isfinite(obj_val)
+
+
+def test_mf_verbose_output(capsys):
+    """Test that verbose=1 prints the CD iteration progress table to stdout."""
+    X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
+    y = np.array([1.0, 2.0, 3.0, 4.0])
+
+    # Test verbose=1 (CD iteration progress)
+    model = plqMF_Ridge(
+        n_users=2,
+        n_items=2,
+        loss={"name": "mae"},
+        rank=2,
+        C=0.1,
+        max_iter=500,
+        tol=0.01,
+        max_iter_CD=2,
+        verbose=1,
+    )
+    model.fit(X, y)
+    captured = capsys.readouterr()
+    assert "Iteration" in captured.out
+    assert "Average Loss" in captured.out
+
+
+def test_mf_convergence_warning():
+    """Test convergence warning when max_iter is too small."""
+    from sklearn.exceptions import ConvergenceWarning
+
+    X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
+    y = np.array([1.0, 2.0, 3.0, 4.0])
+
+    model = plqMF_Ridge(
+        n_users=2,
+        n_items=2,
+        loss={"name": "mae"},
+        rank=2,
+        C=0.1,
+        max_iter=1,  # Only 1 iteration to guarantee non-convergence
+        tol=1e-10,
+        max_iter_CD=1,
+    )
+    with pytest.warns(ConvergenceWarning, match="ReHLine failed to converge"):
+        model.fit(X, y)
+
+
+def test_mf_param_validation_errors():
+    """Test parameter validation raises appropriate errors."""
+    # Test invalid rho (must be between 0 and 1)
+    with pytest.raises(ValueError, match="rho must be between 0 and 1"):
+        model = plqMF_Ridge(n_users=10, n_items=10, loss={"name": "mae"}, rho=0.0)
+        model.fit(np.array([[0, 0]]), np.array([1.0]))
+
+    with pytest.raises(ValueError, match="rho must be between 0 and 1"):
+        model = plqMF_Ridge(n_users=10, n_items=10, loss={"name": "mae"}, rho=1.0)
+        model.fit(np.array([[0, 0]]), np.array([1.0]))
+
+    # Test invalid C (must be positive)
+    with pytest.raises(ValueError, match="C must be positive"):
+        model = plqMF_Ridge(n_users=10, n_items=10, loss={"name": "mae"}, C=0.0)
+        model.fit(np.array([[0, 0]]), np.array([1.0]))
+
+    # Test invalid tol_CD (must be positive)
+    with pytest.raises(ValueError, match="tol_CD must be positive"):
+        model = plqMF_Ridge(n_users=10, n_items=10, loss={"name": "mae"}, tol_CD=0.0)
+        model.fit(np.array([[0, 0]]), np.array([1.0]))
+
+    # Test invalid tol (must be positive)
+    with pytest.raises(ValueError, match="tol must be positive"):
+        model = plqMF_Ridge(n_users=10, n_items=10, loss={"name": "mae"}, tol=0.0)
+        model.fit(np.array([[0, 0]]), np.array([1.0]))
+
+
 def test_mf_nonneg_constraint(mf_data):
     """plqMF_Ridge with non-negative constraints should produce non-negative factors."""
     d = mf_data