Skip to content

Commit 2615c16

Browse files
robertvavarobertvava
authored andcommitted
PCA init
1 parent 55532b8 commit 2615c16

6 files changed

Lines changed: 554 additions & 19 deletions

File tree

src/hyperview/cli.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -95,9 +95,9 @@ def _build_parser() -> argparse.ArgumentParser:
9595
)
9696
parser.add_argument(
9797
"--method",
98-
choices=["umap"],
98+
choices=["umap", "pca"],
9999
default="umap",
100-
help="Projection method (currently only 'umap')",
100+
help="Projection method: 'umap' (default) or 'pca'",
101101
)
102102
parser.add_argument(
103103
"--geometry",

src/hyperview/core/dataset.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -439,11 +439,12 @@ def compute_visualization(
439439
440440
Args:
441441
space_key: Embedding space to project. If None, uses the first available.
442-
method: Projection method ('umap' supported).
442+
method: Projection method ('umap' or 'pca'). PCA is faster and
443+
deterministic; UMAP captures nonlinear structure better.
443444
geometry: Output geometry type ('euclidean' or 'poincare').
444-
n_neighbors: Number of neighbors for UMAP.
445-
min_dist: Minimum distance for UMAP.
446-
metric: Distance metric for UMAP.
445+
n_neighbors: Number of neighbors for UMAP (ignored for PCA).
446+
min_dist: Minimum distance for UMAP (ignored for PCA).
447+
metric: Distance metric for UMAP (ignored for PCA).
447448
force: Force recomputation even if layout exists.
448449
449450
Returns:

src/hyperview/embeddings/pipelines.py

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,7 @@ def compute_layout(
106106
Args:
107107
storage: Storage backend with embeddings.
108108
space_key: Embedding space to project. If None, uses the first available.
109-
method: Projection method ('umap' supported).
109+
method: Projection method ('umap' or 'pca').
110110
geometry: Output geometry type ('euclidean' or 'poincare').
111111
n_neighbors: Number of neighbors for UMAP.
112112
min_dist: Minimum distance for UMAP.
@@ -122,8 +122,8 @@ def compute_layout(
122122
"""
123123
from hyperview.embeddings.projection import ProjectionEngine
124124

125-
if method != "umap":
126-
raise ValueError(f"Invalid method: {method}. Only 'umap' is supported.")
125+
if method not in ("umap", "pca"):
126+
raise ValueError(f"Invalid method: {method}. Supported methods: 'umap', 'pca'.")
127127

128128
if geometry not in ("euclidean", "poincare"):
129129
raise ValueError(f"Invalid geometry: {geometry}. Must be 'euclidean' or 'poincare'.")
@@ -157,11 +157,15 @@ def compute_layout(
157157
if len(ids) < 3:
158158
raise ValueError(f"Need at least 3 samples for visualization, have {len(ids)}")
159159

160-
layout_params = {
161-
"n_neighbors": n_neighbors,
162-
"min_dist": min_dist,
163-
"metric": metric,
164-
}
160+
if method == "umap":
161+
layout_params = {
162+
"n_neighbors": n_neighbors,
163+
"min_dist": min_dist,
164+
"metric": metric,
165+
}
166+
else:
167+
# PCA is deterministic with no tuning parameters
168+
layout_params = {}
165169
layout_key = make_layout_key(space_key, method, geometry, layout_params)
166170

167171
if not force:

src/hyperview/embeddings/projection.py

Lines changed: 251 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,124 @@
88

99
logger = logging.getLogger(__name__)
1010

11+
_MIN_SAMPLES = 2
12+
_EPS = 1e-7
1113

1214
class ProjectionEngine:
1315
"""Engine for projecting high-dimensional embeddings to 2D."""
1416

17+
def logmap_0_hyperboloid(
18+
self,
19+
embeddings: np.ndarray,
20+
curvature: float = 1.0,
21+
) -> np.ndarray:
22+
"""Logarithmic map from the hyperboloid to the tangent space at the origin.
23+
24+
Maps points on the hyperboloid H^D (Lorentz model) to the tangent space
25+
T_o H^D at the origin o = (1, 0, ..., 0). The tangent space is Euclidean
26+
R^D, so standard PCA can be applied afterward.
27+
28+
Simplification at the origin:
29+
<o, x>_L = -x_0 (Lorentz inner product with north pole)
30+
d_L(o, x) = arccosh(x_0)
31+
spatial = x[1:]
32+
tangent = d_L * spatial / ||spatial|| (if ||spatial|| > 0)
33+
34+
Args:
35+
embeddings: (N, D+1) array on the hyperboloid (first column is time-like).
36+
curvature: Positive curvature parameter c (sectional curvature is -c).
37+
38+
Returns:
39+
(N, D) array of tangent vectors at the origin.
40+
"""
41+
if embeddings.ndim != 2 or embeddings.shape[1] < 2:
42+
raise ValueError(
43+
"embeddings must have shape (N, D+1) with D >= 1, "
44+
f"got {embeddings.shape}"
45+
)
46+
47+
c = float(curvature)
48+
sqrt_c = np.sqrt(c)
49+
50+
t = embeddings[:, 0] # time component, shape (N,)
51+
x = embeddings[:, 1:] # spatial components, shape (N, D)
52+
53+
# Hyperbolic distance from origin: d = (1/sqrt(c)) * arccosh(sqrt(c) * t)
54+
# For c=1 this simplifies to arccosh(t).
55+
arg = np.clip(sqrt_c * t, 1.0 + _EPS, None) # ensure > 1 for arccosh
56+
theta = np.arccosh(arg) / sqrt_c # shape (N,)
57+
58+
norm_x = np.linalg.norm(x, axis=1) # shape (N,)
59+
# For points very close to the origin, norm_x ≈ 0 and theta ≈ 0.
60+
# The tangent vector is simply 0 in that case.
61+
safe_norm = np.where(norm_x > _EPS, norm_x, 1.0)
62+
scale = np.where(norm_x > _EPS, theta / safe_norm, 0.0)
63+
64+
tangent = x * scale[:, np.newaxis]
65+
66+
logger.debug(
67+
"logmap_0: input %s -> tangent %s, max_dist=%.4f",
68+
embeddings.shape,
69+
tangent.shape,
70+
float(theta.max()) if len(theta) else 0.0,
71+
)
72+
return tangent.astype(np.float32)
73+
74+
def expmap_0_hyperboloid(
75+
self,
76+
tangent_vectors: np.ndarray,
77+
curvature: float = 1.0,
78+
) -> np.ndarray:
79+
"""Exponential map from the tangent space at the origin back to the hyperboloid.
80+
81+
Inverse of ``logmap_0_hyperboloid``.
82+
83+
expmap_0(v):
84+
norm_v = ||v||
85+
t = cosh(sqrt(c) * norm_v) / sqrt(c) ... but for unit hyperboloid (c=1): t = cosh(norm_v)
86+
spatial = sinh(sqrt(c) * norm_v) / (sqrt(c) * norm_v) * v
87+
88+
For c = 1:
89+
t = cosh(norm_v)
90+
spatial = sinh(norm_v) * v / norm_v
91+
92+
Args:
93+
tangent_vectors: (N, D) array in the tangent space at the origin.
94+
curvature: Positive curvature parameter c.
95+
96+
Returns:
97+
(N, D+1) array of points on the hyperboloid.
98+
"""
99+
if tangent_vectors.ndim != 2:
100+
raise ValueError(
101+
f"tangent_vectors must be 2-D, got shape {tangent_vectors.shape}"
102+
)
103+
104+
c = float(curvature)
105+
sqrt_c = np.sqrt(c)
106+
107+
norm_v = np.linalg.norm(tangent_vectors, axis=1) # (N,)
108+
scaled_norm = sqrt_c * norm_v
109+
110+
t = np.cosh(scaled_norm) / sqrt_c # time component
111+
# sinh(x)/x -> 1 as x -> 0
112+
safe_scaled = np.where(scaled_norm > _EPS, scaled_norm, 1.0)
113+
coeff = np.where(
114+
scaled_norm > _EPS,
115+
np.sinh(scaled_norm) / safe_scaled,
116+
1.0,
117+
)
118+
spatial = tangent_vectors * coeff[:, np.newaxis]
119+
120+
result = np.column_stack([t, spatial])
121+
122+
logger.debug(
123+
"expmap_0: tangent %s -> hyperboloid %s",
124+
tangent_vectors.shape,
125+
result.shape,
126+
)
127+
return result.astype(np.float32)
128+
15129
def to_poincare_ball(
16130
self,
17131
hyperboloid_embeddings: np.ndarray,
@@ -52,7 +166,6 @@ def to_poincare_ball(
52166
# Rescale to unit ball: u = u_R / R = sqrt(c) * u_R
53167
u = u_R / R
54168

55-
# Numerical guard: ensure inside the unit ball
56169
radii = np.linalg.norm(u, axis=1)
57170
mask = radii >= clamp_radius
58171
if np.any(mask):
@@ -77,14 +190,14 @@ def project(
77190
78191
This separates two concerns:
79192
1) Geometry/model transforms for the *input* embeddings (e.g. hyperboloid -> Poincaré)
80-
2) Dimensionality reduction / layout (currently UMAP)
193+
2) Dimensionality reduction / layout (UMAP or PCA)
81194
82195
Args:
83196
embeddings: Input embeddings (N x D) or hyperboloid (N x D+1).
84197
input_geometry: Geometry/model of the input embeddings (euclidean, hyperboloid).
85198
output_geometry: Geometry of the output coordinates (euclidean, poincare).
86199
curvature: Curvature parameter for hyperbolic embeddings (positive c).
87-
method: Layout method (currently only 'umap').
200+
method: Layout method ('umap' or 'pca').
88201
n_neighbors: UMAP neighbors.
89202
min_dist: UMAP min_dist.
90203
metric: Input metric (used for euclidean inputs).
@@ -93,8 +206,18 @@ def project(
93206
Returns:
94207
2D coordinates (N x 2).
95208
"""
96-
if method != "umap":
97-
raise ValueError(f"Invalid method: {method}. Only 'umap' is supported.")
209+
if method not in ("umap", "pca"):
210+
raise ValueError(
211+
f"Invalid method: {method}. Supported methods: 'umap', 'pca'."
212+
)
213+
214+
if method == "pca":
215+
return self._project_pca(
216+
embeddings,
217+
input_geometry=input_geometry,
218+
output_geometry=output_geometry,
219+
curvature=curvature or 1.0,
220+
)
98221

99222
prepared = embeddings
100223
prepared_metric: str = metric
@@ -127,6 +250,129 @@ def project(
127250
f"Invalid output_geometry: {output_geometry}. Must be 'euclidean' or 'poincare'."
128251
)
129252

253+
254+
def _project_pca(
255+
self,
256+
embeddings: np.ndarray,
257+
*,
258+
input_geometry: str,
259+
output_geometry: str,
260+
curvature: float = 1.0,
261+
) -> np.ndarray:
262+
"""Dispatch PCA projection based on input/output geometry.
263+
264+
Handles four combinations:
265+
- euclidean -> euclidean: Standard PCA, normalize to [-1, 1]
266+
- euclidean -> poincare: Standard PCA, tanh disk mapping
267+
- hyperboloid -> euclidean: logmap_0, PCA, normalize to [-1, 1]
268+
- hyperboloid -> poincare: logmap_0, PCA to 2D, expmap_0 back, stereographic to disk
269+
"""
270+
n_samples = embeddings.shape[0]
271+
if n_samples < _MIN_SAMPLES:
272+
raise ValueError(
273+
f"PCA requires at least {_MIN_SAMPLES} samples, got {n_samples}."
274+
)
275+
276+
if not np.all(np.isfinite(embeddings)):
277+
raise ValueError("Embeddings contain NaN or Inf values.")
278+
279+
logger.info(
280+
"PCA: input_geometry=%s, output_geometry=%s, N=%d, D=%d",
281+
input_geometry,
282+
output_geometry,
283+
n_samples,
284+
embeddings.shape[1],
285+
)
286+
287+
if input_geometry == "hyperboloid":
288+
data = self.logmap_0_hyperboloid(embeddings, curvature=curvature)
289+
else:
290+
data = embeddings
291+
292+
pca_2d, explained = self._pca_svd(data, n_components=2)
293+
294+
logger.info("PCA: explained variance ratio: [%.4f, %.4f]", *explained)
295+
296+
if output_geometry == "poincare":
297+
if input_geometry == "hyperboloid":
298+
# Tangent PCA -> expmap back to H^2 -> stereographic to Poincaré disk
299+
hyp_points = self.expmap_0_hyperboloid(pca_2d, curvature=curvature)
300+
coords = self.to_poincare_ball(hyp_points, curvature=curvature)
301+
coords = self._center_poincare(coords)
302+
coords = self._scale_poincare(coords, factor=0.65)
303+
else:
304+
# Euclidean PCA -> tanh disk mapping
305+
coords = self._pca_to_poincare_disk(pca_2d)
306+
return coords.astype(np.float32)
307+
308+
return self._normalize_coords(pca_2d).astype(np.float32)
309+
310+
def _pca_svd(
311+
self,
312+
data: np.ndarray,
313+
n_components: int = 2,
314+
) -> tuple[np.ndarray, np.ndarray]:
315+
"""Compute PCA via NumPy SVD. No scikit-learn dependency.
316+
317+
Args:
318+
data: (N, D) array of data points.
319+
n_components: Number of principal components to keep.
320+
321+
Returns:
322+
Tuple of:
323+
- (N, n_components) projected data
324+
- (n_components,) explained variance ratios
325+
"""
326+
data = data.astype(np.float64)
327+
mean = data.mean(axis=0)
328+
centered = data - mean
329+
330+
# Economy SVD (full_matrices=False) is O(N * D * min(N,D))
331+
_U, S, Vt = np.linalg.svd(centered, full_matrices=False)
332+
333+
projected = centered @ Vt[:n_components].T
334+
335+
variance = S**2 / max(len(data) - 1, 1)
336+
total_var = variance.sum()
337+
if total_var > 0:
338+
explained = variance[:n_components] / total_var
339+
else:
340+
explained = np.zeros(n_components)
341+
342+
return projected.astype(np.float32), explained
343+
344+
def _pca_to_poincare_disk(
345+
self,
346+
coords_2d: np.ndarray,
347+
alpha: float = 1.0,
348+
) -> np.ndarray:
349+
"""Map 2D Euclidean PCA output into the Poincaré disk via tanh scaling.
350+
351+
tanh(alpha * r) * z / r for each point z with r = ||z||
352+
353+
This is a homeomorphism R^2 -> B^2 (open unit ball).
354+
355+
Args:
356+
coords_2d: (N, 2) PCA coordinates.
357+
alpha: Scaling factor controlling radial compression (default 1.0).
358+
359+
Returns:
360+
(N, 2) Poincaré disk coordinates.
361+
"""
362+
max_abs = np.abs(coords_2d).max()
363+
if max_abs > _EPS:
364+
coords_2d = coords_2d / max_abs
365+
366+
radii = np.linalg.norm(coords_2d, axis=1)
367+
safe_radii = np.where(radii > _EPS, radii, 1.0)
368+
mapped_radii = np.tanh(alpha * radii)
369+
scale = np.where(radii > _EPS, mapped_radii / safe_radii, 0.0)
370+
poincare = coords_2d * scale[:, np.newaxis]
371+
poincare = self._center_poincare(poincare)
372+
poincare = self._scale_poincare(poincare, factor=0.65)
373+
374+
return poincare.astype(np.float32)
375+
130376
def project_umap(
131377
self,
132378
embeddings: np.ndarray,

tests/__init__.py

Whitespace-only changes.

0 commit comments

Comments
 (0)