88
99logger = logging .getLogger (__name__ )
1010
11+ _MIN_SAMPLES = 2
12+ _EPS = 1e-7
1113
1214class ProjectionEngine :
1315 """Engine for projecting high-dimensional embeddings to 2D."""
1416
17+ def logmap_0_hyperboloid (
18+ self ,
19+ embeddings : np .ndarray ,
20+ curvature : float = 1.0 ,
21+ ) -> np .ndarray :
22+ """Logarithmic map from the hyperboloid to the tangent space at the origin.
23+
24+ Maps points on the hyperboloid H^D (Lorentz model) to the tangent space
25+ T_o H^D at the origin o = (1, 0, ..., 0). The tangent space is Euclidean
26+ R^D, so standard PCA can be applied afterward.
27+
28+ Simplification at the origin:
29+ <o, x>_L = -x_0 (Lorentz inner product with north pole)
30+ d_L(o, x) = arccosh(x_0)
31+ spatial = x[1:]
32+ tangent = d_L * spatial / ||spatial|| (if ||spatial|| > 0)
33+
34+ Args:
35+ embeddings: (N, D+1) array on the hyperboloid (first column is time-like).
36+ curvature: Positive curvature parameter c (sectional curvature is -c).
37+
38+ Returns:
39+ (N, D) array of tangent vectors at the origin.
40+ """
41+ if embeddings .ndim != 2 or embeddings .shape [1 ] < 2 :
42+ raise ValueError (
43+ "embeddings must have shape (N, D+1) with D >= 1, "
44+ f"got { embeddings .shape } "
45+ )
46+
47+ c = float (curvature )
48+ sqrt_c = np .sqrt (c )
49+
50+ t = embeddings [:, 0 ] # time component, shape (N,)
51+ x = embeddings [:, 1 :] # spatial components, shape (N, D)
52+
53+ # Hyperbolic distance from origin: d = (1/sqrt(c)) * arccosh(sqrt(c) * t)
54+ # For c=1 this simplifies to arccosh(t).
55+ arg = np .clip (sqrt_c * t , 1.0 + _EPS , None ) # ensure > 1 for arccosh
56+ theta = np .arccosh (arg ) / sqrt_c # shape (N,)
57+
58+ norm_x = np .linalg .norm (x , axis = 1 ) # shape (N,)
59+ # For points very close to the origin, norm_x ≈ 0 and theta ≈ 0.
60+ # The tangent vector is simply 0 in that case.
61+ safe_norm = np .where (norm_x > _EPS , norm_x , 1.0 )
62+ scale = np .where (norm_x > _EPS , theta / safe_norm , 0.0 )
63+
64+ tangent = x * scale [:, np .newaxis ]
65+
66+ logger .debug (
67+ "logmap_0: input %s -> tangent %s, max_dist=%.4f" ,
68+ embeddings .shape ,
69+ tangent .shape ,
70+ float (theta .max ()) if len (theta ) else 0.0 ,
71+ )
72+ return tangent .astype (np .float32 )
73+
74+ def expmap_0_hyperboloid (
75+ self ,
76+ tangent_vectors : np .ndarray ,
77+ curvature : float = 1.0 ,
78+ ) -> np .ndarray :
79+ """Exponential map from the tangent space at the origin back to the hyperboloid.
80+
81+ Inverse of ``logmap_0_hyperboloid``.
82+
83+ expmap_0(v):
84+ norm_v = ||v||
85+ t = cosh(sqrt(c) * norm_v) / sqrt(c) ... but for unit hyperboloid (c=1): t = cosh(norm_v)
86+ spatial = sinh(sqrt(c) * norm_v) / (sqrt(c) * norm_v) * v
87+
88+ For c = 1:
89+ t = cosh(norm_v)
90+ spatial = sinh(norm_v) * v / norm_v
91+
92+ Args:
93+ tangent_vectors: (N, D) array in the tangent space at the origin.
94+ curvature: Positive curvature parameter c.
95+
96+ Returns:
97+ (N, D+1) array of points on the hyperboloid.
98+ """
99+ if tangent_vectors .ndim != 2 :
100+ raise ValueError (
101+ f"tangent_vectors must be 2-D, got shape { tangent_vectors .shape } "
102+ )
103+
104+ c = float (curvature )
105+ sqrt_c = np .sqrt (c )
106+
107+ norm_v = np .linalg .norm (tangent_vectors , axis = 1 ) # (N,)
108+ scaled_norm = sqrt_c * norm_v
109+
110+ t = np .cosh (scaled_norm ) / sqrt_c # time component
111+ # sinh(x)/x -> 1 as x -> 0
112+ safe_scaled = np .where (scaled_norm > _EPS , scaled_norm , 1.0 )
113+ coeff = np .where (
114+ scaled_norm > _EPS ,
115+ np .sinh (scaled_norm ) / safe_scaled ,
116+ 1.0 ,
117+ )
118+ spatial = tangent_vectors * coeff [:, np .newaxis ]
119+
120+ result = np .column_stack ([t , spatial ])
121+
122+ logger .debug (
123+ "expmap_0: tangent %s -> hyperboloid %s" ,
124+ tangent_vectors .shape ,
125+ result .shape ,
126+ )
127+ return result .astype (np .float32 )
128+
15129 def to_poincare_ball (
16130 self ,
17131 hyperboloid_embeddings : np .ndarray ,
@@ -52,7 +166,6 @@ def to_poincare_ball(
52166 # Rescale to unit ball: u = u_R / R = sqrt(c) * u_R
53167 u = u_R / R
54168
55- # Numerical guard: ensure inside the unit ball
56169 radii = np .linalg .norm (u , axis = 1 )
57170 mask = radii >= clamp_radius
58171 if np .any (mask ):
@@ -77,14 +190,14 @@ def project(
77190
78191 This separates two concerns:
79192 1) Geometry/model transforms for the *input* embeddings (e.g. hyperboloid -> Poincaré)
80- 2) Dimensionality reduction / layout (currently UMAP)
193+ 2) Dimensionality reduction / layout (UMAP or PCA )
81194
82195 Args:
83196 embeddings: Input embeddings (N x D) or hyperboloid (N x D+1).
84197 input_geometry: Geometry/model of the input embeddings (euclidean, hyperboloid).
85198 output_geometry: Geometry of the output coordinates (euclidean, poincare).
86199 curvature: Curvature parameter for hyperbolic embeddings (positive c).
87- method: Layout method (currently only 'umap ').
200+ method: Layout method ('umap' or 'pca ').
88201 n_neighbors: UMAP neighbors.
89202 min_dist: UMAP min_dist.
90203 metric: Input metric (used for euclidean inputs).
@@ -93,8 +206,18 @@ def project(
93206 Returns:
94207 2D coordinates (N x 2).
95208 """
96- if method != "umap" :
97- raise ValueError (f"Invalid method: { method } . Only 'umap' is supported." )
209+ if method not in ("umap" , "pca" ):
210+ raise ValueError (
211+ f"Invalid method: { method } . Supported methods: 'umap', 'pca'."
212+ )
213+
214+ if method == "pca" :
215+ return self ._project_pca (
216+ embeddings ,
217+ input_geometry = input_geometry ,
218+ output_geometry = output_geometry ,
219+ curvature = curvature or 1.0 ,
220+ )
98221
99222 prepared = embeddings
100223 prepared_metric : str = metric
@@ -127,6 +250,129 @@ def project(
127250 f"Invalid output_geometry: { output_geometry } . Must be 'euclidean' or 'poincare'."
128251 )
129252
253+
254+ def _project_pca (
255+ self ,
256+ embeddings : np .ndarray ,
257+ * ,
258+ input_geometry : str ,
259+ output_geometry : str ,
260+ curvature : float = 1.0 ,
261+ ) -> np .ndarray :
262+ """Dispatch PCA projection based on input/output geometry.
263+
264+ Handles four combinations:
265+ - euclidean -> euclidean: Standard PCA, normalize to [-1, 1]
266+ - euclidean -> poincare: Standard PCA, tanh disk mapping
267+ - hyperboloid -> euclidean: logmap_0, PCA, normalize to [-1, 1]
268+ - hyperboloid -> poincare: logmap_0, PCA to 2D, expmap_0 back, stereographic to disk
269+ """
270+ n_samples = embeddings .shape [0 ]
271+ if n_samples < _MIN_SAMPLES :
272+ raise ValueError (
273+ f"PCA requires at least { _MIN_SAMPLES } samples, got { n_samples } ."
274+ )
275+
276+ if not np .all (np .isfinite (embeddings )):
277+ raise ValueError ("Embeddings contain NaN or Inf values." )
278+
279+ logger .info (
280+ "PCA: input_geometry=%s, output_geometry=%s, N=%d, D=%d" ,
281+ input_geometry ,
282+ output_geometry ,
283+ n_samples ,
284+ embeddings .shape [1 ],
285+ )
286+
287+ if input_geometry == "hyperboloid" :
288+ data = self .logmap_0_hyperboloid (embeddings , curvature = curvature )
289+ else :
290+ data = embeddings
291+
292+ pca_2d , explained = self ._pca_svd (data , n_components = 2 )
293+
294+ logger .info ("PCA: explained variance ratio: [%.4f, %.4f]" , * explained )
295+
296+ if output_geometry == "poincare" :
297+ if input_geometry == "hyperboloid" :
298+ # Tangent PCA -> expmap back to H^2 -> stereographic to Poincaré disk
299+ hyp_points = self .expmap_0_hyperboloid (pca_2d , curvature = curvature )
300+ coords = self .to_poincare_ball (hyp_points , curvature = curvature )
301+ coords = self ._center_poincare (coords )
302+ coords = self ._scale_poincare (coords , factor = 0.65 )
303+ else :
304+ # Euclidean PCA -> tanh disk mapping
305+ coords = self ._pca_to_poincare_disk (pca_2d )
306+ return coords .astype (np .float32 )
307+
308+ return self ._normalize_coords (pca_2d ).astype (np .float32 )
309+
310+ def _pca_svd (
311+ self ,
312+ data : np .ndarray ,
313+ n_components : int = 2 ,
314+ ) -> tuple [np .ndarray , np .ndarray ]:
315+ """Compute PCA via NumPy SVD. No scikit-learn dependency.
316+
317+ Args:
318+ data: (N, D) array of data points.
319+ n_components: Number of principal components to keep.
320+
321+ Returns:
322+ Tuple of:
323+ - (N, n_components) projected data
324+ - (n_components,) explained variance ratios
325+ """
326+ data = data .astype (np .float64 )
327+ mean = data .mean (axis = 0 )
328+ centered = data - mean
329+
330+ # Economy SVD (full_matrices=False) is O(N * D * min(N,D))
331+ _U , S , Vt = np .linalg .svd (centered , full_matrices = False )
332+
333+ projected = centered @ Vt [:n_components ].T
334+
335+ variance = S ** 2 / max (len (data ) - 1 , 1 )
336+ total_var = variance .sum ()
337+ if total_var > 0 :
338+ explained = variance [:n_components ] / total_var
339+ else :
340+ explained = np .zeros (n_components )
341+
342+ return projected .astype (np .float32 ), explained
343+
344+ def _pca_to_poincare_disk (
345+ self ,
346+ coords_2d : np .ndarray ,
347+ alpha : float = 1.0 ,
348+ ) -> np .ndarray :
349+ """Map 2D Euclidean PCA output into the Poincaré disk via tanh scaling.
350+
351+ tanh(alpha * r) * z / r for each point z with r = ||z||
352+
353+ This is a homeomorphism R^2 -> B^2 (open unit ball).
354+
355+ Args:
356+ coords_2d: (N, 2) PCA coordinates.
357+ alpha: Scaling factor controlling radial compression (default 1.0).
358+
359+ Returns:
360+ (N, 2) Poincaré disk coordinates.
361+ """
362+ max_abs = np .abs (coords_2d ).max ()
363+ if max_abs > _EPS :
364+ coords_2d = coords_2d / max_abs
365+
366+ radii = np .linalg .norm (coords_2d , axis = 1 )
367+ safe_radii = np .where (radii > _EPS , radii , 1.0 )
368+ mapped_radii = np .tanh (alpha * radii )
369+ scale = np .where (radii > _EPS , mapped_radii / safe_radii , 0.0 )
370+ poincare = coords_2d * scale [:, np .newaxis ]
371+ poincare = self ._center_poincare (poincare )
372+ poincare = self ._scale_poincare (poincare , factor = 0.65 )
373+
374+ return poincare .astype (np .float32 )
375+
130376 def project_umap (
131377 self ,
132378 embeddings : np .ndarray ,
0 commit comments