11import numpy as np
22import scipy .sparse as spr
33from sklearn .metrics import pairwise_distances
4- from sklearn .preprocessing import normalize
54
65
76def cluster_centroid_distance (
87 cluster_centroids : np .ndarray ,
98 vocab_embeddings : np .ndarray ,
109 metric = "cosine" ,
1110) -> np .ndarray :
11+ """Computes feature importances based on distances between
12+ topic vectors (cluster centroids) and term embeddings
13+
14+ Parameters
15+ ----------
16+ cluster_centroids: np.ndarray
17+ Coordinates of cluster centroids of shape (n_topics, embedding_size)
18+ vocab_embeddings: np.ndarray
19+ Term embeddings of shape (vocab_size, embedding_size)
20+ metric: str, default 'cosine'
21+ Metric used to compute distance from centroid.
22+ See documentation for sklearn.metrics.pairwise.distance_metrics
23+ for valid values.
24+
25+ Returns
26+ -------
27+ ndarray of shape (n_topics, vocab_size)
28+ Term importance matrix.
29+ """
1230 distances = pairwise_distances (
1331 cluster_centroids , vocab_embeddings , metric = metric
1432 )
@@ -23,6 +41,20 @@ def cluster_centroid_distance(
2341def soft_ctf_idf (
2442 doc_topic_matrix : np .ndarray , doc_term_matrix : spr .csr_matrix
2543) -> np .ndarray :
44+ """Computes feature importances using Soft C-TF-IDF
45+
46+ Parameters
47+ ----------
48+ doc_topic_matrix: np.ndarray
49+ Document-topic matrix of shape (n_documents, n_topics)
50+ doc_term_matrix: spr.csr_matrix
51+ Document-term matrix of shape (n_documents, vocab_size)
52+
53+ Returns
54+ -------
55+ ndarray of shape (n_topics, vocab_size)
56+ Term importance matrix.
57+ """
2658 eps = np .finfo (float ).eps
2759 term_importance = doc_topic_matrix .T @ doc_term_matrix
2860 overall_in_topic = np .abs (term_importance ).sum (axis = 1 )
@@ -36,6 +68,20 @@ def soft_ctf_idf(
3668def ctf_idf (
3769 doc_topic_matrix : np .ndarray , doc_term_matrix : spr .csr_matrix
3870) -> np .ndarray :
71+ """Computes feature importances using standard C-TF-IDF
72+
73+ Parameters
74+ ----------
75+ doc_topic_matrix: np.ndarray
76+ Document-topic matrix of shape (n_documents, n_topics)
77+ doc_term_matrix: spr.csr_matrix
78+ Document-term matrix of shape (n_documents, vocab_size)
79+
80+ Returns
81+ -------
82+ ndarray of shape (n_topics, vocab_size)
83+ Term importance matrix.
84+ """
3985 labels = np .argmax (doc_topic_matrix , axis = 1 )
4086 n_topics = doc_topic_matrix .shape [1 ]
4187 components = []
0 commit comments