Skip to content

Commit f13a4ff

Browse files
Merge pull request #21 from rbroc/docstrings
add/fix docstrings, mainly for feature importance functions
2 parents bd126c0 + fb3d0ee commit f13a4ff

2 files changed

Lines changed: 48 additions & 2 deletions

File tree

turftopic/base.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -161,7 +161,7 @@ def print_highest_ranking_documents(
161161
ID of the topic to display.
162162
raw_documents: list of str
163163
List of documents to consider.
164-
document_topic_matrix: ndarray of shape (n_topics, n_topics), optional
164+
document_topic_matrix: ndarray of shape (n_documents, n_topics), optional
165165
Document topic matrix to use. This is useful for transductive methods,
166166
as they cannot infer topics from text.
167167
top_k: int, default 5

turftopic/feature_importance.py

Lines changed: 47 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,32 @@
11
import numpy as np
22
import scipy.sparse as spr
33
from sklearn.metrics import pairwise_distances
4-
from sklearn.preprocessing import normalize
54

65

76
def cluster_centroid_distance(
87
cluster_centroids: np.ndarray,
98
vocab_embeddings: np.ndarray,
109
metric="cosine",
1110
) -> np.ndarray:
11+
"""Computes feature importances based on distances between
12+
topic vectors (cluster centroids) and term embeddings
13+
14+
Parameters
15+
----------
16+
cluster_centroids: np.ndarray
17+
Coordinates of cluster centroids of shape (n_topics, embedding_size)
18+
vocab_embeddings: np.ndarray
19+
Term embeddings of shape (vocab_size, embedding_size)
20+
metric: str, default 'cosine'
21+
Metric used to compute distance from centroid.
22+
See documentation for sklearn.metrics.pairwise.distance_metrics
23+
for valid values.
24+
25+
Returns
26+
-------
27+
ndarray of shape (n_topics, vocab_size)
28+
Term importance matrix.
29+
"""
1230
distances = pairwise_distances(
1331
cluster_centroids, vocab_embeddings, metric=metric
1432
)
@@ -23,6 +41,20 @@ def cluster_centroid_distance(
2341
def soft_ctf_idf(
2442
doc_topic_matrix: np.ndarray, doc_term_matrix: spr.csr_matrix
2543
) -> np.ndarray:
44+
"""Computes feature importances using Soft C-TF-IDF
45+
46+
Parameters
47+
----------
48+
doc_topic_matrix: np.ndarray
49+
Document-topic matrix of shape (n_documents, n_topics)
50+
doc_term_matrix: spr.csr_matrix
51+
Document-term matrix of shape (n_documents, vocab_size)
52+
53+
Returns
54+
-------
55+
ndarray of shape (n_topics, vocab_size)
56+
Term importance matrix.
57+
"""
2658
eps = np.finfo(float).eps
2759
term_importance = doc_topic_matrix.T @ doc_term_matrix
2860
overall_in_topic = np.abs(term_importance).sum(axis=1)
@@ -36,6 +68,20 @@ def soft_ctf_idf(
3668
def ctf_idf(
3769
doc_topic_matrix: np.ndarray, doc_term_matrix: spr.csr_matrix
3870
) -> np.ndarray:
71+
"""Computes feature importances using standard C-TF-IDF
72+
73+
Parameters
74+
----------
75+
doc_topic_matrix: np.ndarray
76+
Document-topic matrix of shape (n_documents, n_topics)
77+
doc_term_matrix: spr.csr_matrix
78+
Document-term matrix of shape (n_documents, vocab_size)
79+
80+
Returns
81+
-------
82+
ndarray of shape (n_topics, vocab_size)
83+
Term importance matrix.
84+
"""
3985
labels = np.argmax(doc_topic_matrix, axis=1)
4086
n_topics = doc_topic_matrix.shape[1]
4187
components = []

0 commit comments

Comments
 (0)