Skip to content

Commit 1587e77

Browse files
Added docstrings to Topeax
1 parent 14551be commit 1587e77

1 file changed

Lines changed: 30 additions & 7 deletions

File tree

turftopic/models/topeax.py

Lines changed: 30 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,11 @@
11
from typing import Optional, Union
22

33
import numpy as np
4-
from scipy.ndimage.filters import maximum_filter
5-
from scipy.ndimage.morphology import binary_erosion, generate_binary_structure
4+
from scipy.ndimage import (
5+
binary_erosion,
6+
generate_binary_structure,
7+
maximum_filter,
8+
)
69
from scipy.stats import gaussian_kde
710
from sklearn.base import BaseEstimator, ClusterMixin
811
from sklearn.feature_extraction.text import CountVectorizer
@@ -19,11 +22,6 @@
1922
from turftopic.models.gmm import GMM, LexicalWordImportance
2023

2124

22-
def minmax(a):
23-
min_a = np.min(a)
24-
return (a - min_a) / (np.max(a) - min_a)
25-
26-
2725
def detect_peaks(image):
2826
# define an 8-connected neighborhood
2927
neighborhood = generate_binary_structure(2, 25)
@@ -58,6 +56,14 @@ def _m_step(self, X, log_resp):
5856

5957

6058
class Peax(ClusterMixin, BaseEstimator):
59+
"""Clustering model based on density peaks.
60+
61+
Parameters
62+
----------
63+
random_state: int, default None
64+
Random seed to use for fitting the Gaussian mixture to peaks.
65+
"""
66+
6167
def __init__(self, random_state: Optional[int] = None):
6268
self.random_state = random_state
6369

@@ -120,6 +126,23 @@ def score(self, X):
120126

121127

122128
class Topeax(GMM):
129+
"""Topic model based on the Peax clustering algorithm.
130+
The algorithm discovers the number of topics automatically, and is based on GMM.
131+
132+
Parameters
133+
----------
134+
encoder: str or SentenceTransformer
135+
Model to encode documents/terms; all-MiniLM-L6-v2 is the default.
136+
vectorizer: CountVectorizer, default None
137+
Vectorizer used for term extraction.
138+
Can be used to prune or filter the vocabulary.
139+
perplexity: int, default 50
140+
Number of neighbours to take into account when running TSNE.
141+
random_state: int, default None
142+
Random state to use so that results are exactly reproducible.
143+
144+
"""
145+
123146
def __init__(
124147
self,
125148
encoder: Union[

0 commit comments

Comments
 (0)