|
1 | 1 | from typing import Optional, Union |
2 | 2 |
|
3 | 3 | import numpy as np |
4 | | -from scipy.ndimage.filters import maximum_filter |
5 | | -from scipy.ndimage.morphology import binary_erosion, generate_binary_structure |
| 4 | +from scipy.ndimage import ( |
| 5 | + binary_erosion, |
| 6 | + generate_binary_structure, |
| 7 | + maximum_filter, |
| 8 | +) |
6 | 9 | from scipy.stats import gaussian_kde |
7 | 10 | from sklearn.base import BaseEstimator, ClusterMixin |
8 | 11 | from sklearn.feature_extraction.text import CountVectorizer |
|
19 | 22 | from turftopic.models.gmm import GMM, LexicalWordImportance |
20 | 23 |
|
21 | 24 |
|
22 | | -def minmax(a): |
23 | | - min_a = np.min(a) |
24 | | - return (a - min_a) / (np.max(a) - min_a) |
25 | | - |
26 | | - |
27 | 25 | def detect_peaks(image): |
28 | 26 | # define an 8-connected neighborhood |
29 | 27 | neighborhood = generate_binary_structure(2, 25) |
@@ -58,6 +56,14 @@ def _m_step(self, X, log_resp): |
58 | 56 |
|
59 | 57 |
|
60 | 58 | class Peax(ClusterMixin, BaseEstimator): |
| 59 | + """Clustering model based on density peaks. |
| 60 | +
|
| 61 | + Parameters |
| 62 | + ---------- |
| 63 | + random_state: int, default None |
| 64 | + Random seed to use for fitting the Gaussian mixture to peaks. |
| 65 | + """ |
| 66 | + |
61 | 67 | def __init__(self, random_state: Optional[int] = None): |
62 | 68 | self.random_state = random_state |
63 | 69 |
|
@@ -120,6 +126,23 @@ def score(self, X): |
120 | 126 |
|
121 | 127 |
|
122 | 128 | class Topeax(GMM): |
| 129 | + """Topic model based on the Peax clustering algorithm. |
| 130 | + The algorithm discovers the number of topics automatically and is based on a Gaussian mixture model (GMM). |
| 131 | +
|
| 132 | + Parameters |
| 133 | + ---------- |
| 134 | + encoder: str or SentenceTransformer |
| 135 | + Model to encode documents/terms; all-MiniLM-L6-v2 is the default. |
| 136 | + vectorizer: CountVectorizer, default None |
| 137 | + Vectorizer used for term extraction. |
| 138 | + Can be used to prune or filter the vocabulary. |
| 139 | + perplexity: int, default 50 |
| 140 | + Number of neighbours to take into account when running TSNE. |
| 141 | + random_state: int, default None |
| 142 | + Random state to use so that results are exactly reproducible. |
| 143 | +
|
| 144 | + """ |
| 145 | + |
123 | 146 | def __init__( |
124 | 147 | self, |
125 | 148 | encoder: Union[ |
|
0 commit comments