Skip to content

Commit ef4c28d

Browse files
Revert "Started implementing hierarchical topic joining in clustering models"
This reverts commit c6ac90b.
1 parent c6ac90b commit ef4c28d

2 files changed

Lines changed: 3 additions & 110 deletions

File tree

turftopic/hierarchical.py

Lines changed: 0 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -116,20 +116,6 @@ class TopicNode:
116116
document_topic_vector: Optional[np.ndarray] = None
117117
children: Optional[list[TopicNode]] = None
118118

119-
@property
def components_(self) -> np.ndarray:
    """Word importances of the direct children, stacked into one array.

    Returns
    -------
    np.ndarray
        Array whose i-th entry along the first axis is the
        ``word_importance`` of the i-th child.

    Raises
    ------
    ValueError
        If the node has no children (``children`` is None or empty).
    """
    # An empty child list is treated like a leaf: np.stack([]) would
    # otherwise fail with an unrelated "need at least one array" error.
    if not self.children:
        raise ValueError("Current node is a leaf, no components.")
    return np.stack([child.word_importance for child in self.children])
124-
125-
@property
def doc_topic_matrix(self) -> np.ndarray:
    """Document-topic vectors of the direct children, one column per child.

    Returns
    -------
    np.ndarray
        The children's ``document_topic_vector`` attributes stacked and
        transposed, so that column i belongs to the i-th child.

    Raises
    ------
    ValueError
        If the node has no children (``children`` is None or empty).
    """
    # An empty child list is treated like a leaf: np.stack([]) would
    # otherwise fail with an unrelated "need at least one array" error.
    if not self.children:
        raise ValueError("Current node is a leaf, no doc_topic_matrix.")
    # NOTE(review): assumes every child has a document_topic_vector set
    # (the field is Optional) - confirm against how nodes are built.
    stacked = np.stack(
        [child.document_topic_vector for child in self.children]
    )
    return stacked.T
132-
133119
@classmethod
134120
def create_root(
135121
cls,
@@ -160,14 +146,6 @@ def create_root(
160146
children=children,
161147
)
162148

163-
def set_path(self, path: tuple[int, ...]):
    """Sets path for current node and all children accordingly.

    Parameters
    ----------
    path: tuple of int
        Path to assign to this node. It may have any length, since each
        level of the hierarchy appends one index.
    """
    self.path = path
    if self.children is None:
        return
    # Every child gets the parent's path extended by its own position.
    for i_child, child in enumerate(self.children):
        child.set_path((*path, i_child))
170-
171149
@property
172150
def level(self) -> int:
173151
"""Indicates how deep down the hierarchy the topic is."""
@@ -297,29 +275,3 @@ def divide_children(self, n_subtopics: int, **kwargs):
297275
def plot_tree(self):
    """Draws the hierarchy as an interactive tree using Plotly."""
    tree_figure = _tree_plot(self)
    return tree_figure
300-
301-
def join(self, *subtopics: int, **kwargs):
    """Joins the given subtopics of this node into a single child.

    The joint node produced by the model is stored at the position of the
    lowest subtopic index, then the paths of this node and all of its
    children are refreshed.

    Parameters
    ----------
    *subtopics: int
        Indices of the children to join.
    **kwargs
        Passed on unchanged to the model's ``join_subtopics`` method.

    Returns
    -------
    The current node, so that calls can be chained.

    Raises
    ------
    ValueError
        If the node is a leaf, no subtopic was given, an index does not
        exist on this node, or an outlier (negative) index was given.
    AttributeError
        If the underlying model has no ``join_subtopics`` method.
    """
    # The leaf check has to come first: len(None) would raise a TypeError
    # before the intended error could be reported.
    if self.children is None:
        raise ValueError(
            "Current Node is a leaf, children can't be joined."
        )
    if not subtopics:
        raise ValueError("At least one subtopic has to be specified.")
    # Valid child indices are 0 .. len(children) - 1.
    if max(subtopics) >= len(self.children):
        raise ValueError(
            "These subtopics don't exist on the current node."
        )
    slot = min(subtopics)
    if slot < 0:
        raise ValueError(
            "Outlier topics (-1) cannot be merged with other topics."
        )
    # Only the attribute lookup is guarded, so that AttributeErrors raised
    # inside the model are not mislabelled as a missing capability.
    try:
        join_subtopics = self.model.join_subtopics
    except AttributeError as e:
        raise AttributeError(
            "Looks like your model is not an agglomerative hierarchical model."
        ) from e
    # NOTE(review): only the slot at the lowest index is replaced; the other
    # joined children stay in the list - confirm whether they should be removed.
    self.children[slot] = join_subtopics(subtopics, self, **kwargs)
    self.set_path(self.path)
    return self

turftopic/models/cluster.py

Lines changed: 3 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import warnings
22
from datetime import datetime
3-
from typing import Iterable, Literal, Optional, Union
3+
from typing import Literal, Optional, Union
44

55
import numpy as np
66
from rich.console import Console
@@ -18,7 +18,6 @@
1818
from turftopic.feature_importance import (bayes_rule,
1919
cluster_centroid_distance, ctf_idf,
2020
soft_ctf_idf)
21-
from turftopic.hierarchical import TopicNode
2221
from turftopic.vectorizer import default_vectorizer
2322

2423
integer_message = """
@@ -231,12 +230,11 @@ def _merge_agglomerative(self, n_reduce_to: int) -> np.ndarray:
231230
]
232231
)
233232
old_labels = [label for label in self.classes_ if label != -1]
234-
clustering = AgglomerativeClustering(
233+
new_labels = AgglomerativeClustering(
235234
n_clusters=n_reduce_to,
236235
metric="cosine",
237236
linkage="average",
238-
)
239-
new_labels = clustering.fit_predict(interesting_topic_vectors)
237+
).fit_predict(interesting_topic_vectors)
240238
res = {}
241239
if -1 in self.classes_:
242240
res[-1] = -1
@@ -256,58 +254,6 @@ def _merge_smallest(self, n_reduce_to: int):
256254
labels[labels == from_topic] = to_topic
257255
return labels
258256

259-
def join_subtopics(
    self,
    subtopics: Iterable[int],
    hierarchy: Optional["TopicNode"] = None,
) -> "TopicNode":
    """Joins subtopics in a topic hierarchy and returns the joint TopicNode.
    > Note that this method does not alter the underlying hierarchy!
    > You will need to use the join() method of a hierarchy for that.

    NOTE: the implementation is unfinished. Inputs are validated and the
    joint document-topic matrix is prepared, but constructing the joint
    node is still a TODO, so the method currently always ends by raising
    NotImplementedError for valid input.

    Parameters
    ----------
    subtopics: iterable of int
        Indices of subtopics to be joined.
    hierarchy: TopicNode, default None
        Hierarchy to join subtopics in, defaults to the root hierarchy of the model.

    Returns
    -------
    TopicNode
        New topic made up of the joined subtopics.

    Raises
    ------
    ValueError
        If the hierarchy is a leaf, no subtopic was given, an index does
        not exist on the hierarchy, or an outlier (negative) index was given.
    NotImplementedError
        Always, for valid input, until the joint node construction is written.
    """
    if hierarchy is None:
        hierarchy = self.hierarchy
    # All checks refer to the hierarchy being joined, not to the model.
    if hierarchy.children is None:
        raise ValueError(
            "Current Node is a leaf, children can't be joined."
        )
    # Deduplicate and sort so the result does not depend on set ordering.
    subtopics = sorted(set(subtopics))
    if not subtopics:
        raise ValueError("At least one subtopic has to be specified.")
    # Valid child indices are 0 .. len(children) - 1.
    if subtopics[-1] >= len(hierarchy.children):
        raise ValueError(
            "These subtopics don't exist on the current node."
        )
    slot = subtopics[0]
    if slot < 0:
        raise ValueError(
            "Outlier topics (-1) cannot be merged with other topics."
        )
    # Ingredients for the joint node; they are not consumed yet (see TODO).
    path = (*hierarchy.path, slot)
    children = [hierarchy.children[sub] for sub in subtopics]
    full_matrix = hierarchy.doc_topic_matrix
    joint_vector = full_matrix[:, subtopics].sum(axis=1)
    merged = set(subtopics)
    rest = [
        topic_vector
        for i_topic, topic_vector in enumerate(full_matrix.T)
        if i_topic not in merged
    ]
    # Joint topic first, followed by every topic that was not merged.
    doc_topic_matrix = np.stack([joint_vector, *rest]).T
    # TODO: build the joint TopicNode from path, children and doc_topic_matrix.
    raise NotImplementedError(
        "Joining subtopics is not implemented yet."
    )
310-
311257
def reduce_topics(
312258
self,
313259
n_reduce_to: int,
@@ -340,7 +286,6 @@ def reduce_topics(
340286
self.labels_ = self._merge_smallest(n_reduce_to)
341287
elif reduction_method == "agglomerative":
342288
self.labels_ = self._merge_agglomerative(n_reduce_to)
343-
self.estimate_components(self.feature_importance)
344289
return self.labels_
345290

346291
def reset_reduction(self):
@@ -381,10 +326,6 @@ def estimate_components(
381326
)
382327
clusters = np.unique(self.labels_)
383328
self.classes_ = np.sort(clusters)
384-
if -1 in self.classes_:
385-
# Putting outliers in the last position, so that when you index things,
386-
# it works.
387-
self.classes_ = np.array([*self.classes_[1:], -1])
388329
self.topic_sizes_ = np.array(
389330
[np.sum(self.labels_ == label) for label in self.classes_]
390331
)

0 commit comments

Comments
 (0)