Started implementing hierarchical topic joining in clustering models

x-tabdeveloping · x-tabdeveloping · commit c6ac90b5f574 · 2024-08-27T10:31:39.000+02:00
diff --git a/turftopic/hierarchical.py b/turftopic/hierarchical.py
@@ -116,6 +116,20 @@ class TopicNode:
     document_topic_vector: Optional[np.ndarray] = None
     children: Optional[list[TopicNode]] = None
 
+    @property
+    def components_(self) -> np.ndarray:
+        """Word importances of the node's children in a single array.
+
+        The `word_importance` arrays of the children are stacked along
+        the first axis, in the order of `self.children`.
+
+        Raises
+        ------
+        ValueError
+            If the node is a leaf (`children` is None).
+        """
+        subnodes = self.children
+        if subnodes is None:
+            raise ValueError("Current node is a leaf, no components.")
+        importances = [subnode.word_importance for subnode in subnodes]
+        return np.stack(importances)
+
+    @property
+    def doc_topic_matrix(self) -> np.ndarray:
+        """Document-topic vectors of the node's children in a single array.
+
+        The `document_topic_vector` arrays of the children are stacked in
+        the order of `self.children` and the result is transposed, so for
+        one-dimensional vectors every column belongs to one child.
+
+        Raises
+        ------
+        ValueError
+            If the node is a leaf (`children` is None).
+        """
+        subnodes = self.children
+        if subnodes is None:
+            raise ValueError("Current node is a leaf, no doc_topic_matrix.")
+        vectors = [subnode.document_topic_vector for subnode in subnodes]
+        stacked = np.stack(vectors)
+        return stacked.T
+
     @classmethod
     def create_root(
         cls,
@@ -146,6 +160,14 @@ def create_root(
             children=children,
         )
 
+    def set_path(self, path: tuple[int]):
+        """Assigns a path to this node and re-derives the paths below it.
+
+        Every child receives the path of its parent extended with the
+        child's position among its siblings; this is applied recursively
+        to the whole subtree.
+
+        Parameters
+        ----------
+        path: tuple of int
+            Path to assign to the current node.
+        """
+        self.path = path
+        if self.children is not None:
+            for position, subnode in enumerate(self.children):
+                subnode.set_path((*self.path, position))
+
     @property
     def level(self) -> int:
         """Indicates how deep down the hierarchy the topic is."""
@@ -275,3 +297,29 @@ def divide_children(self, n_subtopics: int, **kwargs):
     def plot_tree(self):
         """Plots hierarchy as an interactive tree in Plotly."""
         return _tree_plot(self)
+
+    def join(self, *subtopics: int, **kwargs):
+        """Joins the given subtopics of this node into one child node.
+
+        The joint node is produced by the model's `join_subtopics()` method
+        and is stored at the position of the lowest subtopic index.
+        Afterwards the paths of this node and its descendants are reassigned.
+
+        Parameters
+        ----------
+        *subtopics: int
+            Indices of the children to join.
+        **kwargs
+            Passed on to the model's `join_subtopics()` method.
+
+        Returns
+        -------
+        TopicNode
+            The current node.
+
+        Raises
+        ------
+        ValueError
+            If the node is a leaf, no subtopics are given, a negative
+            (outlier) index is given, or an index does not exist on the node.
+        AttributeError
+            If the model does not have a `join_subtopics()` method.
+        """
+        # The leaf check has to come first: len(None) would be a TypeError.
+        if self.children is None:
+            raise ValueError(
+                "Current Node is a leaf, children can't be joined."
+            )
+        if not subtopics:
+            raise ValueError("No subtopics were specified for joining.")
+        slot = min(subtopics)
+        max_subtopics = max(subtopics)
+        if slot < 0:
+            raise ValueError(
+                "Outlier topics (-1) cannot be merged with other topics."
+            )
+        # Valid child indices are 0 .. len(children) - 1.
+        if max_subtopics >= len(self.children):
+            raise ValueError(
+                "These subtopics don't exist on the current node."
+            )
+        # Only the attribute lookup is guarded, so that an AttributeError
+        # raised inside join_subtopics() is not misreported as a model
+        # that lacks the method.
+        try:
+            join_subtopics = self.model.join_subtopics
+        except AttributeError as e:
+            raise AttributeError(
+                "Looks like your model is not an agglomerative hierarchical model."
+            ) from e
+        # NOTE(review): only the child at `slot` is replaced, the other
+        # joined children stay in the list - confirm this is intended.
+        self.children[slot] = join_subtopics(subtopics, self, **kwargs)
+        self.set_path(self.path)
+        return self
diff --git a/turftopic/models/cluster.py b/turftopic/models/cluster.py
@@ -1,6 +1,6 @@
 import warnings
 from datetime import datetime
-from typing import Literal, Optional, Union
+from typing import Iterable, Literal, Optional, Union
 
 import numpy as np
 from rich.console import Console
@@ -18,6 +18,7 @@
 from turftopic.feature_importance import (bayes_rule,
                                           cluster_centroid_distance, ctf_idf,
                                           soft_ctf_idf)
+from turftopic.hierarchical import TopicNode
 from turftopic.vectorizer import default_vectorizer
 
 integer_message = """
@@ -230,11 +231,12 @@ def _merge_agglomerative(self, n_reduce_to: int) -> np.ndarray:
             ]
         )
         old_labels = [label for label in self.classes_ if label != -1]
-        new_labels = AgglomerativeClustering(
+        clustering = AgglomerativeClustering(
             n_clusters=n_reduce_to,
             metric="cosine",
             linkage="average",
-        ).fit_predict(interesting_topic_vectors)
+        )
+        new_labels = clustering.fit_predict(interesting_topic_vectors)
         res = {}
         if -1 in self.classes_:
             res[-1] = -1
@@ -254,6 +256,58 @@ def _merge_smallest(self, n_reduce_to: int):
             labels[labels == from_topic] = to_topic
         return labels
 
+    def join_subtopics(
+        self, subtopics: Iterable[int], hierarchy: Optional[TopicNode] = None
+    ) -> TopicNode:
+        """Joins subtopics in a topic hierarchy and returns the joint TopicNode.
+        > Note that this method does not alter the underlying hierarchy!
+        > You will need to use the join() method of a hierarchy for that.
+
+        Parameters
+        ----------
+        subtopics: iterable of int
+            Indices of subtopics to be joined. Duplicates are ignored.
+        hierarchy: TopicNode, default None
+            Hierarchy to join subtopics in, defaults to the root hierarchy of the model.
+
+        Returns
+        -------
+        TopicNode
+            New topic made up of the joint subtopics.
+
+        Raises
+        ------
+        ValueError
+            If the hierarchy is a leaf, no subtopics are given, a negative
+            (outlier) index is given, or an index does not exist on the node.
+        NotImplementedError
+            Always, once validation passes: building the joint node is
+            not implemented yet.
+        """
+        if hierarchy is None:
+            hierarchy = self.hierarchy
+        # Validation is done against the node, not the model, and the leaf
+        # check comes first because len(None) would be a TypeError.
+        if hierarchy.children is None:
+            raise ValueError(
+                "Current Node is a leaf, children can't be joined."
+            )
+        subtopics = sorted(set(subtopics))
+        if not subtopics:
+            raise ValueError("No subtopics were specified for joining.")
+        slot, max_subtopics = subtopics[0], subtopics[-1]
+        if slot < 0:
+            raise ValueError(
+                "Outlier topics (-1) cannot be merged with other topics."
+            )
+        # Valid child indices are 0 .. len(children) - 1.
+        if max_subtopics >= len(hierarchy.children):
+            raise ValueError(
+                "These subtopics don't exist on the current node."
+            )
+        path = (*hierarchy.path, slot)
+        children = [hierarchy[sub] for sub in subtopics]
+        # The property rebuilds the matrix on every access, so read it once.
+        old_doc_topic_matrix = hierarchy.doc_topic_matrix
+        doc_topic_vector = old_doc_topic_matrix[:, subtopics].sum(axis=1)
+        joined = set(subtopics)
+        rest = [
+            column
+            for i_topic, column in enumerate(old_doc_topic_matrix.T)
+            if i_topic not in joined
+        ]
+        # Joint topic first, then the untouched topics in their old order.
+        # NOTE(review): confirm the joint column should not go to `slot`.
+        doc_topic_matrix = np.stack([doc_topic_vector, *rest]).T
+        # TODO: build the joint TopicNode from path, children and
+        # doc_topic_matrix. Raising keeps callers from storing None
+        # in the hierarchy until that is done.
+        raise NotImplementedError("Joining subtopics is not implemented yet.")
+
     def reduce_topics(
         self,
         n_reduce_to: int,
@@ -286,6 +340,7 @@ def reduce_topics(
             self.labels_ = self._merge_smallest(n_reduce_to)
         elif reduction_method == "agglomerative":
             self.labels_ = self._merge_agglomerative(n_reduce_to)
+        self.estimate_components(self.feature_importance)
         return self.labels_
 
     def reset_reduction(self):
@@ -326,6 +381,10 @@ def estimate_components(
             )
         clusters = np.unique(self.labels_)
         self.classes_ = np.sort(clusters)
+        if -1 in self.classes_:
+            # Putting outliers in the last position, so that when you index things,
+            # it works.
+            self.classes_ = np.array([*self.classes_[1:], -1])
         self.topic_sizes_ = np.array(
             [np.sum(self.labels_ == label) for label in self.classes_]
         )