Skip to content

Commit 6ce16cb

Browse files
Merge pull request #49 from x-tabdeveloping/time_slice_fix
Time slice fix
2 parents 2460eb0 + ba88cb3 commit 6ce16cb

5 files changed

Lines changed: 85 additions & 34 deletions

File tree

turftopic/dynamic.py

Lines changed: 45 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from abc import ABC, abstractmethod
2-
from datetime import datetime
2+
from datetime import datetime, timedelta
33
from typing import Any, Optional, Union
44

55
import numpy as np
@@ -17,15 +17,53 @@ def bin_timestamps(
1717
raise TypeError("Timestamps have to be `datetime` objects.")
1818
unix_timestamps = [timestamp.timestamp() for timestamp in timestamps]
1919
if isinstance(bins, list):
20+
if min(timestamps) < min(bins):
21+
raise ValueError(
22+
f"Earliest timestamp ({min(timestamps)}) is not later or the same as first bin edge ({min(bins)})."
23+
)
24+
if max(timestamps) >= max(bins):
25+
raise ValueError(
26+
f"Latest timestamp ({max(timestamps)}) is not earlier than last bin edge ({max(bins)})."
27+
)
2028
unix_bins = [bin.timestamp() for bin in bins]
21-
return np.digitize(unix_timestamps, unix_bins), bins
29+
# Have to subtract one, else it starts from one
30+
return np.digitize(unix_timestamps, unix_bins) - 1, bins
2231
else:
32+
# Adding one day, so that the maximum value is still included.
33+
max_timestamp = max(timestamps) + timedelta(days=1)
2334
unix_bins = np.histogram_bin_edges(unix_timestamps, bins=bins)
35+
unix_bins[-1] = max_timestamp.timestamp()
2436
bins = [datetime.fromtimestamp(ts) for ts in unix_bins]
25-
return np.digitize(unix_timestamps, unix_bins), bins
37+
# Have to subtract one, else it starts from one
38+
return np.digitize(unix_timestamps, unix_bins) - 1, bins
2639

2740

2841
class DynamicTopicModel(ABC):
42+
@staticmethod
43+
def bin_timestamps(
44+
timestamps: list[datetime], bins: Union[int, list[datetime]] = 10
45+
) -> tuple[np.ndarray, list[datetime]]:
46+
"""Bins timestamps based on given bins.
47+
48+
Parameters
49+
----------
50+
timestamps: list[datetime]
51+
List of timestamps for documents.
52+
bins: int or list[datetime], default 10
53+
Time bins to use.
54+
If the bins are an int (N), N equally sized bins are used.
55+
Otherwise they should be bin edges, including the last and first edge.
56+
Bins are inclusive at the lower end and exclusive at the upper (lower <= timestamp < upper).
57+
58+
Returns
59+
-------
60+
time_labels: ndarray of int
61+
Labels for time slice in each document.
62+
bin_edges: list[datetime]
63+
List of edges for time bins.
64+
"""
65+
return bin_timestamps(timestamps, bins)
66+
2967
@abstractmethod
3068
def fit_transform_dynamic(
3169
self,
@@ -79,6 +117,9 @@ def fit_dynamic(
79117
When an `int`, the corpus will be divided into N equal time slices.
80118
When a list, it describes the edges of each time slice including the starting
81119
and final edges of the slices.
120+
121+
Note: The final edge is not included. You might want to add one day to
122+
the last bin edge if it equals the last timestamp.
82123
"""
83124
self.fit_transform_dynamic(raw_documents, timestamps, embeddings, bins)
84125
return self
@@ -273,7 +314,7 @@ def plot_topics_over_time(self, top_k: int = 6):
273314
continue
274315
high = high[np.argsort(-values)]
275316
name_over_time.append(", ".join(vocab[high]))
276-
times = self.time_bin_edges[1:]
317+
times = self.time_bin_edges[:-1]
277318
fig.add_trace(
278319
go.Scatter(
279320
x=times,

turftopic/models/_keynmf.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -254,7 +254,7 @@ def fit_transform_dynamic(
254254
time_bin_edges: list[datetime],
255255
) -> np.ndarray:
256256
self.time_bin_edges = time_bin_edges
257-
n_bins = len(time_bin_edges) + 1
257+
n_bins = len(time_bin_edges) - 1
258258
document_term_matrix = self.vectorize(keywords, fitting=True)
259259
check_non_negative(document_term_matrix, "NMF (input X)")
260260
document_topic_matrix, H = _initialize_nmf(

turftopic/models/cluster.py

Lines changed: 21 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
from sklearn.preprocessing import label_binarize
1313

1414
from turftopic.base import ContextualModel, Encoder
15-
from turftopic.dynamic import DynamicTopicModel, bin_timestamps
15+
from turftopic.dynamic import DynamicTopicModel
1616
from turftopic.feature_importance import (
1717
cluster_centroid_distance,
1818
ctf_idf,
@@ -335,20 +335,26 @@ def fit_transform_dynamic(
335335
embeddings: Optional[np.ndarray] = None,
336336
bins: Union[int, list[datetime]] = 10,
337337
):
338-
time_labels, self.time_bin_edges = bin_timestamps(timestamps, bins)
339-
temporal_components = []
340-
temporal_importances = []
338+
time_labels, self.time_bin_edges = self.bin_timestamps(
339+
timestamps, bins
340+
)
341+
if hasattr(self, "components_"):
342+
doc_topic_matrix = label_binarize(
343+
self.labels_, classes=self.classes_
344+
)
345+
else:
346+
doc_topic_matrix = self.fit_transform(
347+
raw_documents, embeddings=embeddings
348+
)
349+
n_comp, n_vocab = self.components_.shape
350+
n_bins = len(self.time_bin_edges) - 1
351+
self.temporal_components_ = np.zeros(
352+
(n_bins, n_comp, n_vocab), dtype=doc_topic_matrix.dtype
353+
)
354+
self.temporal_importance_ = np.zeros((n_bins, n_comp))
341355
if embeddings is None:
342356
embeddings = self.encoder_.encode(raw_documents)
343-
for i_timebin in np.arange(len(self.time_bin_edges) - 1):
344-
if hasattr(self, "components_"):
345-
doc_topic_matrix = label_binarize(
346-
self.labels_, classes=self.classes_
347-
)
348-
else:
349-
doc_topic_matrix = self.fit_transform(
350-
raw_documents, embeddings=embeddings
351-
)
357+
for i_timebin in np.unique(time_labels):
352358
topic_importances = doc_topic_matrix[time_labels == i_timebin].sum(
353359
axis=0
354360
)
@@ -382,8 +388,6 @@ def fit_transform_dynamic(
382388
mask_terms = t_doc_term_matrix.sum(axis=0).astype(np.float64)
383389
mask_terms[mask_terms == 0] = np.nan
384390
components *= mask_terms
385-
temporal_components.append(components)
386-
temporal_importances.append(topic_importances)
387-
self.temporal_components_ = np.stack(temporal_components)
388-
self.temporal_importance_ = np.stack(temporal_importances)
391+
self.temporal_components_[i_timebin] = components
392+
self.temporal_importance_[i_timebin] = topic_importances
389393
return doc_topic_matrix

turftopic/models/gmm.py

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from sklearn.pipeline import Pipeline, make_pipeline
1111

1212
from turftopic.base import ContextualModel, Encoder
13-
from turftopic.dynamic import DynamicTopicModel, bin_timestamps
13+
from turftopic.dynamic import DynamicTopicModel
1414
from turftopic.feature_importance import soft_ctf_idf
1515
from turftopic.vectorizer import default_vectorizer
1616

@@ -168,7 +168,9 @@ def fit_transform_dynamic(
168168
embeddings: Optional[np.ndarray] = None,
169169
bins: Union[int, list[datetime]] = 10,
170170
):
171-
time_labels, self.time_bin_edges = bin_timestamps(timestamps, bins)
171+
time_labels, self.time_bin_edges = self.bin_timestamps(
172+
timestamps, bins
173+
)
172174
if hasattr(self, "components_"):
173175
doc_topic_matrix = self.transform(
174176
raw_documents, embeddings=embeddings
@@ -178,9 +180,13 @@ def fit_transform_dynamic(
178180
raw_documents, embeddings=embeddings
179181
)
180182
document_term_matrix = self.vectorizer.transform(raw_documents)
181-
temporal_components = []
182-
temporal_importances = []
183-
for i_timebin in np.arange(len(self.time_bin_edges) - 1):
183+
n_comp, n_vocab = self.components_.shape
184+
n_bins = len(self.time_bin_edges) - 1
185+
self.temporal_components_ = np.zeros(
186+
(n_bins, n_comp, n_vocab), dtype=document_term_matrix.dtype
187+
)
188+
self.temporal_importance_ = np.zeros((n_bins, n_comp))
189+
for i_timebin in np.unique(time_labels):
184190
topic_importances = doc_topic_matrix[time_labels == i_timebin].sum(
185191
axis=0
186192
)
@@ -190,8 +196,6 @@ def fit_transform_dynamic(
190196
doc_topic_matrix[time_labels == i_timebin],
191197
document_term_matrix[time_labels == i_timebin], # type: ignore
192198
)
193-
temporal_components.append(components)
194-
temporal_importances.append(topic_importances)
195-
self.temporal_components_ = np.stack(temporal_components)
196-
self.temporal_importance_ = np.stack(temporal_importances)
199+
self.temporal_components_[i_timebin] = components
200+
self.temporal_importance_[i_timebin] = topic_importances
197201
return doc_topic_matrix

turftopic/models/keynmf.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99

1010
from turftopic.base import ContextualModel, Encoder
1111
from turftopic.data import TopicData
12-
from turftopic.dynamic import DynamicTopicModel, bin_timestamps
12+
from turftopic.dynamic import DynamicTopicModel
1313
from turftopic.models._keynmf import KeywordExtractor, KeywordNMF
1414

1515

@@ -248,7 +248,9 @@ def fit_transform_dynamic(
248248
keywords = self.extract_keywords(
249249
raw_documents, embeddings=embeddings
250250
)
251-
time_labels, self.time_bin_edges = bin_timestamps(timestamps, bins)
251+
time_labels, self.time_bin_edges = self.bin_timestamps(
252+
timestamps, bins
253+
)
252254
doc_topic_matrix = self.model.fit_transform_dynamic(
253255
keywords, time_labels, self.time_bin_edges
254256
)
@@ -300,7 +302,7 @@ def partial_fit_dynamic(
300302
)
301303
else:
302304
self.time_bin_edges = bins
303-
time_labels, self.time_bin_edges = bin_timestamps(
305+
time_labels, self.time_bin_edges = self.bin_timestamps(
304306
timestamps, self.time_bin_edges
305307
)
306308
if keywords is None:

0 commit comments

Comments
 (0)