Skip to content

Commit 6ce16cb

Browse files
Merge pull request #49 from x-tabdeveloping/time_slice_fix
Time slice fix
2 parents 2460eb0 + ba88cb3 commit 6ce16cb

5 files changed

Lines changed: 85 additions & 34 deletions

File tree

turftopic/dynamic.py

Lines changed: 45 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from abc import ABC, abstractmethod
2-
from datetime import datetime
2+
from datetime import datetime, timedelta
33
from typing import Any, Optional, Union
44

55
import numpy as np
@@ -17,15 +17,53 @@ def bin_timestamps(
1717
raise TypeError("Timestamps have to be `datetime` objects.")
1818
unix_timestamps = [timestamp.timestamp() for timestamp in timestamps]
1919
if isinstance(bins, list):
20+
if min(timestamps) < min(bins):
21+
raise ValueError(
22+
f"Earliest timestamp ({min(timestamps)}) is not later or the same as first bin edge ({min(bins)})."
23+
)
24+
if max(timestamps) >= max(bins):
25+
raise ValueError(
26+
f"Latest timestamp ({max(timestamps)}) is not earlier than last bin edge ({max(bins)})."
27+
)
2028
unix_bins = [bin.timestamp() for bin in bins]
21-
return np.digitize(unix_timestamps, unix_bins), bins
29+
# Have to subtract one, else it starts from one
30+
return np.digitize(unix_timestamps, unix_bins) - 1, bins
2231
else:
32+
# Adding one day, so that the maximum value is still included.
33+
max_timestamp = max(timestamps) + timedelta(days=1)
2334
unix_bins = np.histogram_bin_edges(unix_timestamps, bins=bins)
35+
unix_bins[-1] = max_timestamp.timestamp()
2436
bins = [datetime.fromtimestamp(ts) for ts in unix_bins]
25-
return np.digitize(unix_timestamps, unix_bins), bins
37+
# Have to subtract one, else it starts from one
38+
return np.digitize(unix_timestamps, unix_bins) - 1, bins
2639

2740

2841
class DynamicTopicModel(ABC):
42+
@staticmethod
43+
def bin_timestamps(
44+
timestamps: list[datetime], bins: Union[int, list[datetime]] = 10
45+
) -> tuple[np.ndarray, list[datetime]]:
46+
"""Bins timestamps based on given bins.
47+
48+
Parameters
49+
----------
50+
timestamps: list[datetime]
51+
List of timestamps for documents.
52+
bins: int or list[datetime], default 10
53+
Time bins to use.
54+
If the bins are an int (N), N equally sized bins are used.
55+
Otherwise they should be bin edges, including the last and first edge.
56+
Bins are inclusive at the lower end and exclusive at the upper (lower <= timestamp < upper).
57+
58+
Returns
59+
-------
60+
time_labels: ndarray of int
61+
Labels for time slice in each document.
62+
bin_edges: list[datetime]
63+
List of edges for time bins.
64+
"""
65+
return bin_timestamps(timestamps, bins)
66+
2967
@abstractmethod
3068
def fit_transform_dynamic(
3169
self,
@@ -79,6 +117,9 @@ def fit_dynamic(
79117
When an `int`, the corpus will be divided into N equal time slices.
80118
When a list, it describes the edges of each time slice including the starting
81119
and final edges of the slices.
120+
121+
Note: The final edge is not included. You might want to add one day to
122+
the last bin edge if it equals the last timestamp.
82123
"""
83124
self.fit_transform_dynamic(raw_documents, timestamps, embeddings, bins)
84125
return self
@@ -273,7 +314,7 @@ def plot_topics_over_time(self, top_k: int = 6):
273314
continue
274315
high = high[np.argsort(-values)]
275316
name_over_time.append(", ".join(vocab[high]))
276-
times = self.time_bin_edges[1:]
317+
times = self.time_bin_edges[:-1]
277318
fig.add_trace(
278319
go.Scatter(
279320
x=times,

turftopic/models/_keynmf.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -254,7 +254,7 @@ def fit_transform_dynamic(
254254
time_bin_edges: list[datetime],
255255
) -> np.ndarray:
256256
self.time_bin_edges = time_bin_edges
257-
n_bins = len(time_bin_edges) + 1
257+
n_bins = len(time_bin_edges) - 1
258258
document_term_matrix = self.vectorize(keywords, fitting=True)
259259
check_non_negative(document_term_matrix, "NMF (input X)")
260260
document_topic_matrix, H = _initialize_nmf(

turftopic/models/cluster.py

Lines changed: 21 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
from sklearn.preprocessing import label_binarize
1313

1414
from turftopic.base import ContextualModel, Encoder
15-
from turftopic.dynamic import DynamicTopicModel, bin_timestamps
15+
from turftopic.dynamic import DynamicTopicModel
1616
from turftopic.feature_importance import (
1717
cluster_centroid_distance,
1818
ctf_idf,
@@ -335,20 +335,26 @@ def fit_transform_dynamic(
335335
embeddings: Optional[np.ndarray] = None,
336336
bins: Union[int, list[datetime]] = 10,
337337
):
338-
time_labels, self.time_bin_edges = bin_timestamps(timestamps, bins)
339-
temporal_components = []
340-
temporal_importances = []
338+
time_labels, self.time_bin_edges = self.bin_timestamps(
339+
timestamps, bins
340+
)
341+
if hasattr(self, "components_"):
342+
doc_topic_matrix = label_binarize(
343+
self.labels_, classes=self.classes_
344+
)
345+
else:
346+
doc_topic_matrix = self.fit_transform(
347+
raw_documents, embeddings=embeddings
348+
)
349+
n_comp, n_vocab = self.components_.shape
350+
n_bins = len(self.time_bin_edges) - 1
351+
self.temporal_components_ = np.zeros(
352+
(n_bins, n_comp, n_vocab), dtype=doc_topic_matrix.dtype
353+
)
354+
self.temporal_importance_ = np.zeros((n_bins, n_comp))
341355
if embeddings is None:
342356
embeddings = self.encoder_.encode(raw_documents)
343-
for i_timebin in np.arange(len(self.time_bin_edges) - 1):
344-
if hasattr(self, "components_"):
345-
doc_topic_matrix = label_binarize(
346-
self.labels_, classes=self.classes_
347-
)
348-
else:
349-
doc_topic_matrix = self.fit_transform(
350-
raw_documents, embeddings=embeddings
351-
)
357+
for i_timebin in np.unique(time_labels):
352358
topic_importances = doc_topic_matrix[time_labels == i_timebin].sum(
353359
axis=0
354360
)
@@ -382,8 +388,6 @@ def fit_transform_dynamic(
382388
mask_terms = t_doc_term_matrix.sum(axis=0).astype(np.float64)
383389
mask_terms[mask_terms == 0] = np.nan
384390
components *= mask_terms
385-
temporal_components.append(components)
386-
temporal_importances.append(topic_importances)
387-
self.temporal_components_ = np.stack(temporal_components)
388-
self.temporal_importance_ = np.stack(temporal_importances)
391+
self.temporal_components_[i_timebin] = components
392+
self.temporal_importance_[i_timebin] = topic_importances
389393
return doc_topic_matrix

turftopic/models/gmm.py

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from sklearn.pipeline import Pipeline, make_pipeline
1111

1212
from turftopic.base import ContextualModel, Encoder
13-
from turftopic.dynamic import DynamicTopicModel, bin_timestamps
13+
from turftopic.dynamic import DynamicTopicModel
1414
from turftopic.feature_importance import soft_ctf_idf
1515
from turftopic.vectorizer import default_vectorizer
1616

@@ -168,7 +168,9 @@ def fit_transform_dynamic(
168168
embeddings: Optional[np.ndarray] = None,
169169
bins: Union[int, list[datetime]] = 10,
170170
):
171-
time_labels, self.time_bin_edges = bin_timestamps(timestamps, bins)
171+
time_labels, self.time_bin_edges = self.bin_timestamps(
172+
timestamps, bins
173+
)
172174
if hasattr(self, "components_"):
173175
doc_topic_matrix = self.transform(
174176
raw_documents, embeddings=embeddings
@@ -178,9 +180,13 @@ def fit_transform_dynamic(
178180
raw_documents, embeddings=embeddings
179181
)
180182
document_term_matrix = self.vectorizer.transform(raw_documents)
181-
temporal_components = []
182-
temporal_importances = []
183-
for i_timebin in np.arange(len(self.time_bin_edges) - 1):
183+
n_comp, n_vocab = self.components_.shape
184+
n_bins = len(self.time_bin_edges) - 1
185+
self.temporal_components_ = np.zeros(
186+
(n_bins, n_comp, n_vocab), dtype=document_term_matrix.dtype
187+
)
188+
self.temporal_importance_ = np.zeros((n_bins, n_comp))
189+
for i_timebin in np.unique(time_labels):
184190
topic_importances = doc_topic_matrix[time_labels == i_timebin].sum(
185191
axis=0
186192
)
@@ -190,8 +196,6 @@ def fit_transform_dynamic(
190196
doc_topic_matrix[time_labels == i_timebin],
191197
document_term_matrix[time_labels == i_timebin], # type: ignore
192198
)
193-
temporal_components.append(components)
194-
temporal_importances.append(topic_importances)
195-
self.temporal_components_ = np.stack(temporal_components)
196-
self.temporal_importance_ = np.stack(temporal_importances)
199+
self.temporal_components_[i_timebin] = components
200+
self.temporal_importance_[i_timebin] = topic_importances
197201
return doc_topic_matrix

turftopic/models/keynmf.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99

1010
from turftopic.base import ContextualModel, Encoder
1111
from turftopic.data import TopicData
12-
from turftopic.dynamic import DynamicTopicModel, bin_timestamps
12+
from turftopic.dynamic import DynamicTopicModel
1313
from turftopic.models._keynmf import KeywordExtractor, KeywordNMF
1414

1515

@@ -248,7 +248,9 @@ def fit_transform_dynamic(
248248
keywords = self.extract_keywords(
249249
raw_documents, embeddings=embeddings
250250
)
251-
time_labels, self.time_bin_edges = bin_timestamps(timestamps, bins)
251+
time_labels, self.time_bin_edges = self.bin_timestamps(
252+
timestamps, bins
253+
)
252254
doc_topic_matrix = self.model.fit_transform_dynamic(
253255
keywords, time_labels, self.time_bin_edges
254256
)
@@ -300,7 +302,7 @@ def partial_fit_dynamic(
300302
)
301303
else:
302304
self.time_bin_edges = bins
303-
time_labels, self.time_bin_edges = bin_timestamps(
305+
time_labels, self.time_bin_edges = self.bin_timestamps(
304306
timestamps, self.time_bin_edges
305307
)
306308
if keywords is None:

0 commit comments

Comments
 (0)