Skip to content

Commit 7d0deab

Browse files
Fixed timestamp binning and added static method to dynamic topic models
1 parent 2460eb0 commit 7d0deab

1 file changed

Lines changed: 44 additions & 3 deletions

File tree

turftopic/dynamic.py

Lines changed: 44 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from abc import ABC, abstractmethod
2-
from datetime import datetime
2+
from datetime import datetime, timedelta
33
from typing import Any, Optional, Union
44

55
import numpy as np
@@ -17,15 +17,53 @@ def bin_timestamps(
1717
raise TypeError("Timestamps have to be `datetime` objects.")
1818
unix_timestamps = [timestamp.timestamp() for timestamp in timestamps]
1919
if isinstance(bins, list):
20+
if min(timestamps) < min(bins):
21+
raise ValueError(
22+
f"Earliest timestamp ({min(timestamps)}) is not later or the same as first bin edge ({min(bins)})."
23+
)
24+
if max(timestamps) >= max(bins):
25+
raise ValueError(
26+
f"Latest timestamp ({max(timestamps)}) is not earlier than last bin edge ({max(bins)})."
27+
)
2028
unix_bins = [bin.timestamp() for bin in bins]
21-
return np.digitize(unix_timestamps, unix_bins), bins
29+
# Have to subtract one, else the labels start from one
30+
return np.digitize(unix_timestamps, unix_bins) - 1, bins
2231
else:
32+
# Adding one day, so that the maximum value is still included.
33+
max_timestamp = max(timestamps) + timedelta(days=1)
2334
unix_bins = np.histogram_bin_edges(unix_timestamps, bins=bins)
35+
unix_bins[-1] = max_timestamp.timestamp()
2436
bins = [datetime.fromtimestamp(ts) for ts in unix_bins]
25-
return np.digitize(unix_timestamps, unix_bins), bins
37+
# Have to subtract one, else the labels start from one
38+
return np.digitize(unix_timestamps, unix_bins) - 1, bins
2639

2740

2841
class DynamicTopicModel(ABC):
42+
@staticmethod
43+
def bin_timestamps(
44+
timestamps: list[datetime], bins: Union[int, list[datetime]] = 10
45+
) -> tuple[np.ndarray, list[datetime]]:
46+
"""Bins timestamps based on given bins.
47+
48+
Parameters
49+
----------
50+
timestamps: list[datetime]
51+
List of timestamps for documents.
52+
bins: int or list[datetime], default 10
53+
Time bins to use.
54+
If the bins are an int (N), N equally sized bins are used.
55+
Otherwise they should be bin edges, including the last and first edge.
56+
Bins are inclusive at the lower end and exclusive at the upper (lower <= timestamp < upper).
57+
58+
Returns
59+
-------
60+
time_labels: ndarray of int
61+
Index of the time slice that each document falls into.
62+
bin_edges: list[datetime]
63+
List of edges for time bins.
64+
"""
65+
return bin_timestamps(timestamps, bins)
66+
2967
@abstractmethod
3068
def fit_transform_dynamic(
3169
self,
@@ -79,6 +117,9 @@ def fit_dynamic(
79117
When an `int`, the corpus will be divided into N equal time slices.
80118
When a list, it describes the edges of each time slice including the starting
81119
and final edges of the slices.
120+
121+
Note: The final edge is not included. You might want to add one day to
122+
the last bin edge if it equals the last timestamp.
82123
"""
83124
self.fit_transform_dynamic(raw_documents, timestamps, embeddings, bins)
84125
return self

0 commit comments

Comments
 (0)