11from abc import ABC , abstractmethod
2- from datetime import datetime
2+ from datetime import datetime , timedelta
33from typing import Any , Optional , Union
44
55import numpy as np
@@ -17,15 +17,53 @@ def bin_timestamps(
1717 raise TypeError ("Timestamps have to be `datetime` objects." )
1818 unix_timestamps = [timestamp .timestamp () for timestamp in timestamps ]
1919 if isinstance (bins , list ):
20+ if min (timestamps ) < min (bins ):
21+ raise ValueError (
22+ f"Earliest timestamp ({ min (timestamps )} ) is not later or the same as first bin edge ({ min (bins )} )."
23+ )
24+ if max (timestamps ) >= max (bins ):
25+ raise ValueError (
26+ f"Latest timestamp ({ max (timestamps )} ) is not earlier than last bin edge ({ max (bins )} )."
27+ )
2028 unix_bins = [bin .timestamp () for bin in bins ]
21- return np .digitize (unix_timestamps , unix_bins ), bins
29+ # Have to subtract one, because np.digitize labels start from one
30+ return np .digitize (unix_timestamps , unix_bins ) - 1 , bins
2231 else :
32+ # Adding one day, so that the maximum value is still included.
33+ max_timestamp = max (timestamps ) + timedelta (days = 1 )
2334 unix_bins = np .histogram_bin_edges (unix_timestamps , bins = bins )
35+ unix_bins [- 1 ] = max_timestamp .timestamp ()
2436 bins = [datetime .fromtimestamp (ts ) for ts in unix_bins ]
25- return np .digitize (unix_timestamps , unix_bins ), bins
37+ # Have to subtract one, because np.digitize labels start from one
38+ return np .digitize (unix_timestamps , unix_bins ) - 1 , bins
2639
2740
2841class DynamicTopicModel (ABC ):
42+ @staticmethod
43+ def bin_timestamps (
44+ timestamps : list [datetime ], bins : Union [int , list [datetime ]] = 10
45+ ) -> tuple [np .ndarray , list [datetime ]]:
46+ """Bins timestamps based on given bins.
47+
48+ Parameters
49+ ----------
50+ timestamps: list[datetime]
51+ List of timestamps for documents.
52+ bins: int or list[datetime], default 10
53+ Time bins to use.
54+ If the bins are an int (N), N equally sized bins are used.
55+ Otherwise they should be bin edges, including the last and first edge.
56+ Bins are inclusive at the lower end and exclusive at the upper (lower <= timestamp < upper).
57+
58+ Returns
59+ -------
60+ time_labels: ndarray of int
61+ Time slice label for each document.
62+ bin_edges: list[datetime]
63+ List of edges for time bins.
64+ """
65+ return bin_timestamps (timestamps , bins )
66+
2967 @abstractmethod
3068 def fit_transform_dynamic (
3169 self ,
@@ -79,6 +117,9 @@ def fit_dynamic(
79117 When an `int`, the corpus will be divided into N equal time slices.
80118 When a list, it describes the edges of each time slice including the starting
81119 and final edges of the slices.
120+
121+ Note: The final edge is not included. You might want to add one day to
122+ the last bin edge if it equals the last timestamp.
82123 """
83124 self .fit_transform_dynamic (raw_documents , timestamps , embeddings , bins )
84125 return self
@@ -273,7 +314,7 @@ def plot_topics_over_time(self, top_k: int = 6):
273314 continue
274315 high = high [np .argsort (- values )]
275316 name_over_time .append (", " .join (vocab [high ]))
276- times = self .time_bin_edges [1 : ]
317+ times = self .time_bin_edges [: - 1 ]
277318 fig .add_trace (
278319 go .Scatter (
279320 x = times ,
0 commit comments