Skip to content

Commit 029a6d2

Browse files
add isconstant param to mstump (#871)
* add isconstant param to naive mstump and relevant naive functions * rename param and add support if it is callable object * add test function for param isconstant, expecting error * revise naive function to support only numpy.ndarray or callable function. update test functions * add param to performant multi_mass and fix issues * add test function for multi_distance_profile, expecting error * add param isconstant to private func and fix issues * fix decorator * update mstumped * add new test functions, and revise naive function * add new test function for mstump with param isconstant, expecting error * add param to performant mstump and fix issues * preprocess T in the beginning to ensure the input becomes np.ndarray even if provided as df * minor changes * add param isconstant to subspace and mdl * fix minor bug * fix decorator * remove trailing colon * replace reshape with expand_dims to improve readability * minor changes * add test for isconstant support in mstumped, expecting error * add support for isconstant, fixed error * add missing param p for p-norm support * improve docstrings * re-order elements of a list for sake of readability * Refactor * fix docstring * fix docstring * use numpy.expand_dims instead of reshape(-1, 1) to improve readability
1 parent c9a84f9 commit 029a6d2

6 files changed

Lines changed: 496 additions & 27 deletions

File tree

stumpy/floss.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,7 @@ def _iac(
109109
b_mean = np.round(np.mean(params[:, 1]), 2)
110110

111111
IAC = scipy.stats.beta.pdf(np.arange(width), a_mean, b_mean, loc=0, scale=width)
112-
slope, _, _, _ = np.linalg.lstsq(IAC.reshape(-1, 1), target_AC, rcond=None)
112+
slope, _, _, _ = np.linalg.lstsq(np.expand_dims(IAC, axis=1), target_AC, rcond=None)
113113

114114
IAC *= slope
115115

stumpy/mstump.py

Lines changed: 125 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,18 @@
1212
from .maamp import maamp, maamp_mdl, maamp_multi_distance_profile, maamp_subspace
1313

1414

15-
def _multi_mass(Q, T, m, M_T, Σ_T, μ_Q, σ_Q, T_subseq_isconstant):
15+
def _multi_mass(
16+
Q,
17+
T,
18+
m,
19+
M_T,
20+
Σ_T,
21+
μ_Q,
22+
σ_Q,
23+
T_subseq_isconstant,
24+
Q_subseq_isconstant,
25+
query_idx=None,
26+
):
1627
"""
1728
A multi-dimensional wrapper around "Mueen's Algorithm for Similarity Search"
1829
(MASS) to compute multi-dimensional distance profile.
@@ -43,6 +54,16 @@ def _multi_mass(Q, T, m, M_T, Σ_T, μ_Q, σ_Q, T_subseq_isconstant):
4354
T_subseq_isconstant : numpy.ndarray
4455
A boolean array that indicates whether a subsequence in `T` is constant (True)
4556
57+
Q_subseq_isconstant : numpy.ndarray
58+
A boolean array that indicates whether a subsequence in `Q` is constant (True)
59+
60+
query_idx : int, default None
61+
This is the index position along each of the time series in `T`, where
62+
the query subsequence, `Q`, is located. `query_idx` should be set to None
63+
if `Q` is not a subsequence of `T`. If `Q` is a subsequence of `T`, providing
64+
this argument is optional. If query_idx is provided, the distance between Q
65+
and `T[:, query_idx : query_idx + m]` will automatically be set to zero.
66+
4667
Returns
4768
-------
4869
D : numpy.ndarray
@@ -58,13 +79,19 @@ def _multi_mass(Q, T, m, M_T, Σ_T, μ_Q, σ_Q, T_subseq_isconstant):
5879
D[i, :] = np.inf
5980
else:
6081
D[i, :] = core.mass(
61-
Q[i], T[i], M_T[i], Σ_T[i], T_subseq_isconstant=T_subseq_isconstant[i]
82+
Q[i],
83+
T[i],
84+
M_T[i],
85+
Σ_T[i],
86+
T_subseq_isconstant=T_subseq_isconstant[i],
87+
Q_subseq_isconstant=Q_subseq_isconstant[i],
88+
query_idx=query_idx,
6289
)
6390

6491
return D
6592

6693

67-
@core.non_normalized(maamp_subspace)
94+
@core.non_normalized(maamp_subspace, exclude=["normalize", "T_subseq_isconstant"])
6895
def subspace(
6996
T,
7097
m,
@@ -77,6 +104,7 @@ def subspace(
77104
n_bit=8,
78105
normalize=True,
79106
p=2.0,
107+
T_subseq_isconstant=None,
80108
):
81109
"""
82110
Compute the k-dimensional matrix profile subspace for a given subsequence index and
@@ -141,6 +169,15 @@ def subspace(
141169
and the Euclidean distance, respectively. This parameter is ignored when
142170
`normalize == True`.
143171
172+
T_subseq_isconstant : numpy.ndarray, function, or list, default None
173+
A parameter that is used to show whether a subsequence of a time series in `T`
174+
is constant (True) or not. T_subseq_isconstant can be a 2D boolean numpy.ndarray
175+
or a function that can be applied to each time series in `T`. Alternatively, for
176+
maximum flexibility, a list (with length equal to the total number of time
177+
series) may also be used. In this case, T_subseq_isconstant[i] corresponds to
178+
the i-th time series T[i] and each element in the list can either be a 1D
179+
boolean np.ndarray, a function, or None.
180+
144181
Returns
145182
-------
146183
S : numpy.ndarray
@@ -177,15 +214,19 @@ def subspace(
177214
"""
178215
T = core._preprocess(T)
179216
core.check_window_size(m, max_size=T.shape[-1])
217+
T_subseq_isconstant = core.process_isconstant(T, m, T_subseq_isconstant)
180218

181219
if discretize_func is None:
182220
bins = _inverse_norm(n_bit)
183221
discretize_func = partial(_discretize, bins=bins)
184222

185223
subseqs, _, _, _ = core.preprocess(T[:, subseq_idx : subseq_idx + m], m)
186224
subseqs = core.z_norm(subseqs, axis=1)
225+
subseqs[T_subseq_isconstant[:, subseq_idx]] = 0.0
226+
187227
neighbors, _, _, _ = core.preprocess(T[:, nn_idx : nn_idx + m], m)
188228
neighbors = core.z_norm(neighbors, axis=1)
229+
neighbors[T_subseq_isconstant[:, nn_idx]] = 0.0
189230

190231
disc_subseqs = discretize_func(subseqs)
191232
disc_neighbors = discretize_func(neighbors)
@@ -243,7 +284,7 @@ def _discretize(a, bins, right=True): # pragma: no cover
243284
return np.digitize(a, bins, right=right)
244285

245286

246-
@core.non_normalized(maamp_mdl)
287+
@core.non_normalized(maamp_mdl, exclude=["normalize", "T_subseq_isconstant"])
247288
def mdl(
248289
T,
249290
m,
@@ -255,6 +296,7 @@ def mdl(
255296
n_bit=8,
256297
normalize=True,
257298
p=2.0,
299+
T_subseq_isconstant=None,
258300
):
259301
"""
260302
Compute the multi-dimensional number of bits needed to compress one
@@ -316,6 +358,15 @@ def mdl(
316358
and the Euclidean distance, respectively. This parameter is ignored when
317359
`normalize == True`.
318360
361+
T_subseq_isconstant : numpy.ndarray, function, or list, default None
362+
A parameter that is used to show whether a subsequence of a time series in `T`
363+
is constant (True) or not. T_subseq_isconstant can be a 2D boolean numpy.ndarray
364+
or a function that can be applied to each time series in `T`. Alternatively, for
365+
maximum flexibility, a list (with length equal to the total number of time
366+
series) may also be used. In this case, T_subseq_isconstant[i] corresponds to
367+
the i-th time series T[i] and each element in the list can either be a 1D
368+
boolean np.ndarray, a function, or None.
369+
319370
Returns
320371
-------
321372
bit_sizes : numpy.ndarray
@@ -352,6 +403,7 @@ def mdl(
352403
"""
353404
T = core._preprocess(T)
354405
core.check_window_size(m, max_size=T.shape[-1])
406+
T_subseq_isconstant = core.process_isconstant(T, m, T_subseq_isconstant)
355407

356408
if discretize_func is None:
357409
bins = _inverse_norm(n_bit)
@@ -362,8 +414,11 @@ def mdl(
362414
for k in range(T.shape[0]):
363415
subseqs, _, _, _ = core.preprocess(T[:, subseq_idx[k] : subseq_idx[k] + m], m)
364416
subseqs = core.z_norm(subseqs, axis=1)
417+
subseqs[T_subseq_isconstant[:, subseq_idx[k]]] = 0.0
418+
365419
neighbors, _, _, _ = core.preprocess(T[:, nn_idx[k] : nn_idx[k] + m], m)
366420
neighbors = core.z_norm(neighbors, axis=1)
421+
neighbors[T_subseq_isconstant[:, nn_idx[k]]] = 0.0
367422

368423
disc_subseqs = discretize_func(subseqs)
369424
disc_neighbors = discretize_func(neighbors)
@@ -387,6 +442,7 @@ def _multi_distance_profile(
387442
μ_Q,
388443
σ_Q,
389444
T_subseq_isconstant,
445+
Q_subseq_isconstant,
390446
include=None,
391447
discords=False,
392448
excl_zone=None,
@@ -399,7 +455,7 @@ def _multi_distance_profile(
399455
Parameters
400456
----------
401457
query_idx : int
402-
The window index to calculate the multi-dimensional distance profile for
458+
The start index of the (multi-dimensional) query subsequence in `T_B`
403459
404460
T_A : numpy.ndarray
405461
The time series or sequence for which the multi-dimensional distance profile
@@ -426,6 +482,10 @@ def _multi_distance_profile(
426482
T_subseq_isconstant : numpy.ndarray
427483
A boolean array that indicates whether a subsequence in `T_A` is constant (True)
428484
485+
Q_subseq_isconstant : numpy.ndarray
486+
A boolean array that indicates whether a subsequence in `T_B` is
487+
constant (True)
488+
429489
include : numpy.ndarray, default None
430490
A list of (zero-based) indices corresponding to the dimensions in `T` that
431491
must be included in the constrained multidimensional motif search.
@@ -450,6 +510,7 @@ def _multi_distance_profile(
450510
d, n = T_A.shape
451511
k = n - m + 1
452512
start_row_idx = 0
513+
453514
D = _multi_mass(
454515
T_B[:, query_idx : query_idx + m],
455516
T_A,
@@ -459,6 +520,8 @@ def _multi_distance_profile(
459520
μ_Q[:, query_idx],
460521
σ_Q[:, query_idx],
461522
T_subseq_isconstant,
523+
np.expand_dims(Q_subseq_isconstant[:, query_idx], 1),
524+
query_idx=query_idx,
462525
)
463526

464527
if include is not None:
@@ -481,9 +544,18 @@ def _multi_distance_profile(
481544
return D
482545

483546

484-
@core.non_normalized(maamp_multi_distance_profile)
547+
@core.non_normalized(
548+
maamp_multi_distance_profile, exclude=["normalize", "T_subseq_isconstant"]
549+
)
485550
def multi_distance_profile(
486-
query_idx, T, m, include=None, discords=False, normalize=True, p=2.0
551+
query_idx,
552+
T,
553+
m,
554+
include=None,
555+
discords=False,
556+
normalize=True,
557+
p=2.0,
558+
T_subseq_isconstant=None,
487559
):
488560
"""
489561
Multi-dimensional wrapper to compute the multi-dimensional distance profile for a
@@ -525,13 +597,24 @@ def multi_distance_profile(
525597
and the Euclidean distance, respectively. This parameter is ignored when
526598
`normalize == True`.
527599
600+
T_subseq_isconstant : numpy.ndarray, function, or list, default None
601+
A parameter that is used to show whether a subsequence of a time series in `T`
602+
is constant (True) or not. T_subseq_isconstant can be a 2D boolean numpy.ndarray
603+
or a function that can be applied to each time series in `T`. Alternatively, for
604+
maximum flexibility, a list (with length equal to the total number of time
605+
series) may also be used. In this case, T_subseq_isconstant[i] corresponds to
606+
the i-th time series T[i] and each element in the list can either be a 1D
607+
boolean np.ndarray, a function, or None.
608+
528609
Returns
529610
-------
530611
D : numpy.ndarray
531612
Multi-dimensional distance profile for the window with index equal to
532613
`query_idx`
533614
"""
534-
T, M_T, Σ_T, T_subseq_isconstant = core.preprocess(T, m)
615+
T, M_T, Σ_T, T_subseq_isconstant = core.preprocess(
616+
T, m, T_subseq_isconstant=T_subseq_isconstant
617+
)
535618

536619
if T.ndim <= 1: # pragma: no cover
537620
err = f"T is {T.ndim}-dimensional and must be at least 1-dimensional"
@@ -556,6 +639,7 @@ def multi_distance_profile(
556639
M_T,
557640
Σ_T,
558641
T_subseq_isconstant,
642+
T_subseq_isconstant,
559643
include,
560644
discords,
561645
excl_zone,
@@ -575,6 +659,7 @@ def _get_first_mstump_profile(
575659
μ_Q,
576660
σ_Q,
577661
T_subseq_isconstant,
662+
Q_subseq_isconstant,
578663
include=None,
579664
discords=False,
580665
):
@@ -621,6 +706,10 @@ def _get_first_mstump_profile(
621706
T_subseq_isconstant : numpy.ndarray
622707
A boolean array that indicates whether a subsequence in `T_A` is constant (True)
623708
709+
Q_subseq_isconstant : numpy.ndarray
710+
A boolean array that indicates whether a (query) subsequence in `T_B` is
711+
constant (True)
712+
624713
include : numpy.ndarray, default None
625714
A list of (zero-based) indices corresponding to the dimensions in `T` that
626715
must be included in the constrained multidimensional motif search.
@@ -653,6 +742,7 @@ def _get_first_mstump_profile(
653742
μ_Q,
654743
σ_Q,
655744
T_subseq_isconstant,
745+
Q_subseq_isconstant,
656746
include,
657747
discords,
658748
excl_zone,
@@ -1013,8 +1103,10 @@ def _mstump(
10131103
return P, I
10141104

10151105

1016-
@core.non_normalized(maamp)
1017-
def mstump(T, m, include=None, discords=False, normalize=True, p=2.0):
1106+
@core.non_normalized(maamp, exclude=["normalize", "T_subseq_isconstant"])
1107+
def mstump(
1108+
T, m, include=None, discords=False, normalize=True, p=2.0, T_subseq_isconstant=None
1109+
):
10181110
"""
10191111
Compute the multi-dimensional z-normalized matrix profile
10201112
@@ -1059,6 +1151,15 @@ def mstump(T, m, include=None, discords=False, normalize=True, p=2.0):
10591151
and the Euclidean distance, respectively. This parameter is ignored when
10601152
`normalize == True`.
10611153
1154+
T_subseq_isconstant : numpy.ndarray, function, or list, default None
1155+
A parameter that is used to show whether a subsequence of a time series in `T`
1156+
is constant (True) or not. T_subseq_isconstant can be a 2D boolean numpy.ndarray
1157+
or a function that can be applied to each time series in `T`. Alternatively, for
1158+
maximum flexibility, a list (with length equal to the total number of time
1159+
series) may also be used. In this case, T_subseq_isconstant[i] corresponds to
1160+
the i-th time series T[i] and each element in the list can either be a 1D
1161+
boolean np.ndarray, a function, or None.
1162+
10621163
Returns
10631164
-------
10641165
P : numpy.ndarray
@@ -1100,8 +1201,19 @@ def mstump(T, m, include=None, discords=False, normalize=True, p=2.0):
11001201
T_A = T
11011202
T_B = T_A
11021203

1103-
T_A, M_T, Σ_T, T_subseq_isconstant = core.preprocess(T_A, m)
1104-
T_B, μ_Q, σ_Q, Q_subseq_isconstant = core.preprocess(T_B, m)
1204+
T_A = core._preprocess(T_A)
1205+
T_B = core._preprocess(T_B)
1206+
1207+
T_A_subseq_isconstant = T_subseq_isconstant
1208+
T_A_subseq_isconstant = core.process_isconstant(T_A, m, T_A_subseq_isconstant)
1209+
T_B_subseq_isconstant = T_A_subseq_isconstant
1210+
1211+
T_A, M_T, Σ_T, T_subseq_isconstant = core.preprocess(
1212+
T_A, m, T_subseq_isconstant=T_A_subseq_isconstant
1213+
)
1214+
T_B, μ_Q, σ_Q, Q_subseq_isconstant = core.preprocess(
1215+
T_B, m, T_subseq_isconstant=T_B_subseq_isconstant
1216+
)
11051217

11061218
if T_A.ndim <= 1: # pragma: no cover
11071219
err = f"T is {T_A.ndim}-dimensional and must be at least 1-dimensional"
@@ -1135,6 +1247,7 @@ def mstump(T, m, include=None, discords=False, normalize=True, p=2.0):
11351247
μ_Q,
11361248
σ_Q,
11371249
T_subseq_isconstant,
1250+
Q_subseq_isconstant,
11381251
include,
11391252
discords,
11401253
)

0 commit comments

Comments
 (0)