Skip to content

Commit b4a1133

Browse files
committed
Improved include handling when computing subspace
1 parent 6d58e54 commit b4a1133

4 files changed

Lines changed: 131 additions & 123 deletions

File tree

docs/Tutorial_Multidimensional_Motif_Discovery.ipynb

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -368,13 +368,13 @@
368368
" motif_idx = np.argmin(mp[:, k])\n",
369369
" motif_mp_val = np.round(mp[motif_idx, k], 2)\n",
370370
" nn_idx = indices[motif_idx, k]\n",
371-
" mp_subspace = _get_subspace(df, m, motif_idx, nn_idx)[k]\n",
371+
" mp_subspace = _get_subspace(df, m, motif_idx, nn_idx, k)\n",
372372
" print(f\"k-Dims: {k + 1}, Motif Idx: {motif_idx}, Nearest Neighbor Idx: {nn_idx}, Motif MP Value: {motif_mp_val}, Subspace(s) {mp_subspace}\")"
373373
]
374374
},
375375
{
376376
"cell_type": "code",
377-
"execution_count": 11,
377+
"execution_count": 10,
378378
"metadata": {},
379379
"outputs": [
380380
{

stumpy/mstump.py

Lines changed: 70 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -40,53 +40,6 @@ def _preprocess_include(include):
4040
return include
4141

4242

43-
def _multi_mass(Q, T, m, M_T, Σ_T, μ_Q, σ_Q):
44-
"""
45-
A multi-dimensional wrapper around "Mueen's Algorithm for Similarity Search"
46-
(MASS) to compute multi-dimensional distance profile.
47-
48-
Parameters
49-
----------
50-
Q : ndarray
51-
Query array or subsequence
52-
53-
T : ndarray
54-
Time series array or sequence
55-
56-
m : int
57-
Window size
58-
59-
M_T : ndarray
60-
Sliding mean for `T_A`
61-
62-
Σ_T : ndarray
63-
Sliding standard deviation for `T_A`
64-
65-
μ_Q : ndarray
66-
Mean value of `Q`
67-
68-
σ_Q : ndarray
69-
Standard deviation of `Q`
70-
71-
Returns
72-
-------
73-
D : ndarray
74-
Multi-dimensional distance profile
75-
"""
76-
d, n = T.shape
77-
k = n - m + 1
78-
79-
D = np.empty((d, k), dtype="float64")
80-
81-
for i in range(d):
82-
if np.isinf(μ_Q[i]):
83-
D[i, :] = np.inf
84-
else:
85-
D[i, :] = core.mass(Q[i], T[i], M_T[i], Σ_T[i])
86-
87-
return D
88-
89-
9043
def _apply_include(
9144
D,
9245
include,
@@ -148,7 +101,54 @@ def _apply_include(
148101
D[unrestricted_indices] = tmp_swap[mask]
149102

150103

151-
def _get_subspace(T, m, motif_idx, nn_idx, include=None, discords=False):
104+
def _multi_mass(Q, T, m, M_T, Σ_T, μ_Q, σ_Q):
105+
"""
106+
A multi-dimensional wrapper around "Mueen's Algorithm for Similarity Search"
107+
(MASS) to compute multi-dimensional distance profile.
108+
109+
Parameters
110+
----------
111+
Q : ndarray
112+
Query array or subsequence
113+
114+
T : ndarray
115+
Time series array or sequence
116+
117+
m : int
118+
Window size
119+
120+
M_T : ndarray
121+
Sliding mean for `T_A`
122+
123+
Σ_T : ndarray
124+
Sliding standard deviation for `T_A`
125+
126+
μ_Q : ndarray
127+
Mean value of `Q`
128+
129+
σ_Q : ndarray
130+
Standard deviation of `Q`
131+
132+
Returns
133+
-------
134+
D : ndarray
135+
Multi-dimensional distance profile
136+
"""
137+
d, n = T.shape
138+
k = n - m + 1
139+
140+
D = np.empty((d, k), dtype="float64")
141+
142+
for i in range(d):
143+
if np.isinf(μ_Q[i]):
144+
D[i, :] = np.inf
145+
else:
146+
D[i, :] = core.mass(Q[i], T[i], M_T[i], Σ_T[i])
147+
148+
return D
149+
150+
151+
def _get_subspace(T, m, motif_idx, nn_idx, k, include=None, discords=False):
152152
"""
153153
Compute the multi-dimensional matrix profile subspace for a given motif index and
154154
its nearest neighbor index
@@ -168,6 +168,10 @@ def _get_subspace(T, m, motif_idx, nn_idx, include=None, discords=False):
168168
nn_idx : int
169169
The nearest neighbor index in T
170170
171+
k : int
172+
The subset number of dimensions out of `D = T.shape[0]`-dimensions to return
173+
the subspace for
174+
171175
include : ndarray, default None
172176
A list of (zero-based) indices corresponding to the dimensions in `T` that
173177
must be included in the constrained multidimensional motif search.
@@ -183,35 +187,32 @@ def _get_subspace(T, m, motif_idx, nn_idx, include=None, discords=False):
183187
Returns
184188
-------
185189
S : ndarray
186-
A ragged array of ndarrays that contain the multi-dimensional subspace for the
187-
window with index equal to `query_idx`. The `len(S)` will be equal to the total
188-
number of dimensions, `d`, and where `S[i]` corresponds to the list of subspace
189-
indices for the `i+1`th subspace dimension (i.e., `S[1]` corresponds to the
190-
subspace with dimension `2` and with `len(S[1]) == 2`).
190+
An array of that contains the `k`th-dimensional subspace for the subsequence
191+
with index equal to `motif_idx`
191192
"""
192193
T, _, _ = core.preprocess(T, m)
193194

194-
S = np.empty(T.shape[0], dtype=object)
195-
D = np.linalg.norm(
196-
core.z_norm(T[:, motif_idx : motif_idx + m], axis=1)
197-
- core.z_norm(T[:, nn_idx : nn_idx + m], axis=1),
198-
axis=1,
199-
)
200-
201-
if include is not None:
202-
include = _preprocess_include(include)
203-
else:
204-
include = []
195+
motif = core.z_norm(T[:, motif_idx : motif_idx + m], axis=1)
196+
neighbor = core.z_norm(T[:, nn_idx : nn_idx + m], axis=1)
197+
D = np.linalg.norm(motif - neighbor, axis=1)
205198

206199
if discords:
207-
D[include] = np.inf
208200
sorted_idx = D[::-1].argsort(axis=0, kind="mergesort")
209201
else:
210-
D[include] = 0.0
211202
sorted_idx = D.argsort(axis=0, kind="mergesort")
212203

213-
for k in range(T.shape[0]):
214-
S[k] = sorted_idx[: k + 1]
204+
# `include` processing occur here since we are dealing with indices, not distances
205+
if include is not None:
206+
include = _preprocess_include(include)
207+
mask = np.in1d(sorted_idx, include)
208+
include_idx = mask.nonzero()[0]
209+
exclude_idx = (~mask).nonzero()[0]
210+
sorted_idx[: include_idx.shape[0]], sorted_idx[include_idx.shape[0] :] = (
211+
sorted_idx[include_idx],
212+
sorted_idx[exclude_idx],
213+
)
214+
215+
S = sorted_idx[: k + 1]
215216

216217
return S
217218

@@ -292,7 +293,6 @@ def _query_mstump_profile(
292293
)
293294

294295
if include is not None:
295-
include = _preprocess_include(include)
296296
_apply_include(D, include)
297297
start_row_idx = include.shape[0]
298298

@@ -700,6 +700,7 @@ def _mstump(
700700
σ_Q,
701701
)
702702

703+
# `include` processing must occur here since we are dealing with distances
703704
if include is not None:
704705
_apply_include(
705706
D,

tests/naive.py

Lines changed: 38 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22
import numpy as np
33
from scipy.spatial.distance import cdist
44
from stumpy import core
5-
from stumpy.mstump import _apply_include
65

76

87
def z_norm(a, axis=0, threshold=1e-7):
@@ -235,6 +234,26 @@ def PI(D, trivial_idx, excl_zone):
235234
return P, I
236235

237236

237+
def apply_include(D, include):
238+
restricted_indices = []
239+
unrestricted_indices = []
240+
mask = np.ones(include.shape[0], bool)
241+
242+
for i in range(include.shape[0]):
243+
if include[i] < include.shape[0]:
244+
restricted_indices.append(include[i])
245+
if include[i] >= include.shape[0]:
246+
unrestricted_indices.append(include[i])
247+
248+
restricted_indices = np.array(restricted_indices, dtype=np.int64)
249+
unrestricted_indices = np.array(unrestricted_indices, dtype=np.int64)
250+
mask[restricted_indices] = False
251+
tmp_swap = D[: include.shape[0]].copy()
252+
253+
D[: include.shape[0]] = D[include]
254+
D[unrestricted_indices] = tmp_swap[mask]
255+
256+
238257
def mstump(T, m, excl_zone, include=None, discords=False):
239258
T = T.copy()
240259

@@ -250,20 +269,12 @@ def mstump(T, m, excl_zone, include=None, discords=False):
250269

251270
start_row_idx = 0
252271
if include is not None:
253-
restricted_indices = include[include < include.shape[0]]
254-
unrestricted_indices = include[include >= include.shape[0]]
255-
mask = np.ones(include.shape[0], bool)
256-
mask[restricted_indices] = False
257-
tmp_swap = D[: include.shape[0]].copy()
258-
D[: include.shape[0]] = D[include]
259-
D[unrestricted_indices] = tmp_swap[mask]
272+
apply_include(D, include)
260273
start_row_idx = include.shape[0]
261274

262275
if discords:
263-
sorted_idx = D[start_row_idx:][::-1].argsort(axis=0, kind="mergesort")
264276
D[start_row_idx:][::-1].sort(axis=0)
265277
else:
266-
sorted_idx = D[start_row_idx:].argsort(axis=0, kind="mergesort")
267278
D[start_row_idx:].sort(axis=0)
268279

269280
D_prime = np.zeros(n - m + 1)
@@ -284,26 +295,34 @@ def mstump(T, m, excl_zone, include=None, discords=False):
284295
return P.T, I.T
285296

286297

287-
def subspace(T, m, motif_idx, nn_idx, include=None, discords=False):
288-
S = np.empty(T.shape[0], dtype=object)
298+
def subspace(T, m, motif_idx, nn_idx, k, include=None, discords=False):
289299
D = distance(
290300
z_norm(T[:, motif_idx : motif_idx + m], axis=1),
291301
z_norm(T[:, nn_idx : nn_idx + m], axis=1),
292302
axis=1,
293303
)
294304

295-
if include is None:
296-
include = []
297-
298305
if discords:
299-
D[include] = np.inf
300306
sorted_idx = D[::-1].argsort(axis=0, kind="mergesort")
301307
else:
302-
D[include] = 0.0
303308
sorted_idx = D.argsort(axis=0, kind="mergesort")
304309

305-
for k in range(T.shape[0]):
306-
S[k] = sorted_idx[: k + 1]
310+
# `include` processing can occur since we are dealing with indices, not distances
311+
if include is not None:
312+
include_idx = []
313+
for i in range(include.shape[0]):
314+
include_idx.append(np.isin(sorted_idx, include[i]).nonzero()[0])
315+
include_idx = np.array(include_idx).flatten()
316+
include_idx.sort()
317+
exclude_idx = np.ones(T.shape[0], dtype=bool)
318+
exclude_idx[include_idx] = False
319+
exclude_idx = exclude_idx.nonzero()[0]
320+
sorted_idx[: include_idx.shape[0]], sorted_idx[include_idx.shape[0] :] = (
321+
sorted_idx[include_idx],
322+
sorted_idx[exclude_idx],
323+
)
324+
325+
S = sorted_idx[: k + 1]
307326

308327
return S
309328

0 commit comments

Comments
 (0)