Skip to content

Commit 6d58e54

Browse files
committed
Simplified subspace
1 parent f0c36d3 commit 6d58e54

7 files changed

Lines changed: 220 additions & 88 deletions

docs/Tutorial_Multidimensional_Motif_Discovery.ipynb

Lines changed: 3 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -348,7 +348,7 @@
348348
},
349349
{
350350
"cell_type": "code",
351-
"execution_count": 10,
351+
"execution_count": 9,
352352
"metadata": {},
353353
"outputs": [
354354
{
@@ -362,26 +362,13 @@
362362
}
363363
],
364364
"source": [
365-
"from stumpy.mstump import _query_mstump_profile\n",
366-
"from stumpy import core\n",
367-
"\n",
368-
"def subspace(T, m, T_idx, k, include=None, discords=False, M_T=None, Σ_T=None):\n",
369-
" if M_T is None or Σ_T is None:\n",
370-
" T, M_T, Σ_T = stumpy.core.preprocess(T, m)\n",
371-
"\n",
372-
" excl_zone = int(np.ceil(m / 4))\n",
373-
"\n",
374-
" P, I, S = _query_mstump_profile(\n",
375-
" T_idx, T, T, m, excl_zone, M_T, Σ_T, M_T, Σ_T, include, discords\n",
376-
" )\n",
377-
" \n",
378-
" return S[k]\n",
365+
"from stumpy.mstump import _get_subspace\n",
379366
"\n",
380367
"for k in range(df.shape[1]):\n",
381368
" motif_idx = np.argmin(mp[:, k])\n",
382369
" motif_mp_val = np.round(mp[motif_idx, k], 2)\n",
383370
" nn_idx = indices[motif_idx, k]\n",
384-
" mp_subspace = subspace(df, m, motif_idx, k)\n",
371+
" mp_subspace = _get_subspace(df, m, motif_idx, nn_idx)[k]\n",
385372
" print(f\"k-Dims: {k + 1}, Motif Idx: {motif_idx}, Nearest Neighbor Idx: {nn_idx}, Motif MP Value: {motif_mp_val}, Subspace(s) {mp_subspace}\")"
386373
]
387374
},

stumpy/mstump.py

Lines changed: 112 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,34 @@
1212
logger = logging.getLogger(__name__)
1313

1414

15+
def _preprocess_include(include):
16+
"""
17+
A utility function for processing the `include` input
18+
19+
Parameters
20+
----------
21+
include : ndarray
22+
A list of (zero-based) indices corresponding to the dimensions in `T` that
23+
must be included in the constrained multidimensional motif search.
24+
For more information, see Section IV D in:
25+
26+
`DOI: 10.1109/ICDM.2017.66 \
27+
<https://www.cs.ucr.edu/~eamonn/Motif_Discovery_ICDM.pdf>`__
28+
29+
Returns
30+
-------
31+
include : ndarray
32+
Process `include` and remove any redundant index values
33+
"""
34+
include = np.asarray(include)
35+
_, idx = np.unique(include, return_index=True)
36+
if include.shape[0] != idx.shape[0]: # pragma: no cover
37+
logger.warning("Removed repeating indices in `include`")
38+
include = include[np.sort(idx)]
39+
40+
return include
41+
42+
1543
def _multi_mass(Q, T, m, M_T, Σ_T, μ_Q, σ_Q):
1644
"""
1745
A multi-dimensional wrapper around "Mueen's Algorithm for Similarity Search"
@@ -99,6 +127,8 @@ def _apply_include(
99127
tmp_swap : ndarray, default None
100128
A reusable array to aid in array element swapping
101129
"""
130+
include = _preprocess_include(include)
131+
102132
if restricted_indices is None:
103133
restricted_indices = include[include < include.shape[0]]
104134

@@ -118,27 +148,92 @@ def _apply_include(
118148
D[unrestricted_indices] = tmp_swap[mask]
119149

120150

151+
def _get_subspace(T, m, motif_idx, nn_idx, include=None, discords=False):
152+
"""
153+
Compute the multi-dimensional matrix profile subspace for a given motif index and
154+
its nearest neighbor index
155+
156+
Parameters
157+
----------
158+
T : ndarray
159+
The time series or sequence for which the multi-dimensional matrix profile,
160+
multi-dimensional matrix profile indices were computed
161+
162+
m : int
163+
Window size
164+
165+
motif_idx : int
166+
The motif index in T
167+
168+
nn_idx : int
169+
The nearest neighbor index in T
170+
171+
include : ndarray, default None
172+
A list of (zero-based) indices corresponding to the dimensions in `T` that
173+
must be included in the constrained multidimensional motif search.
174+
For more information, see Section IV D in:
175+
176+
`DOI: 10.1109/ICDM.2017.66 \
177+
<https://www.cs.ucr.edu/~eamonn/Motif_Discovery_ICDM.pdf>`__
178+
179+
discords : bool, default False
180+
When set to `True`, this reverses the distance profile to favor discords rather
181+
than motifs. Note that indices in `include` are still maintained and respected.
182+
183+
Returns
184+
-------
185+
S : ndarray
186+
A ragged array of ndarrays that contain the multi-dimensional subspace for the
187+
window with index equal to `query_idx`. The `len(S)` will be equal to the total
188+
number of dimensions, `d`, and where `S[i]` corresponds to the list of subspace
189+
indices for the `i+1`th subspace dimension (i.e., `S[1]` corresponds to the
190+
subspace with dimension `2` and with `len(S[1]) == 2`).
191+
"""
192+
T, _, _ = core.preprocess(T, m)
193+
194+
S = np.empty(T.shape[0], dtype=object)
195+
D = np.linalg.norm(
196+
core.z_norm(T[:, motif_idx : motif_idx + m], axis=1)
197+
- core.z_norm(T[:, nn_idx : nn_idx + m], axis=1),
198+
axis=1,
199+
)
200+
201+
if include is not None:
202+
include = _preprocess_include(include)
203+
else:
204+
include = []
205+
206+
if discords:
207+
D[include] = np.inf
208+
sorted_idx = D[::-1].argsort(axis=0, kind="mergesort")
209+
else:
210+
D[include] = 0.0
211+
sorted_idx = D.argsort(axis=0, kind="mergesort")
212+
213+
for k in range(T.shape[0]):
214+
S[k] = sorted_idx[: k + 1]
215+
216+
return S
217+
218+
121219
def _query_mstump_profile(
122220
query_idx, T_A, T_B, m, excl_zone, M_T, Σ_T, μ_Q, σ_Q, include=None, discords=False
123221
):
124222
"""
125-
Multi-dimensional wrapper to compute the multi-dimensional matrix profile,
126-
the multi-dimensional matrix profile index, the multi-dimensional matrix profile
127-
subspace for a given query window within the times series or sequence that is
128-
denoted by the `query_idx` index. Essentially, this is a convenience wrapper around
129-
`_multi_mass`.
223+
Multi-dimensional wrapper to compute the multi-dimensional matrix profile and
224+
the multi-dimensional matrix profile index for a given query window within the times
225+
series or sequence that is denoted by the `query_idx` index. Essentially, this is a
226+
convenience wrapper around `_multi_mass`.
130227
131228
Parameters
132229
----------
133230
query_idx : int
134-
The window index to calculate the first multi-dimensional matrix profile,
135-
multi-dimensional matrix profile indices, and multi-dimensional matrix profile
136-
subspace.
231+
The window index to calculate the first multi-dimensional matrix profile and
232+
multi-dimensional matrix profile indices
137233
138234
T_A : ndarray
139-
The time series or sequence for which the multi-dimensional matrix profile,
140-
multi-dimensional matrix profile indices, and multi-dimensional matrix profile
141-
subspace will be returned
235+
The time series or sequence for which the multi-dimensional matrix profile and
236+
multi-dimensional matrix profile indices
142237
143238
T_B : ndarray
144239
The time series or sequence that contains your query subsequences
@@ -182,13 +277,6 @@ def _query_mstump_profile(
182277
I : ndarray
183278
Multi-dimensional matrix profile indices for the window with index
184279
equal to `query_idx`
185-
186-
S : ndarray
187-
A ragged array of ndarrays that contain the multi-dimensional subspace for the
188-
window with index equal to `query_idx`. The `len(S)` will be equal to the total
189-
number of dimensions, `d`, and where `S[i]` corresponds to the list of subspace
190-
indices for the `i+1`th subspace dimension (i.e., `S[1]` corresponds to the
191-
subspace with dimension `2` and with `len(S[1]) == 2`).
192280
"""
193281
d, n = T_A.shape
194282
k = n - m + 1
@@ -204,19 +292,14 @@ def _query_mstump_profile(
204292
)
205293

206294
if include is not None:
295+
include = _preprocess_include(include)
207296
_apply_include(D, include)
208297
start_row_idx = include.shape[0]
209298

210299
if discords:
211-
# D[start_row_idx:][::-1].sort(axis=0)
212-
sorted_idx = D[start_row_idx:][::-1].argsort(axis=0, kind="mergesort")
213-
broadcast_idx = np.arange(D[start_row_idx:].shape[1])[np.newaxis, :]
214-
D[start_row_idx:][::-1] = D[start_row_idx:][::-1][sorted_idx, broadcast_idx]
300+
D[start_row_idx:][::-1].sort(axis=0, kind="mergesort")
215301
else:
216-
# D[start_row_idx:].sort(axis=0)
217-
sorted_idx = D[start_row_idx:].argsort(axis=0, kind="mergesort")
218-
broadcast_idx = np.arange(D[start_row_idx:].shape[1])[np.newaxis, :]
219-
D[start_row_idx:] = D[start_row_idx:][sorted_idx, broadcast_idx]
302+
D[start_row_idx:].sort(axis=0, kind="mergesort")
220303

221304
D_prime = np.zeros(k)
222305
for i in range(d):
@@ -227,18 +310,15 @@ def _query_mstump_profile(
227310

228311
P = np.full(d, np.inf, dtype="float64")
229312
I = np.full(d, -1, dtype="int64")
230-
S = np.empty(d, dtype=object)
231313

232314
for i in range(d):
233315
min_index = np.argmin(D[i])
234316
I[i] = min_index
235317
P[i] = D[i, min_index]
236-
S[i] = sorted_idx[: i + 1, min_index]
237318
if np.isinf(P[i]): # pragma nocover
238319
I[i] = -1
239-
S[i][:] = -1
240320

241-
return P, I, S
321+
return P, I
242322

243323

244324
def _get_first_mstump_profile(
@@ -306,7 +386,7 @@ def _get_first_mstump_profile(
306386
Multi-dimensional matrix profile indices for the window with index
307387
equal to `start`
308388
"""
309-
P, I, _ = _query_mstump_profile(
389+
P, I = _query_mstump_profile(
310390
start, T_A, T_B, m, excl_zone, M_T, Σ_T, μ_Q, σ_Q, include, discords
311391
)
312392
return P, I
@@ -706,11 +786,7 @@ def mstump(T, m, include=None, discords=False):
706786
core.check_window_size(m)
707787

708788
if include is not None:
709-
include = np.asarray(include)
710-
_, idx = np.unique(include, return_index=True)
711-
if include.shape[0] != idx.shape[0]: # pragma: no cover
712-
logger.warning("Removed repeating indices in `include`")
713-
include = include[np.sort(idx)]
789+
include = _preprocess_include(include)
714790

715791
d, n = T_B.shape
716792
k = n - m + 1

tests/naive.py

Lines changed: 26 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import numpy as np
33
from scipy.spatial.distance import cdist
44
from stumpy import core
5+
from stumpy.mstump import _apply_include
56

67

78
def z_norm(a, axis=0, threshold=1e-7):
@@ -242,10 +243,6 @@ def mstump(T, m, excl_zone, include=None, discords=False):
242243

243244
P = np.full((d, k), np.inf)
244245
I = np.ones((d, k), dtype="int64") * -1
245-
S = np.empty((d, k), dtype=object)
246-
for i in range(d):
247-
for j in range(k):
248-
S[i, j] = -np.ones(i + 1, dtype=np.int64)
249246

250247
for i in range(k):
251248
Q = T[:, i : i + m]
@@ -280,14 +277,35 @@ def mstump(T, m, excl_zone, include=None, discords=False):
280277
P_i, I_i = PI(D_prime_prime, i, excl_zone)
281278

282279
for dim in range(T.shape[0]):
283-
for col_idx in range(P.shape[1]):
284-
if P[dim, col_idx] > P_i[dim, col_idx]:
285-
S[dim, col_idx] = sorted_idx[: dim + 1, col_idx]
286280
col_mask = P[dim] > P_i[dim]
287281
P[dim, col_mask] = P_i[dim, col_mask]
288282
I[dim, col_mask] = I_i[dim, col_mask]
289283

290-
return P.T, I.T, S.T
284+
return P.T, I.T
285+
286+
287+
def subspace(T, m, motif_idx, nn_idx, include=None, discords=False):
288+
S = np.empty(T.shape[0], dtype=object)
289+
D = distance(
290+
z_norm(T[:, motif_idx : motif_idx + m], axis=1),
291+
z_norm(T[:, nn_idx : nn_idx + m], axis=1),
292+
axis=1,
293+
)
294+
295+
if include is None:
296+
include = []
297+
298+
if discords:
299+
D[include] = np.inf
300+
sorted_idx = D[::-1].argsort(axis=0, kind="mergesort")
301+
else:
302+
D[include] = 0.0
303+
sorted_idx = D.argsort(axis=0, kind="mergesort")
304+
305+
for k in range(T.shape[0]):
306+
S[k] = sorted_idx[: k + 1]
307+
308+
return S
291309

292310

293311
def get_array_ranges(a, n_chunks, truncate=False):

0 commit comments

Comments
 (0)