Skip to content

Commit 1a6b276

Browse files
committed
feat: add matrix word search and basic BPE algorithm
Signed-off-by: Ayush Joshi <ayush854032@gmail.com>
1 parent b320274 commit 1a6b276

10 files changed

Lines changed: 2460 additions & 0 deletions

File tree

algos/matrix/word_search.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
"""Given an `m x n` grid of characters board and a string word, return true if word
2+
exists in the grid.
3+
4+
The word can be constructed from letters of sequentially adjacent cells, where adjacent
5+
cells are horizontally or vertically neighboring. The same letter cell may not be used
6+
more than once.
7+
"""
8+
9+
from typing import List
10+
11+
12+
class Solution:
    """DFS-with-backtracking solver for the "Word Search" grid problem."""

    def _is_in_bounds(self, i: int, j: int, rows: int, cols: int) -> bool:
        """Return True if cell (i, j) lies inside a rows x cols grid."""
        return 0 <= i < rows and 0 <= j < cols

    def exist(self, board: List[List[str]], word: str) -> bool:
        """Return True if `word` can be traced through adjacent cells of `board`.

        Args:
            board: m x n grid of single-character strings. Temporarily mutated
                during the search (visited cells are marked "#") but fully
                restored before returning.
            word: Word to search for. An empty word is trivially present.

        Returns:
            True if the word exists in the grid, else False.
        """
        # Edge cases: an empty word is vacuously found; an empty board holds
        # nothing (this also guards the board[0] access below).
        if not word:
            return True
        if not board or not board[0]:
            return False

        rows, cols = len(board), len(board[0])

        def dfs(i: int, j: int, k: int) -> bool:
            # Prune: out of bounds, or cell does not match word[k]
            # (visited cells hold "#" so they can never match).
            if not self._is_in_bounds(i, j, rows, cols) or board[i][j] != word[k]:
                return False

            if k == len(word) - 1:
                return True

            # Mark the cell visited so it cannot be reused on this path.
            temp = board[i][j]
            board[i][j] = "#"

            found = (
                dfs(i - 1, j, k + 1)
                or dfs(i, j + 1, k + 1)
                or dfs(i + 1, j, k + 1)
                or dfs(i, j - 1, k + 1)
            )

            # Backtrack: restore the cell for other candidate paths.
            board[i][j] = temp
            return found

        for i in range(rows):
            for j in range(cols):
                # Only start a DFS from cells matching the first letter.
                if board[i][j] == word[0] and dfs(i, j, 0):
                    return True

        return False

algos/strings/basic_tokenizer.py

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
from typing import List, Dict, Optional, Tuple
2+
3+
4+
class BasicTokenizer:
    """Minimal byte-level BPE (Byte Pair Encoding) tokenizer.

    Starts from the 256 raw byte values and iteratively merges the most
    frequent pair of adjacent tokens into a new token until the requested
    vocabulary size is reached.
    """

    def __init__(self) -> None:
        # (int, int) pair -> merged token id, in the order merges were learned.
        self._merges: Dict[Tuple[int, int], int] = {}
        # token id -> raw bytes the token expands to.
        self._vocab: Dict[int, bytes] = {}

    def _get_consecutive_pair_stats(
        self, ids: List[int], counts: Optional[Dict] = None
    ) -> Dict:
        """Given a list of integers, return a dictionary of counts of
        consecutive pairs.

        Optionally allows to update an existing dictionary of counts.

        Example:
            >>> [1, 2, 3, 1, 2] -> {(1, 2): 2, (2, 3): 1, (3, 1): 1}
        """
        # Compare with `is None` (not truthiness) so that a caller-supplied
        # empty dict is updated in place, as the docstring promises.
        if counts is None:
            counts = {}
        # iterate consecutive elements
        for i1, i2 in zip(ids, ids[1:]):
            counts[(i1, i2)] = counts.get((i1, i2), 0) + 1
        return counts

    def _merge_consecutive_pair(
        self, ids: List[int], pair: Tuple[int, int], idx: int
    ) -> List[int]:
        """In the list of integers (ids), replace all consecutive occurrences
        of pair with the new integer token idx.

        Example:
            >>> ids=[1, 2, 3, 1, 2], pair=(1, 2), idx=4 -> [4, 3, 4]
        """
        out = []
        i = 0
        while i < len(ids):
            # if not at the very last position AND the pair matches, replace it
            # (bounds check first, so ids[i + 1] is always safe)
            if i < len(ids) - 1 and ids[i] == pair[0] and ids[i + 1] == pair[1]:
                out.append(idx)
                i += 2
            else:
                out.append(ids[i])
                i += 1
        return out

    def train(self, text: str, vocab_size: int = 256) -> None:
        """Learn up to ``vocab_size - 256`` merges from ``text``.

        Stops early once no adjacent pair remains. Retraining fully resets
        previously learned state.

        Raises:
            AssertionError: if ``vocab_size`` is below the 256 base bytes.
        """
        # AssertionError is part of the public contract (tests rely on it).
        assert vocab_size >= 256
        n_merges = vocab_size - 256

        ids = list(text.encode("utf-8"))
        # Reset BOTH tables so repeated train() calls cannot leak stale
        # merges whose token ids are absent from the rebuilt vocab.
        self._merges = {}
        self._vocab = {idx: bytes([idx]) for idx in range(256)}
        # iteratively merge the most common pairs to create new
        # tokens; (int, int) -> int
        for i in range(n_merges):
            consecutive_pair_stats = self._get_consecutive_pair_stats(ids)
            if not consecutive_pair_stats:
                break
            (i1, i2) = max(consecutive_pair_stats, key=consecutive_pair_stats.get)
            # mint a new token: assign it the next available id
            idx = 256 + i
            ids = self._merge_consecutive_pair(ids, (i1, i2), idx)
            self._merges[(i1, i2)] = idx
            self._vocab[idx] = self._vocab[i1] + self._vocab[i2]

    def encode(self, text: str) -> List[int]:
        """Encode ``text`` into token ids, applying learned merges in the
        order they were learned (lowest merge rank first)."""
        ids = list(text.encode("utf-8"))
        while len(ids) >= 2:
            consecutive_pair_stats = self._get_consecutive_pair_stats(ids)
            # Pick the present pair that was merged earliest during training.
            (i1, i2) = min(
                consecutive_pair_stats, key=lambda p: self._merges.get(p, float("inf"))
            )
            # if we did not learn this pair then break
            if (i1, i2) not in self._merges:
                break
            idx = self._merges[(i1, i2)]
            ids = self._merge_consecutive_pair(ids, (i1, i2), idx)
        return ids

    def decode(self, ids: List[int]) -> str:
        """Decode token ids back into a string; invalid UTF-8 is replaced."""
        # `token_id` avoids shadowing the `id` builtin.
        text_bytes = b"".join(self._vocab[token_id] for token_id in ids)
        return text_bytes.decode("utf-8", errors="replace")
Lines changed: 190 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,190 @@
1+
import pytest
2+
3+
from algos.strings.basic_tokenizer import BasicTokenizer
4+
5+
# Test data - various strings to test tokenization
test_strings = [
    "",  # empty string
    "?",  # single character
    "hello world!!!? (안녕하세요!) lol123 😉",  # Unicode and emoji
    "aaabdaaabac",  # Wikipedia example
    "The quick brown fox jumps over the lazy dog",  # Common phrase
    "1234567890",  # Numbers only
    "!!!@@@###",  # Special characters only
]


@pytest.mark.parametrize("text", test_strings)
def test_encode_decode_identity(text):
    """Test that encoding and decoding preserves the original text"""
    bpe = BasicTokenizer()

    # Train with a reasonable vocabulary size, then verify the
    # encode/decode roundtrip restores the input exactly.
    bpe.train(text, vocab_size=512)
    assert text == bpe.decode(bpe.encode(text)), f"Roundtrip failed for: {text}"
37+
38+
39+
def test_wikipedia_example():
    """Test the Wikipedia BPE example from the documentation"""
    text = "aaabdaaabac"
    bpe = BasicTokenizer()

    # Train with exactly 3 merges (256 base + 3 new tokens).
    bpe.train(text, vocab_size=256 + 3)

    # Encode should produce the expected token sequence.
    expected = [258, 100, 258, 97, 99]  # Based on Wikipedia example
    encoded = bpe.encode(text)
    assert encoded == expected, f"Expected {expected}, got {encoded}"

    # Decode should return original text.
    assert bpe.decode(encoded) == text
56+
57+
58+
def test_basic_functionality():
    """Test basic tokenizer functionality"""
    text = "hello world"
    bpe = BasicTokenizer()
    bpe.train(text, vocab_size=300)

    # Encoding yields a list of integer token ids.
    token_ids = bpe.encode(text)
    assert isinstance(token_ids, list)
    assert all(isinstance(token, int) for token in token_ids)

    # Decoding restores the original text.
    assert bpe.decode(token_ids) == text


def test_empty_string():
    """Test handling of empty string"""
    bpe = BasicTokenizer()
    bpe.train("hello world", vocab_size=300)

    # An empty string roundtrips to an empty string.
    assert bpe.decode(bpe.encode("")) == ""


def test_single_character():
    """Test handling of single character"""
    bpe = BasicTokenizer()
    bpe.train("hello world", vocab_size=300)

    # A single character roundtrips unchanged.
    text = "a"
    assert bpe.decode(bpe.encode(text)) == text
101+
102+
103+
def test_unicode_handling():
    """Test proper handling of Unicode characters"""
    text = "안녕하세요 😊"
    bpe = BasicTokenizer()
    bpe.train(text, vocab_size=300)

    # Multi-byte UTF-8 sequences must survive the roundtrip intact.
    assert bpe.decode(bpe.encode(text)) == text


def test_vocabulary_size_validation():
    """Test that vocabulary size validation works"""
    bpe = BasicTokenizer()

    # Should raise error for vocab size < 256 (the base byte alphabet).
    with pytest.raises(AssertionError):
        bpe.train("test", vocab_size=255)
122+
123+
124+
def test_merge_operations():
    """Test that merge operations work correctly"""
    bpe = BasicTokenizer()

    # Use text with repeated patterns; vocab_size=257 allows only one merge.
    bpe.train("aaaa", vocab_size=257)

    # Should learn to merge "aa" pairs.
    assert len(bpe._merges) == 1
    assert (97, 97) in bpe._merges  # 'a' = 97 in ASCII


def test_token_ids_in_range():
    """Test that all token IDs are valid"""
    text = "hello world"
    bpe = BasicTokenizer()
    bpe.train(text, vocab_size=300)

    # All token IDs should be in the vocabulary.
    assert all(token_id in bpe._vocab for token_id in bpe.encode(text))


def test_consecutive_pair_stats():
    """Test the _get_consecutive_pair_stats helper function"""
    bpe = BasicTokenizer()

    stats = bpe._get_consecutive_pair_stats([1, 2, 3, 1, 2])
    assert stats == {(1, 2): 2, (2, 3): 1, (3, 1): 1}


def test_merge_consecutive_pair():
    """Test the _merge_consecutive_pair helper function"""
    bpe = BasicTokenizer()

    merged = bpe._merge_consecutive_pair([1, 2, 3, 1, 2], (1, 2), 4)
    assert merged == [4, 3, 4]
174+
175+
176+
def test_different_vocab_sizes():
    """Test tokenizer with different vocabulary sizes"""
    text = "the quick brown fox jumps over the lazy dog " * 10  # Repeat text

    for vocab_size in (256, 300, 400, 512):
        bpe = BasicTokenizer()
        bpe.train(text, vocab_size=vocab_size)

        assert bpe.decode(bpe.encode(text)) == text
        # Vocabulary size should be <= requested size (may stop early),
        # and always includes the 256 base bytes.
        assert 256 <= len(bpe._vocab) <= vocab_size

visualizations/index.html

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -502,6 +502,42 @@ <h3>Group Anagrams</h3>
502502
</svg>
503503
</div>
504504
</a>
505+
<a href="strings/basic-tokenizer/index.html" class="algo-card">
506+
<div class="algo-content">
507+
<h3>BPE Tokenizer</h3>
508+
<p>Byte Pair Encoding: iteratively merge frequent byte pairs into tokens.</p>
509+
<span class="algo-tag">Tokenization</span>
510+
</div>
511+
<div class="algo-svg">
512+
<svg viewBox="0 0 80 80" fill="none" xmlns="http://www.w3.org/2000/svg">
513+
<!-- Raw bytes on top -->
514+
<rect x="3" y="8" width="10" height="12" rx="2" fill="#2a2a4a" stroke="#666" stroke-width="1"/>
515+
<text x="8" y="17" fill="#888" font-size="6" text-anchor="middle" font-family="monospace">t</text>
516+
<rect x="15" y="8" width="10" height="12" rx="2" fill="#2a2a4a" stroke="#666" stroke-width="1"/>
517+
<text x="20" y="17" fill="#888" font-size="6" text-anchor="middle" font-family="monospace">h</text>
518+
<rect x="27" y="8" width="10" height="12" rx="2" fill="#2a2a4a" stroke="#666" stroke-width="1"/>
519+
<text x="32" y="17" fill="#888" font-size="6" text-anchor="middle" font-family="monospace">e</text>
520+
<rect x="39" y="8" width="10" height="12" rx="2" fill="#2a2a4a" stroke="#666" stroke-width="1"/>
521+
<text x="44" y="17" fill="#888" font-size="5" text-anchor="middle" font-family="monospace">_</text>
522+
<rect x="51" y="8" width="10" height="12" rx="2" fill="#2a2a4a" stroke="#666" stroke-width="1"/>
523+
<text x="56" y="17" fill="#888" font-size="6" text-anchor="middle" font-family="monospace">c</text>
524+
<rect x="63" y="8" width="10" height="12" rx="2" fill="#2a2a4a" stroke="#666" stroke-width="1"/>
525+
<text x="68" y="17" fill="#888" font-size="6" text-anchor="middle" font-family="monospace">a</text>
526+
<!-- Arrow down -->
527+
<path d="M40 24 L40 34" stroke="#00d9ff" stroke-width="1.5"/>
528+
<path d="M37 31 L40 36 L43 31" stroke="#00d9ff" stroke-width="1.5" fill="none"/>
529+
<!-- Merged tokens on bottom -->
530+
<rect x="5" y="42" width="20" height="14" rx="3" fill="none" stroke="#4ecdc4" stroke-width="1.5"/>
531+
<text x="15" y="52" fill="#4ecdc4" font-size="7" text-anchor="middle" font-family="monospace">the</text>
532+
<rect x="29" y="42" width="12" height="14" rx="3" fill="none" stroke="#ff79c6" stroke-width="1.5"/>
533+
<text x="35" y="52" fill="#ff79c6" font-size="6" text-anchor="middle" font-family="monospace">_</text>
534+
<rect x="45" y="42" width="16" height="14" rx="3" fill="none" stroke="#ffd700" stroke-width="1.5"/>
535+
<text x="53" y="52" fill="#ffd700" font-size="7" text-anchor="middle" font-family="monospace">ca</text>
536+
<!-- Compression label -->
537+
<text x="40" y="72" fill="#00ff88" font-size="7" text-anchor="middle" font-family="monospace">6 → 3 tokens</text>
538+
</svg>
539+
</div>
540+
</a>
505541
</div>
506542
</div>
507543

@@ -596,6 +632,33 @@ <h3>Binary Matrix Shortest Path</h3>
596632
</svg>
597633
</div>
598634
</a>
635+
<a href="matrix/word-search/" class="algo-card">
636+
<div class="algo-content">
637+
<h3>Word Search</h3>
638+
<p>Find if a word exists in a 2D grid using DFS with backtracking.</p>
639+
<span class="algo-tag">DFS / Backtracking</span>
640+
</div>
641+
<div class="algo-svg">
642+
<svg viewBox="0 0 80 80" fill="none" xmlns="http://www.w3.org/2000/svg">
643+
<!-- 3x3 grid of letter cells -->
644+
<rect x="8" y="12" width="16" height="16" rx="2" fill="#4ecdc4" class="animate-on-hover"/>
645+
<text x="16" y="24" fill="#1a1a2e" font-size="10" font-weight="bold" text-anchor="middle">A</text>
646+
<rect x="26" y="12" width="16" height="16" rx="2" fill="#4ecdc4" class="animate-on-hover"/>
647+
<text x="34" y="24" fill="#1a1a2e" font-size="10" font-weight="bold" text-anchor="middle">B</text>
648+
<rect x="44" y="12" width="16" height="16" rx="2" fill="#3a3a5a"/>
649+
<text x="52" y="24" fill="#888" font-size="10" font-weight="bold" text-anchor="middle">C</text>
650+
<rect x="8" y="30" width="16" height="16" rx="2" fill="#3a3a5a"/>
651+
<text x="16" y="42" fill="#888" font-size="10" font-weight="bold" text-anchor="middle">S</text>
652+
<rect x="26" y="30" width="16" height="16" rx="2" fill="#4ecdc4" class="animate-on-hover"/>
653+
<text x="34" y="42" fill="#1a1a2e" font-size="10" font-weight="bold" text-anchor="middle">F</text>
654+
<rect x="44" y="30" width="16" height="16" rx="2" fill="#3a3a5a"/>
655+
<text x="52" y="42" fill="#888" font-size="10" font-weight="bold" text-anchor="middle">E</text>
656+
<!-- Search icon -->
657+
<circle cx="62" cy="60" r="8" stroke="#00d9ff" stroke-width="2" fill="none"/>
658+
<line x1="68" y1="66" x2="73" y2="71" stroke="#00d9ff" stroke-width="2"/>
659+
</svg>
660+
</div>
661+
</a>
599662
</div>
600663
</div>
601664

0 commit comments

Comments
 (0)