Skip to content

Commit 1a6b276

Browse files
committed
feat: add matrix word search and basic BPE algorithm
Signed-off-by: Ayush Joshi <ayush854032@gmail.com>
1 parent b320274 commit 1a6b276

10 files changed

Lines changed: 2460 additions & 0 deletions

File tree

algos/matrix/word_search.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
"""Given an `m x n` grid of characters board and a string word, return true if word
2+
exists in the grid.
3+
4+
The word can be constructed from letters of sequentially adjacent cells, where adjacent
5+
cells are horizontally or vertically neighboring. The same letter cell may not be used
6+
more than once.
7+
"""
8+
9+
from typing import List
10+
11+
12+
class Solution:
    """DFS-with-backtracking solver for the "Word Search" grid problem."""

    def _is_in_bounds(self, i: int, j: int, rows: int, cols: int) -> bool:
        """Return True if cell (i, j) lies inside a rows x cols grid."""
        return 0 <= i < rows and 0 <= j < cols

    def exist(self, board: List[List[str]], word: str) -> bool:
        """Return True if `word` can be traced through adjacent cells of `board`.

        Args:
            board: m x n grid of single-character strings. Temporarily mutated
                during the search (visited cells are marked "#") but fully
                restored before returning.
            word: Word to search for. An empty word is trivially present.

        Returns:
            True if the word exists in the grid, else False.
        """
        # Edge cases: an empty word is vacuously found; an empty board holds
        # nothing (this also guards the board[0] access below).
        if not word:
            return True
        if not board or not board[0]:
            return False

        rows, cols = len(board), len(board[0])

        def dfs(i: int, j: int, k: int) -> bool:
            # Prune: out of bounds, or cell does not match word[k]
            # (visited cells hold "#" so they can never match).
            if not self._is_in_bounds(i, j, rows, cols) or board[i][j] != word[k]:
                return False

            if k == len(word) - 1:
                return True

            # Mark the cell visited so it cannot be reused on this path.
            temp = board[i][j]
            board[i][j] = "#"

            found = (
                dfs(i - 1, j, k + 1)
                or dfs(i, j + 1, k + 1)
                or dfs(i + 1, j, k + 1)
                or dfs(i, j - 1, k + 1)
            )

            # Backtrack: restore the cell for other candidate paths.
            board[i][j] = temp
            return found

        for i in range(rows):
            for j in range(cols):
                # Only start a DFS from cells matching the first letter.
                if board[i][j] == word[0] and dfs(i, j, 0):
                    return True

        return False

algos/strings/basic_tokenizer.py

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
from typing import List, Dict, Optional, Tuple
2+
3+
4+
class BasicTokenizer:
    """Minimal byte-level BPE (Byte Pair Encoding) tokenizer.

    Starts from the 256 raw byte values and iteratively merges the most
    frequent pair of adjacent tokens into a new token until the requested
    vocabulary size is reached.
    """

    def __init__(self) -> None:
        # (int, int) pair -> merged token id, in the order merges were learned.
        self._merges: Dict[Tuple[int, int], int] = {}
        # token id -> raw bytes the token expands to.
        self._vocab: Dict[int, bytes] = {}

    def _get_consecutive_pair_stats(
        self, ids: List[int], counts: Optional[Dict] = None
    ) -> Dict:
        """Given a list of integers, return a dictionary of counts of
        consecutive pairs.

        Optionally allows to update an existing dictionary of counts.

        Example:
            >>> [1, 2, 3, 1, 2] -> {(1, 2): 2, (2, 3): 1, (3, 1): 1}
        """
        # Compare with `is None` (not truthiness) so that a caller-supplied
        # empty dict is updated in place, as the docstring promises.
        if counts is None:
            counts = {}
        # iterate consecutive elements
        for i1, i2 in zip(ids, ids[1:]):
            counts[(i1, i2)] = counts.get((i1, i2), 0) + 1
        return counts

    def _merge_consecutive_pair(
        self, ids: List[int], pair: Tuple[int, int], idx: int
    ) -> List[int]:
        """In the list of integers (ids), replace all consecutive occurrences
        of pair with the new integer token idx.

        Example:
            >>> ids=[1, 2, 3, 1, 2], pair=(1, 2), idx=4 -> [4, 3, 4]
        """
        out = []
        i = 0
        while i < len(ids):
            # if not at the very last position AND the pair matches, replace it
            # (bounds check first, so ids[i + 1] is always safe)
            if i < len(ids) - 1 and ids[i] == pair[0] and ids[i + 1] == pair[1]:
                out.append(idx)
                i += 2
            else:
                out.append(ids[i])
                i += 1
        return out

    def train(self, text: str, vocab_size: int = 256) -> None:
        """Learn up to ``vocab_size - 256`` merges from ``text``.

        Stops early once no adjacent pair remains. Retraining fully resets
        previously learned state.

        Raises:
            AssertionError: if ``vocab_size`` is below the 256 base bytes.
        """
        # AssertionError is part of the public contract (tests rely on it).
        assert vocab_size >= 256
        n_merges = vocab_size - 256

        ids = list(text.encode("utf-8"))
        # Reset BOTH tables so repeated train() calls cannot leak stale
        # merges whose token ids are absent from the rebuilt vocab.
        self._merges = {}
        self._vocab = {idx: bytes([idx]) for idx in range(256)}
        # iteratively merge the most common pairs to create new
        # tokens; (int, int) -> int
        for i in range(n_merges):
            consecutive_pair_stats = self._get_consecutive_pair_stats(ids)
            if not consecutive_pair_stats:
                break
            (i1, i2) = max(consecutive_pair_stats, key=consecutive_pair_stats.get)
            # mint a new token: assign it the next available id
            idx = 256 + i
            ids = self._merge_consecutive_pair(ids, (i1, i2), idx)
            self._merges[(i1, i2)] = idx
            self._vocab[idx] = self._vocab[i1] + self._vocab[i2]

    def encode(self, text: str) -> List[int]:
        """Encode ``text`` into token ids, applying learned merges in the
        order they were learned (lowest merge rank first)."""
        ids = list(text.encode("utf-8"))
        while len(ids) >= 2:
            consecutive_pair_stats = self._get_consecutive_pair_stats(ids)
            # Pick the present pair that was merged earliest during training.
            (i1, i2) = min(
                consecutive_pair_stats, key=lambda p: self._merges.get(p, float("inf"))
            )
            # if we did not learn this pair then break
            if (i1, i2) not in self._merges:
                break
            idx = self._merges[(i1, i2)]
            ids = self._merge_consecutive_pair(ids, (i1, i2), idx)
        return ids

    def decode(self, ids: List[int]) -> str:
        """Decode token ids back into a string; invalid UTF-8 is replaced."""
        # `token_id` avoids shadowing the `id` builtin.
        text_bytes = b"".join(self._vocab[token_id] for token_id in ids)
        return text_bytes.decode("utf-8", errors="replace")
Lines changed: 190 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,190 @@
1+
import pytest
2+
3+
from algos.strings.basic_tokenizer import BasicTokenizer
4+
5+
# Test data - various strings to test tokenization
test_strings = [
    "",  # empty string
    "?",  # single character
    "hello world!!!? (안녕하세요!) lol123 😉",  # Unicode and emoji
    "aaabdaaabac",  # Wikipedia example
    "The quick brown fox jumps over the lazy dog",  # Common phrase
    "1234567890",  # Numbers only
    "!!!@@@###",  # Special characters only
]


@pytest.mark.parametrize("text", test_strings)
def test_encode_decode_identity(text):
    """Test that encoding and decoding preserves the original text"""
    bpe = BasicTokenizer()

    # Train with a reasonable vocabulary size, then verify the
    # encode/decode roundtrip restores the input exactly.
    bpe.train(text, vocab_size=512)
    assert text == bpe.decode(bpe.encode(text)), f"Roundtrip failed for: {text}"
37+
38+
39+
def test_wikipedia_example():
    """Test the Wikipedia BPE example from the documentation"""
    text = "aaabdaaabac"
    bpe = BasicTokenizer()

    # Train with exactly 3 merges (256 base + 3 new tokens).
    bpe.train(text, vocab_size=256 + 3)

    # Encode should produce the expected token sequence.
    expected = [258, 100, 258, 97, 99]  # Based on Wikipedia example
    encoded = bpe.encode(text)
    assert encoded == expected, f"Expected {expected}, got {encoded}"

    # Decode should return original text.
    assert bpe.decode(encoded) == text
56+
57+
58+
def test_basic_functionality():
    """Test basic tokenizer functionality"""
    text = "hello world"
    bpe = BasicTokenizer()
    bpe.train(text, vocab_size=300)

    # Encoding yields a list of integer token ids.
    token_ids = bpe.encode(text)
    assert isinstance(token_ids, list)
    assert all(isinstance(token, int) for token in token_ids)

    # Decoding restores the original text.
    assert bpe.decode(token_ids) == text


def test_empty_string():
    """Test handling of empty string"""
    bpe = BasicTokenizer()
    bpe.train("hello world", vocab_size=300)

    # An empty string roundtrips to an empty string.
    assert bpe.decode(bpe.encode("")) == ""


def test_single_character():
    """Test handling of single character"""
    bpe = BasicTokenizer()
    bpe.train("hello world", vocab_size=300)

    # A single character roundtrips unchanged.
    text = "a"
    assert bpe.decode(bpe.encode(text)) == text
101+
102+
103+
def test_unicode_handling():
    """Test proper handling of Unicode characters"""
    text = "안녕하세요 😊"
    bpe = BasicTokenizer()
    bpe.train(text, vocab_size=300)

    # Multi-byte UTF-8 sequences must survive the roundtrip intact.
    assert bpe.decode(bpe.encode(text)) == text


def test_vocabulary_size_validation():
    """Test that vocabulary size validation works"""
    bpe = BasicTokenizer()

    # Should raise error for vocab size < 256 (the base byte alphabet).
    with pytest.raises(AssertionError):
        bpe.train("test", vocab_size=255)
122+
123+
124+
def test_merge_operations():
    """Test that merge operations work correctly"""
    bpe = BasicTokenizer()

    # Use text with repeated patterns; vocab_size=257 allows only one merge.
    bpe.train("aaaa", vocab_size=257)

    # Should learn to merge "aa" pairs.
    assert len(bpe._merges) == 1
    assert (97, 97) in bpe._merges  # 'a' = 97 in ASCII


def test_token_ids_in_range():
    """Test that all token IDs are valid"""
    text = "hello world"
    bpe = BasicTokenizer()
    bpe.train(text, vocab_size=300)

    # All token IDs should be in the vocabulary.
    assert all(token_id in bpe._vocab for token_id in bpe.encode(text))


def test_consecutive_pair_stats():
    """Test the _get_consecutive_pair_stats helper function"""
    bpe = BasicTokenizer()

    stats = bpe._get_consecutive_pair_stats([1, 2, 3, 1, 2])
    assert stats == {(1, 2): 2, (2, 3): 1, (3, 1): 1}


def test_merge_consecutive_pair():
    """Test the _merge_consecutive_pair helper function"""
    bpe = BasicTokenizer()

    merged = bpe._merge_consecutive_pair([1, 2, 3, 1, 2], (1, 2), 4)
    assert merged == [4, 3, 4]
174+
175+
176+
def test_different_vocab_sizes():
    """Test tokenizer with different vocabulary sizes"""
    text = "the quick brown fox jumps over the lazy dog " * 10  # Repeat text

    for vocab_size in (256, 300, 400, 512):
        bpe = BasicTokenizer()
        bpe.train(text, vocab_size=vocab_size)

        assert bpe.decode(bpe.encode(text)) == text
        # Vocabulary size should be <= requested size (may stop early),
        # and always includes the 256 base bytes.
        assert 256 <= len(bpe._vocab) <= vocab_size

visualizations/index.html

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -502,6 +502,42 @@ <h3>Group Anagrams</h3>
502502
</svg>
503503
</div>
504504
</a>
505+
<a href="strings/basic-tokenizer/index.html" class="algo-card">
506+
<div class="algo-content">
507+
<h3>BPE Tokenizer</h3>
508+
<p>Byte Pair Encoding: iteratively merge frequent byte pairs into tokens.</p>
509+
<span class="algo-tag">Tokenization</span>
510+
</div>
511+
<div class="algo-svg">
512+
<svg viewBox="0 0 80 80" fill="none" xmlns="http://www.w3.org/2000/svg">
513+
<!-- Raw bytes on top -->
514+
<rect x="3" y="8" width="10" height="12" rx="2" fill="#2a2a4a" stroke="#666" stroke-width="1"/>
515+
<text x="8" y="17" fill="#888" font-size="6" text-anchor="middle" font-family="monospace">t</text>
516+
<rect x="15" y="8" width="10" height="12" rx="2" fill="#2a2a4a" stroke="#666" stroke-width="1"/>
517+
<text x="20" y="17" fill="#888" font-size="6" text-anchor="middle" font-family="monospace">h</text>
518+
<rect x="27" y="8" width="10" height="12" rx="2" fill="#2a2a4a" stroke="#666" stroke-width="1"/>
519+
<text x="32" y="17" fill="#888" font-size="6" text-anchor="middle" font-family="monospace">e</text>
520+
<rect x="39" y="8" width="10" height="12" rx="2" fill="#2a2a4a" stroke="#666" stroke-width="1"/>
521+
<text x="44" y="17" fill="#888" font-size="5" text-anchor="middle" font-family="monospace">_</text>
522+
<rect x="51" y="8" width="10" height="12" rx="2" fill="#2a2a4a" stroke="#666" stroke-width="1"/>
523+
<text x="56" y="17" fill="#888" font-size="6" text-anchor="middle" font-family="monospace">c</text>
524+
<rect x="63" y="8" width="10" height="12" rx="2" fill="#2a2a4a" stroke="#666" stroke-width="1"/>
525+
<text x="68" y="17" fill="#888" font-size="6" text-anchor="middle" font-family="monospace">a</text>
526+
<!-- Arrow down -->
527+
<path d="M40 24 L40 34" stroke="#00d9ff" stroke-width="1.5"/>
528+
<path d="M37 31 L40 36 L43 31" stroke="#00d9ff" stroke-width="1.5" fill="none"/>
529+
<!-- Merged tokens on bottom -->
530+
<rect x="5" y="42" width="20" height="14" rx="3" fill="none" stroke="#4ecdc4" stroke-width="1.5"/>
531+
<text x="15" y="52" fill="#4ecdc4" font-size="7" text-anchor="middle" font-family="monospace">the</text>
532+
<rect x="29" y="42" width="12" height="14" rx="3" fill="none" stroke="#ff79c6" stroke-width="1.5"/>
533+
<text x="35" y="52" fill="#ff79c6" font-size="6" text-anchor="middle" font-family="monospace">_</text>
534+
<rect x="45" y="42" width="16" height="14" rx="3" fill="none" stroke="#ffd700" stroke-width="1.5"/>
535+
<text x="53" y="52" fill="#ffd700" font-size="7" text-anchor="middle" font-family="monospace">ca</text>
536+
<!-- Compression label -->
537+
<text x="40" y="72" fill="#00ff88" font-size="7" text-anchor="middle" font-family="monospace">6 → 3 tokens</text>
538+
</svg>
539+
</div>
540+
</a>
505541
</div>
506542
</div>
507543

@@ -596,6 +632,33 @@ <h3>Binary Matrix Shortest Path</h3>
596632
</svg>
597633
</div>
598634
</a>
635+
<a href="matrix/word-search/" class="algo-card">
636+
<div class="algo-content">
637+
<h3>Word Search</h3>
638+
<p>Find if a word exists in a 2D grid using DFS with backtracking.</p>
639+
<span class="algo-tag">DFS / Backtracking</span>
640+
</div>
641+
<div class="algo-svg">
642+
<svg viewBox="0 0 80 80" fill="none" xmlns="http://www.w3.org/2000/svg">
643+
<!-- 3x3 grid of letter cells -->
644+
<rect x="8" y="12" width="16" height="16" rx="2" fill="#4ecdc4" class="animate-on-hover"/>
645+
<text x="16" y="24" fill="#1a1a2e" font-size="10" font-weight="bold" text-anchor="middle">A</text>
646+
<rect x="26" y="12" width="16" height="16" rx="2" fill="#4ecdc4" class="animate-on-hover"/>
647+
<text x="34" y="24" fill="#1a1a2e" font-size="10" font-weight="bold" text-anchor="middle">B</text>
648+
<rect x="44" y="12" width="16" height="16" rx="2" fill="#3a3a5a"/>
649+
<text x="52" y="24" fill="#888" font-size="10" font-weight="bold" text-anchor="middle">C</text>
650+
<rect x="8" y="30" width="16" height="16" rx="2" fill="#3a3a5a"/>
651+
<text x="16" y="42" fill="#888" font-size="10" font-weight="bold" text-anchor="middle">S</text>
652+
<rect x="26" y="30" width="16" height="16" rx="2" fill="#4ecdc4" class="animate-on-hover"/>
653+
<text x="34" y="42" fill="#1a1a2e" font-size="10" font-weight="bold" text-anchor="middle">F</text>
654+
<rect x="44" y="30" width="16" height="16" rx="2" fill="#3a3a5a"/>
655+
<text x="52" y="42" fill="#888" font-size="10" font-weight="bold" text-anchor="middle">E</text>
656+
<!-- Search icon -->
657+
<circle cx="62" cy="60" r="8" stroke="#00d9ff" stroke-width="2" fill="none"/>
658+
<line x1="68" y1="66" x2="73" y2="71" stroke="#00d9ff" stroke-width="2"/>
659+
</svg>
660+
</div>
661+
</a>
599662
</div>
600663
</div>
601664

0 commit comments

Comments
 (0)