🎯 Why String Algorithms Matter
String algorithms are essential for:
- 🔍 Search Engines: Finding patterns in billions of documents
- 🧬 Bioinformatics: DNA sequence analysis and matching
- 📝 Text Editors: Find/replace, syntax highlighting
- 🔐 Security: Pattern detection in network traffic
💡 Real-World Example: When you use Ctrl+F in your browser, it uses string matching algorithms to find text instantly, even in huge documents!
🌟 KMP (Knuth-Morris-Pratt) Algorithm
How KMP Works
KMP avoids re-scanning characters by building a failure function (LPS array).
Pattern: "ABABCABAB"
A
B
A
B
C
A
B
A
B
0
0
1
2
0
1
2
3
4
LPS array values shown below
Implementation
def compute_lps(pattern):
"""Build the LPS (Longest Proper Prefix which is also Suffix) array"""
m = len(pattern)
lps = [0] * m
length = 0
i = 1
while i < m:
if pattern[i] == pattern[length]:
length += 1
lps[i] = length
i += 1
else:
if length != 0:
length = lps[length - 1]
else:
lps[i] = 0
i += 1
return lps
def kmp_search(text, pattern):
"""KMP pattern searching algorithm"""
n = len(text)
m = len(pattern)
if m == 0:
return []
lps = compute_lps(pattern)
matches = []
i = 0 # Index for text
j = 0 # Index for pattern
while i < n:
if text[i] == pattern[j]:
i += 1
j += 1
if j == m:
matches.append(i - j)
j = lps[j - 1]
elif i < n and text[i] != pattern[j]:
if j != 0:
j = lps[j - 1]
else:
i += 1
return matches
# Example
text = "ABABDABACDABABCABAB"
pattern = "ABABCABAB"
print(f"Pattern found at: {kmp_search(text, pattern)}")
💡 Time Complexity: O(n + m) where n is text length and m is pattern length. Much better than naive O(n×m)!
🎯 Rabin-Karp Algorithm
Rolling Hash Concept
Uses hashing to find patterns efficiently, with rolling hash for O(1) updates.
def rabin_karp(text, pattern):
"""Rabin-Karp pattern matching using rolling hash"""
n = len(text)
m = len(pattern)
base = 256 # Number of characters
prime = 101 # A prime number
matches = []
if m > n:
return matches
# Calculate hash for pattern and first window
pattern_hash = 0
text_hash = 0
h = 1
# h = base^(m-1) % prime
for i in range(m - 1):
h = (h * base) % prime
# Calculate initial hashes
for i in range(m):
pattern_hash = (base * pattern_hash + ord(pattern[i])) % prime
text_hash = (base * text_hash + ord(text[i])) % prime
# Slide the window
for i in range(n - m + 1):
# Check if hashes match
if pattern_hash == text_hash:
# Verify character by character
if text[i:i + m] == pattern:
matches.append(i)
# Calculate hash for next window (rolling hash)
if i < n - m:
text_hash = (base * (text_hash - ord(text[i]) * h) +
ord(text[i + m])) % prime
# Handle negative values
if text_hash < 0:
text_hash += prime
return matches
# Example: Finding all occurrences
text = "abracadabra"
pattern = "abra"
print(f"Pattern found at: {rabin_karp(text, pattern)}")
# Output: Pattern found at: [0, 7]
Applications
- 🔍 Multiple pattern search
- 🔐 Plagiarism detection
- 🧬 DNA sequence matching
- 📝 Finding duplicate documents
💫 Suffix Arrays
What are Suffix Arrays?
A sorted array of all suffixes of a string, enabling fast pattern search and string operations.
def build_suffix_array(s):
"""Build suffix array for string s"""
n = len(s)
suffixes = []
# Create all suffixes with their indices
for i in range(n):
suffixes.append((s[i:], i))
# Sort suffixes
suffixes.sort()
# Extract indices
suffix_array = [suffix[1] for suffix in suffixes]
return suffix_array
def search_pattern(text, pattern, suffix_array):
"""Binary search for pattern using suffix array"""
n = len(text)
m = len(pattern)
left, right = 0, n - 1
# Binary search for leftmost occurrence
while left < right:
mid = (left + right) // 2
suffix = text[suffix_array[mid]:]
if suffix < pattern:
left = mid + 1
else:
right = mid
# Check if pattern found
matches = []
while left < n:
suffix = text[suffix_array[left]:]
if suffix.startswith(pattern):
matches.append(suffix_array[left])
left += 1
else:
break
return sorted(matches)
# Example
text = "banana"
sa = build_suffix_array(text)
print("Suffix Array:", sa)
print("Suffixes in order:")
for idx in sa:
print(f" {idx}: {text[idx:]}")
# Search for pattern
pattern = "ana"
matches = search_pattern(text, pattern, sa)
print(f"\nPattern '{pattern}' found at: {matches}")
Operation | Time Complexity | Space |
---|---|---|
Build Suffix Array | O(n log n) | O(n) |
Pattern Search | O(m log n) | O(1) |
Longest Common Substring | O(n) | O(n) |
🚀 Advanced String Topics
Z-Algorithm
def z_algorithm(s):
"""Compute Z-array where Z[i] is length of longest substring
starting from i which is also a prefix of s"""
n = len(s)
z = [0] * n
z[0] = n
l, r = 0, 0
for i in range(1, n):
if i > r:
l, r = i, i
while r < n and s[r - l] == s[r]:
r += 1
z[i] = r - l
r -= 1
else:
k = i - l
if z[k] < r - i + 1:
z[i] = z[k]
else:
l = i
while r < n and s[r - l] == s[r]:
r += 1
z[i] = r - l
r -= 1
return z
# Use Z-algorithm for pattern matching
def z_pattern_search(text, pattern):
combined = pattern + "$" + text
z = z_algorithm(combined)
matches = []
pattern_len = len(pattern)
for i in range(pattern_len + 1, len(combined)):
if z[i] == pattern_len:
matches.append(i - pattern_len - 1)
return matches
Manacher's Algorithm (Palindromes)
def longest_palindrome(s):
"""Find longest palindromic substring using Manacher's algorithm"""
# Transform string to handle even/odd cases
t = '#'.join('^{}$'.format(s))
n = len(t)
p = [0] * n
center = right = 0
for i in range(1, n - 1):
mirror = 2 * center - i
if i < right:
p[i] = min(right - i, p[mirror])
# Try to expand palindrome
while t[i + p[i] + 1] == t[i - p[i] - 1]:
p[i] += 1
# Update center and right
if i + p[i] > right:
center, right = i, i + p[i]
# Find longest palindrome
max_len = max(p)
center_idx = p.index(max_len)
start = (center_idx - max_len) // 2
return s[start:start + max_len]
# Example
text = "babad"
print(f"Longest palindrome: {longest_palindrome(text)}")
💪 Practice Problems
Problem 1: Find All Anagrams
Medium
def find_anagrams(s, p):
"""Find all anagrams of p in s"""
from collections import Counter
if len(p) > len(s):
return []
result = []
p_count = Counter(p)
window = Counter(s[:len(p)])
if window == p_count:
result.append(0)
for i in range(len(p), len(s)):
# Add new character
window[s[i]] = window.get(s[i], 0) + 1
# Remove old character
old_char = s[i - len(p)]
window[old_char] -= 1
if window[old_char] == 0:
del window[old_char]
# Check if anagram
if window == p_count:
result.append(i - len(p) + 1)
return result
# Test
s = "cbaebabacd"
p = "abc"
print(f"Anagrams at indices: {find_anagrams(s, p)}")
# Output: [0, 6]
Problem 2: Longest Substring Without Repeating
Medium
def longest_substring_no_repeat(s):
"""Find length of longest substring without repeating characters"""
char_index = {}
max_length = 0
start = 0
for i, char in enumerate(s):
if char in char_index and char_index[char] >= start:
start = char_index[char] + 1
char_index[char] = i
max_length = max(max_length, i - start + 1)
return max_length
# Test
print(longest_substring_no_repeat("abcabcbb")) # 3 ("abc")
print(longest_substring_no_repeat("bbbbb")) # 1 ("b")
print(longest_substring_no_repeat("pwwkew")) # 3 ("wke")
💡 Interview Tip: String problems often combine with sliding window, hash tables, or dynamic programming. Practice recognizing these patterns!