Merge pull request nryoung#22 from jabagawee/bmh_search

nryoung · nryoung · commit 96a5b1375bec · 2012-10-03T18:17:46.000-07:00
BMH  search
diff --git a/algorithms/searching/bmh_search.py b/algorithms/searching/bmh_search.py
@@ -8,11 +8,27 @@
     Uses a bad-character shift of the rightmost character of the window to
     compute shifts.
 
+    The trick to this algorithm is `bmbc`, a lookup table with a default
+    value equal to the length of the pattern to be searched, so that
+    the algorithm can skip `len(pattern)` indices through the string
+    for efficiency's sake. For example, if we're searching through the
+    string "cotton milled paper" for the pattern "grumble", we look at
+    the last character of the first window "cotton ", a space, and notice
+    that it occurs nowhere in the pattern. Thus, we can afford to jump
+    our search index forward a whole seven characters.
+
+    However, not all the entries in `bmbc` are equal to `len(pattern)`.
+    If we searched the string "adventure time" for "grumble", the first
+    window "adventu" ends in "u", which also occurs inside the pattern.
+    A match could line those two up, so we can only jump forward four
+    characters safely, which is why `bmbc` contains values that are not
+    simply `len(pattern)`.
+
     Pre: a string > substring.
 
-    Post: returns a list of indexes where the substring was found.
+    Post: returns a list of indices where the substring was found.
 
-    Time: Complexity: O( m + n), where m is the substring to be found.
+    Time: Complexity: O(m + n), where m is the substring to be found.
 
     Space: Complexity: O(m), where m is the substring to be found.
 
@@ -25,23 +41,28 @@
 
 
 def search(text, pattern):
-    m, n = len(pattern), len(text)
+    pattern_length = len(pattern)
+    text_length = len(text)
     offsets = []
-    if m > n:
+    if pattern_length > text_length:
         return offsets
-    bmbc = [m] * 256
-    for k, p in enumerate(pattern[:-1]):
-        bmbc[ord(p)] = m - k - 1
+    bmbc = [pattern_length] * 256
+    for index, char in enumerate(pattern[:-1]):
+        bmbc[ord(char)] = pattern_length - index - 1
     bmbc = tuple(bmbc)
-    k = m - 1
-    while k < n:
-        j = m - 1
-        i = k
-        while j >= 0 and text[i] == pattern[j]:
-            j -= 1
-            i -= 1
-        if j == -1:
-            offsets.append(i + 1)
-        k += bmbc[ord(text[k])]
+    search_index = pattern_length - 1
+    while search_index < text_length:
+        pattern_index = pattern_length - 1
+        text_index = search_index
+        # Bound the backward scan by the pattern index: once it drops below
+        # zero the whole pattern has matched, and pattern[-1] must not be
+        # compared again (negative indices wrap around in Python).
+        while pattern_index >= 0 and \
+              text[text_index] == pattern[pattern_index]:
+            pattern_index -= 1
+            text_index -= 1
+        if pattern_index == -1:
+            offsets.append(text_index + 1)
+        search_index += bmbc[ord(text[search_index])]
 
     return offsets