twoway/
bmh.rs

1//! Boyer-Moore-Horspool
2//!
3
4extern crate memchr;
5use std::cmp;
6
/// Fill `skip` with the Boyer-Moore-Horspool shift table for `pat`.
///
/// After the call, `skip[b]` holds how far the search window may be advanced
/// when the text byte aligned with the last pattern position is `b`:
///
/// * for a byte that occurs in `pat` before its final position, the distance
///   from its rightmost such occurrence to the end of the pattern;
/// * for every other byte, the full pattern length.
///
/// All values are clamped to `u16::max_value()`; a clamped shift is merely
/// shorter than it could be, never too long.
///
/// An empty `pat` is accepted and yields a table of zeros.
fn bmh_skip(pat: &[u8], skip: &mut [u16; 256]) {
    let max_skip = u16::max_value() as usize;

    // Default shift: bytes absent from the pattern allow a full-length jump.
    let pat_skip = cmp::min(pat.len(), max_skip) as u16;
    for entry in skip.iter_mut() {
        *entry = pat_skip;
    }

    // Only the bytes before the last position refine the table. `split_last`
    // returns `None` for an empty pattern, so no `len() - 1` underflow occurs.
    if let Some((_, head)) = pat.split_last() {
        for (index, &byte) in head.iter().enumerate() {
            // Later occurrences overwrite earlier ones, keeping the rightmost.
            skip[byte as usize] = cmp::min(pat.len() - index - 1, max_skip) as u16;
        }
    }
}
17
18/// Boyer-Moore-Horspool substring search
19pub fn find(text: &[u8], pat: &[u8]) -> Option<usize> {
20    let mut skip = [0; 256];
21    bmh_skip(pat, &mut skip);
22
23    let pat_len = pat.len();
24
25    if pat_len == 0 {
26        return Some(0);
27    }
28
29    let pat_len_m1 = pat_len - 1;
30    let pat_last = pat[pat_len - 1];
31
32    // initial search by memchr
33    let mut j = match memchr::memchr(pat[0], text) {
34        Some(x) => x,
35        None => return None,
36    };
37
38    while let Some(&c) = text.get(j + pat_len_m1) {
39        // check the back character of the pattern
40        if c == pat_last && &text[j..j + pat_len] == pat {
41            return Some(j);
42        }
43        j += skip[c as usize] as usize;
44    }
45    None
46}
47
// Verifies the skip table built for a fixed needle against hand-computed
// shifts, including a byte (`t`) that does not occur in the needle.
#[test]
fn bmh_preprocess() {
    let needle = b"gcagagag";
    let mut table = [0; 256];
    bmh_skip(needle, &mut table);

    let expected = [(b'g', 2), (b'c', 6), (b'a', 1), (b't', 8)];
    for &(byte, shift) in expected.iter() {
        assert_eq!(table[byte as usize], shift);
    }
}
58
// Exercises `find` on a tiny text, on every window of widths 1..=16 of a
// longer text (cross-checked against `str::find`), and on one pattern longer
// than 16 bytes.
#[test]
fn bmh_find() {
    let short = b"abc";
    assert_eq!(find(short, b"d"), None);
    assert_eq!(find(short, b"c"), Some(2));

    let haystack = "longer text and so on";
    let haystack_bytes = haystack.as_bytes();

    // every window of the haystack must be found where `str::find` finds it
    for width in 1..17 {
        for needle in haystack_bytes.windows(width) {
            let needle_str = ::std::str::from_utf8(needle).unwrap();
            let expected = haystack.find(needle_str);
            assert!(expected.is_some());
            assert_eq!(find(haystack_bytes, needle), expected);
        }
    }

    let tail = b"ger text and so on";
    assert!(tail.len() > 16);
    assert_eq!(Some(3), find(haystack_bytes, tail));
}
77    assert!(pat.len() > 16);
78    assert_eq!(Some(3), find(longer.as_bytes(), pat));
79}