,

Contents · String Matching (KMP, Z, Rabin–Karp, Aho–Corasick)


Overview and Use Cases

  • KMP: exact single-pattern search in O(n+m), linear-time.
  • Z-algorithm: builds Z-array to compare prefix against every position, great for pattern+"#"+text tricks.
  • Rabin–Karp: rolling hash for multiple patterns or plagiarism search; expected linear, beware collisions.
  • Aho–Corasick: automaton for many patterns simultaneously, O(n + total matches + automaton size).

KMP: Prefix Function / LPS

// Build longest prefix-suffix array (lps)
function buildLPS(p) {
  const lps = Array(p.length).fill(0);
  for (let i = 1, k = 0; i < p.length; i++) {
    while (k > 0 && p[i] !== p[k]) k = lps[k - 1];
    if (p[i] === p[k]) k++;
    lps[i] = k;
  }
  return lps;
}

// KMP search: return starting indices of matches of p in s
function kmpSearch(s, p) {
  const lps = buildLPS(p), res = [];
  for (let i = 0, k = 0; i < s.length; i++) {
    while (k > 0 && s[i] !== p[k]) k = lps[k - 1];
    if (s[i] === p[k]) k++;
    if (k === p.length) { res.push(i - k + 1); k = lps[k - 1]; }
  }
  return res;
}

Z-Algorithm

function zArray(s) {
  const n = s.length, Z = Array(n).fill(0);
  for (let i = 1, l = 0, r = 0; i < n; i++) {
    if (i <= r) Z[i] = Math.min(r - i + 1, Z[i - l]);
    while (i + Z[i] < n && s[Z[i]] === s[i + Z[i]]) Z[i]++;
    if (i + Z[i] - 1 > r) { l = i; r = i + Z[i] - 1; }
  }
  return Z;
}

// Pattern occurrences via Z on P#T
function findOccurrencesZ(text, pattern) {
  const s = pattern + '#' + text;
  const Z = zArray(s);
  const m = pattern.length;
  const out = [];
  for (let i = m + 1; i < s.length; i++) if (Z[i] >= m) out.push(i - m - 1);
  return out;
}

Rabin–Karp (Rolling Hash)

// Single-pattern RK with mod; adapt base/mod to your alphabet
function rabinKarp(s, p) {
  const n = s.length, m = p.length;
  if (m === 0 || m > n) return [];
  const B = 911382323, M = 1_000_000_007; // example base and mod
  let ph = 0, sh = 0, power = 1;
  for (let i = 0; i < m; i++) {
    ph = (ph * B + p.charCodeAt(i)) % M;
    sh = (sh * B + s.charCodeAt(i)) % M;
    if (i < m - 1) power = (power * B) % M;
  }
  const res = [];
  const eq = (i) => s.slice(i, i + m) === p; // verify to avoid collisions
  for (let i = 0; i + m <= n; i++) {
    if (sh === ph && eq(i)) res.push(i);
    if (i + m < n) {
      sh = ( (sh - s.charCodeAt(i) * power % M + M) % M * B + s.charCodeAt(i + m) ) % M;
    }
  }
  return res;
}

Aho–Corasick (Multiple Patterns)

// Minimal illustrative AC automaton (ASCII)
function AhoCorasick(patterns) {
  const ALPHA = 128;
  const next = [], link = [], out = [];
  next.push(Array(ALPHA).fill(-1)); link.push(0); out.push([]);
  // build trie
  for (let id = 0; id < patterns.length; id++) {
    const p = patterns[id]; let v = 0;
    for (const ch of p) {
      const c = ch.charCodeAt(0);
      if (next[v][c] === -1) { next[v][c] = next.length; next.push(Array(ALPHA).fill(-1)); link.push(0); out.push([]); }
      v = next[v][c];
    }
    out[v].push(id);
  }
  // build suffix links (BFS)
  const q = [];
  for (let c = 0; c < ALPHA; c++) if (next[0][c] !== -1) q.push(next[0][c]); else next[0][c] = 0;
  while (q.length) {
    const v = q.shift();
    for (let c = 0; c < ALPHA; c++) {
      const u = next[v][c];
      if (u !== -1) { link[u] = next[link[v]][c]; out[u] = out[u].concat(out[link[u]]); q.push(u); }
      else next[v][c] = next[link[v]][c];
    }
  }
  return { next, link, out };
}

function findAllAC(text, patterns) {
  const { next, out } = AhoCorasick(patterns);
  let v = 0; const matches = [];
  for (let i = 0; i < text.length; i++) {
    v = next[v][text.charCodeAt(i)];
    for (const id of out[v]) matches.push({ index: i - patterns[id].length + 1, id });
  }
  return matches;
}

Exercises

  1. Implement both KMP and Z and compare performance on random inputs.
  2. Extend Rabin–Karp to support multiple patterns using a set of hashes.
  3. Build Aho–Corasick and report all occurrences of a dictionary in a text.
Pick the right tool: KMP/Z for single pattern, RK for hashing-friendly tasks, AC for many patterns.