KMP searching
Time complexity: O(m+n)
Can use to detect longest parlindrom from the beginning
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
| final int MAX_N = 1_000_001;
char[] T = new char[MAX_N];
char[] P = new char[MAX_N]; // T = text, P = pattern
int[] b = new int[MAX_N];
int n, m; // b = back table, n = length of T, m = length of P
void kmpPreprocess() { // call this before calling kmpSearch()
int i = 0, j = -1;
b[0] = -1; // starting values
while (i < m) { // pre-process the pattern string P
while (j >= 0 && P[i] != P[j]) j = b[j]; // different, reset j using b
i++;
j++; // if same, advance both pointers
b[i] = j; // observe i = 8, 9, 10, 11, 12, 13 with j = 0, 1, 2, 3, 4, 5
}
} // in the example of P = "SEVENTY SEVEN" above
void kmpSearch() { // this is similar as kmpPreprocess(), but on string T
int i = 0, j = 0; // starting values
while (i < n) { // search through string T
while (j >= 0 && T[i] != P[j]) j = b[j]; // different, reset j using b
i++;
j++; // if same, advance both pointers
if (j == m) { // a match found when j == m
System.out.format("%s is found at index %d in T\n", String.valueOf(P, 0, m), i - j);
j = b[j]; // prepare j for the next possible match
}
}
}
|
Refer: UVA1449
Wildcard matching
Time complexity: O(S+P)
Similar to KMP matching, just go back to the last star position and match again, but now star match 1 more character.
Better than DP.
Refer: Wildcard Matching
While regular expression matching require ‘ab’ which need dynamic progamming. O(n^m)
Refer: Regular Expression Matching
Minimum edit distance
Operation include: insert, delete, replace
Time complexity: O(m*n)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
| public int minDistance(String word1, String word2) {
int[][] dp = new int[word1.length()+1][word2.length()+1];
dp[0][0] = 0;
for(int i=0; i<word1.length(); ++i)
dp[i+1][0] = i+1; //delete char
for(int i=0; i<word2.length(); ++i)
dp[0][i+1] = i+1; //delete char
for(int i=1; i<=word1.length(); ++i){
for(int j=1; j<=word2.length(); ++j){
char ci = word1.charAt(i-1);
char cj = word2.charAt(j-1);
if(ci==cj)
dp[i][j] = dp[i-1][j-1];
else
dp[i][j] = dp[i-1][j-1]+1;
dp[i][j] = Math.min(dp[i][j], Math.min(dp[i-1][j]+1, dp[i][j-1]+1));
}
}
return dp[word1.length()][word2.length()];
}
|
Refer: Edit Distance
Longest common substring
Longest palindrom
From starting
In between
Suffix trie
Suffix tree
- string matching O(m + occ)
- longest repeating substring O(n)
Suffix array
Construction of SA
Time complexity: O(n log(n))
Time complexity of sorting string array using strcmp is O(n^2 log(n))
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
| int MAX_N = 100_010; // second approach: O(n log n)
char[] T = new char[MAX_N]; // the input string, up to 100K characters
int n; // the length of input string
int[] RA = new int[MAX_N], tempRA = new int[MAX_N]; // rank array and temporary rank array
int[] SA = new int [MAX_N], tempSA = new int[MAX_N]; // suffix array and temporary suffix array
int[] c = new int[MAX_N];
void countingSort(int k) { // O(n)
int i, sum, maxi = Math.max(300, n); // up to 255 ASCII chars or length of n
Arrays.fill(c, 0); // clear frequency table
for (i = 0; i < n; i++) // count the frequency of each integer rank
c[i + k < n ? RA[i + k] : 0]++;
for (i = sum = 0; i < maxi; i++) {
int t = c[i]; c[i] = sum; sum += t; }
for (i = 0; i < n; i++) // shuffle the suffix array if necessary
tempSA[c[SA[i]+k < n ? RA[SA[i]+k] : 0]++] = SA[i];
for (i = 0; i < n; i++) // update the suffix array SA
SA[i] = tempSA[i];
}
void constructSA() { // this version can go up to 100000 characters
int i, k, r;
for (i = 0; i < n; i++) RA[i] = T[i]; // initial rankings
for (i = 0; i < n; i++) SA[i] = i; // initial SA: {0, 1, 2, ..., n-1}
for (k = 1; k < n; k <<= 1) { // repeat sorting process log n times
countingSort(k); // actually radix sort: sort based on the second item
countingSort(0); // then (stable) sort based on the first item
tempRA[SA[0]] = r = 0; // re-ranking; start from rank r = 0
for (i = 1; i < n; i++) // compare adjacent suffixes
tempRA[SA[i]] = // if same pair => same rank r; otherwise, increase r
(RA[SA[i]] == RA[SA[i-1]] && RA[SA[i]+k] == RA[SA[i-1]+k]) ? r : ++r;
for (i = 0; i < n; i++) // update the rank array RA
RA[i] = tempRA[i];
if (RA[SA[n-1]] == n-1) break; // nice optimization trick
}
}
//in main
n = input.length();
Arrays.fill(T, '\0');
for(int i=0; i<_n; ++i){
T[i] = input.charAt(i);
}
T[n++] = '$';
constructSA();
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
| //return (-1, -1) if not found
Pair stringMatching(String P) { // string matching in O(m log n)
int m = P.length();
int lo = 0, hi = n-1, mid = lo; // valid matching = [0..n-1]
while (lo < hi) { // find lower bound
mid = (lo + hi) / 2; // this is round down
int res = strncmp(T, SA[mid], P, m); // try to find P in suffix ’mid’
if (res >= 0) hi = mid; // prune upper half (notice the >= sign)
else lo = mid + 1; // prune lower half including mid
} // observe ‘=’ in "res >= 0" above
if (strncmp(T, SA[lo], P, m) != 0) return new Pair(-1, -1); // if not found
Pair ans = new Pair(0, 0); ans.first = lo;
lo = 0; hi = n - 1; mid = lo;
while (lo < hi) { // if lower bound is found, find upper bound
mid = (lo + hi) / 2;
int res = strncmp(T, SA[mid], P, m);
if (res > 0) hi = mid; // prune upper half
else lo = mid + 1; // prune lower half including mid
} // (notice the selected branch when res == 0)
if (strncmp(T, SA[hi], P, m) != 0) hi--; // special case
ans.second = hi;
return ans;
} // return lower/upperbound as first/second item of the pair, respectively
int strncmp(char[] str1, int startPos, String P, int num ){
for(int i=0; i<num; ++i){
char a = str1[startPos+i];
char b = P.charAt(i);
if(a != b) return (int)(a-b);
}
return 0;
}
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
| int[] Phi= new int[MAX_N];
int[] PLCP= new int[MAX_N];
int[] LCP = new int[MAX_N];
void computeLCP() {
int i, L;
Phi[SA[0]] = -1; // default value
for (i = 1; i < n; i++) // compute Phi in O(n)
Phi[SA[i]] = SA[i-1]; // remember which suffix is behind this suffix
for (i = L = 0; i < n; i++) { // compute Permuted LCP in O(n)
if (Phi[i] == -1) { PLCP[i] = 0; continue; } // special case
while (T[i + L] == T[Phi[i] + L]) L++; // L increased max n times
PLCP[i] = L;
L = Math.max(L-1, 0); // L decreased max n times
}
for (i = 0; i < n; i++) // compute LCP in O(n)
LCP[i] = PLCP[SA[i]]; // put the permuted LCP to the correct position
}
|
- Longest common substring
Time complexity: O(n)
for T1= GATAGACA and T2=CATA, concat both string to become GATAGACA$CATA#.
Go through the suffix array to see if 2 consecutive suffix belongs to 2 different word.
Refer:
- Longest repeating substring
Time complexity: O(n)
Refer: Longest Duplicate substring
Rolling hash - Rabin Karp
Time complexity: O(m+n)
Space complexity: O(n)
Time complexity is same as KMP.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
|
int modulo = 1_000_000_007;
private long modPow(int base, int exp){
if(exp == 0) return 1;
else if(exp == 1) return base;
long d = modPow(base, exp/2);
if(exp % 2 == 0)
return (d * d) % modulo;
else
return ((d * d) % modulo * base) % modulo;
}
//in main
long p26 = modPow(26, len-1);
//in loop
int c = s.charAt(i) - 'a';
int q = s.charAt(i-len) - 'a';
rolling = (rolling - (q * p26) % modulo + modulo) % modulo;
rolling = (rolling * 26 + c) % modulo;
|
Refer: Longest Duplicate Substring
Aho corasick
Use BFS to do automaton on Trie
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
| class TrieNode{
TrieNode[] children;
List<Integer> outIndex;
TrieNode f; //failure link, for root it is null.
public TrieNode(){
children = new TrieNode[26];
outIndex = new ArrayList<Integer>();
f = null;
}
}
//after building the trie, build the automaton.
private void buildAutomaton(){
Queue<TrieNode> q = new ArrayDeque<>();
for(int i=0; i<26; ++i){
if(root.children[i] != null){
root.children[i].f = root;
q.offer(root.children[i]);
}
}
while(!q.isEmpty()){
TrieNode u = q.poll();
for(int i= 0; i<26; ++i){
TrieNode v = u.children[i];
if(v != null){
TrieNode failure = u.f;
while(failure != null && failure.children[i] == null)
failure = failure.f;
failure = failure ==null ? root: failure.children[i];
v.f = failure;
v.outIndex.addAll(failure.outIndex);
q.offer(v);
}
}
}
}
//searching by letters
public boolean query(char letter) {
int li = letter - 'a';
while(ptr!= null && ptr.children[li] == null){
ptr = ptr.f;
}
ptr = ptr == null ? root : ptr.children[li];
return ptr.outIndex.size() > 0;
}
|
Refer: 1032. Stream of Characters
Refer: GFG