Introduction
模版中涉及了若干个变量
- array ss[] : 待处理的字符串
- pos : 表示ss[pos],这个下标,通常指ss[pos->...]这个suffix
- array rank[pos] : suffix ss[pos->...] 排第几。
- array sa[k] or Ith[k] : 排名第k-th的suffix在哪(指的是ss[Ith[k]->...]这个后缀)
- array h[k] : 排名第k的suffix,和排名第k-1的suffix,的Longest Common Prefix.
使用类似radix sort方法,时间复杂度O(nlogn).
计算sa和rank时,初始化先把单个char用rank[]表示,然后每次for loop时: - 上一次的sa和rank已知
- 使用上一次的sa按part2顺序push stack
- 计算cnt[] -> pop and generate new Ith[] -> 按sa顺序去重 计算ranknew[]
- overwrite ranknew[] to rank[].
Template
#include <iostream>
#include <iomanip>
#include <cstring>
#include <stack>
using namespace std;
const int MAXN=200; // larger than char.size() to finish initialize.
int _array1[MAXN],_array2[MAXN];
/*
rank[pos] = k-th. ==> the rank of this suffix is k-th.
Ith[i] = pos. ==> the i-th suffix is [pos...n]
h[k-th] = len. ==> the same length of (i)th suffix and (i-1)th suffix.
*/
int *rnk=_array1, *ranknew=_array2,Ith[MAXN],h[MAXN], n;
char *ss;
void _debugRank() {
cout<<"\t";
for (int i=0; i<n; i++)
cout<<setw(2)<<ss[i]<<" ";
cout<<endl<<"\t";
for (int i=0; i<n; i++)
cout<<setw(2)<<rnk[i]<<" ";
cout<<endl;
}
void _debugIth (int skip=MAXN) {
cout<<"**Debug Ith[]"<<endl;
for (int i=0; i<n; i++) {
cout<<"\t"<<std::right<<setw(2)<<i<<"-th h="<<std::left<<setw(2)<<h[i]<<" ss["<<setw(2)<<Ith[i]<<"]: ";
for (int k=Ith[i]; k<n && k<Ith[i]+skip; k++)
cout<<ss[k];
cout<<endl;
}
}
void writeRank () {
int *tmp=rnk; rnk=ranknew; ranknew=tmp;
cout<<"-- Generate a new Rank[]:"<<endl;
_debugRank();
}
void geneSA () {
//initialization
int initCnt[128]; memset(initCnt, 0, sizeof(initCnt));
for (int pos=0; pos<n; pos++)
initCnt[ss[pos]]++;
for (int i=0; i<127; i++)
initCnt[i+1] += initCnt[i];
for (int i=0; i<n; i++)
Ith[ --initCnt[ss[i]] ] = i;
rnk[Ith[0]]=0;
for (int i=1; i<n; i++)
if (ss[Ith[i]] == ss[Ith[i-1]])
rnk[Ith[i]] = rnk[Ith[i-1]];
else
rnk[Ith[i]] = rnk[Ith[i-1]]+1;
_debugRank();
//The first part : ss[i .. i+skip-1]
//The second part : ss[i+skip .. i+skip+skip-1]
//These two parts were presented with rank[i] and rank[i+skip].
for (int skip=1; skip<n; skip*=2) {
//Radix sort. Push by the sequence of second part.
stack<int> st;
for (int i=n-skip; i<n; i++) //cuz the second part of these combination is NULL.
st.push(i);
for (int i=0; i<n; i++) //Ith (also sa[]).
if (Ith[i]-skip>=0)
st.push( Ith[i]-skip );
//generate the ranknew[];
int cnt[MAXN]; memset(cnt, 0, sizeof(cnt));
for (int pos=0; pos<n; pos++)
cnt[ rnk[pos] ]++;
for (int i=0; i+1<n; i++)
cnt[i+1]+=cnt[i];
while (!st.empty()) {
int pos = st.top(); st.pop();
Ith[ --cnt[rnk[pos]] ] = pos;
}
//regenerate the rank for the combination that have same rank[].
ranknew[ Ith[0] ]=0;
for (int i=1, id=0; i<n; i++) {
int cmp1= (Ith[i]+skip)>=n? -1:rnk[Ith[i]+skip];
int cmp2= (Ith[i-1]+skip)>=n? -1:rnk[Ith[i-1]+skip];
if (rnk[Ith[i]]==rnk[Ith[i-1]] && cmp1 == cmp2)
ranknew[Ith[i]] = ranknew[Ith[i-1]];
else
ranknew[Ith[i]] = ranknew[Ith[i-1]]+1;
}
writeRank();
//_debugIth(skip);
}
}
void geneHeight() {
h[0]=0; //the minimum string has 0 length of common prefix than (-1)th smaller string;
//for suffix s[pos...n], we assume that it's k-th suffix string,
//so Ith[k]=pos and rank[pos]=k.
//firstly we calculate the common prefix length of s[pos->n] (k-th) and the (k-1)th suffix.
//we assume this length is 'varsame' and set variable lastpos = Ith[k-1];
//When we drop the first char of this common prefix,
//we could found that, for the position (pos+1), we know it's ranking is rank[pos+1],
//and the common prefix length of ss[pos+1->n] and ss[lastpos+1->n] is at least 'varsame'-1
for (int pos=0,same=0; pos<n; pos++) {
if (rnk[pos] == 0)
continue;
int lastPos = Ith[rnk[pos]-1];
while (ss[pos+same] == ss[lastPos+same])
same++;
h[rnk[pos]]=same;
if (same>0)
same--;
}
_debugIth();
}
int main() {
char str[] = "ababcRP++";
ss=str;
n=strlen(ss);
geneSA();
geneHeight();
return 0;
}
Q:A
Q1: 在上述代码段中,更新cnt数组(用于rank计数用的数组),在初始化和迭代两部分中有重复,能否省略一下。
A: 當然囖,可以用lambda function整理一下上述代碼。
int cnt[MAXN];
auto genecnt = [&]() {
memset(cnt, 0, sizeof(cnt));
for (int i=0; i<n; i++)
cnt[rank[i]]++;
for (int i=0; i+1<MAXN; i++)
cnt[i+1] += cnt[i];
};
Q2: 上述代码geneSA()中,在for里面更新SA数组非要用stack吗?我用土办法行吗?
A: 當然可以,假如我們需要用Ith數組(sa數組)直接製造新的sa數組,那麼我們須要指定一個新的空間用以存放newIth[] (newsa[]) 下文newsa即上文樣例的Ith數組。
for (int idx : sa[]) {
//for combination : {rank[idx-skip...idx-1], rank[idx...idx+skip-1]}
if (idx-skip >= 0)
newsa[ --cnt[rank[idx-skip]] ] = idx-skip;
}
for (int idx1=n-skip; idx1<n; idx1++) //{part1=rank[idx1...idx+skip-1], part2=NULL}
newsa[ --cnt[rank[idx1]] ] = idx1;
swap(sa, newsa);
Q3: ith数组是什么,和其他帖子里写到的sa数组有何区别?
A: 一样的,为了读者易懂把名字换成了ith,自己还是习惯写sa。我是从一个noi集训队论文里面学的,论文带的标程里面就用的sa命名,当时学习的时候怕乱也就延续了下来。
贴一个链接吧 https://github.com/enkerewpo/OI-Public-Library/blob/master/%E5%9B%BD%E5%AE%B6%E9%9B%86%E8%AE%AD%E9%98%9F%E8%AE%BA%E6%96%871999-2017/2004/%E8%AE%B8%E6%99%BA%E7%A3%8A.pdf