Suffix array 后缀数组

@vrqq  November 24, 2017

Introduction

模版中涉及了若干个变量

  • array ss[] : 待处理的字符串
  • pos : 表示ss[pos],这个下标,通常指ss[pos->...]这个suffix
  • array rank[pos] : suffix ss[pos->...] 排第几。
  • array sa[k] or Ith[k] : 排名第k-th的suffix在哪(指的是ss[Ith[k]->...]这个后缀)
  • array h[k] : 排名第k的suffix,和排名第k-1的suffix,的Longest Common Prefix.
    使用类似radix sort方法,时间复杂度O(nlogn).
    计算sa和rank时,初始化先把单个char用rank[]表示,然后每次for loop时:
  • 上一次的sa和rank已知
  • 使用上一次的sa按part2顺序push stack
  • 计算cnt[] -> pop and generate new Ith[] -> 按sa顺序去重 计算ranknew[]
  • overwrite ranknew[] to rank[].

Template

#include <iostream>
#include <iomanip>
#include <cstring>
#include <stack>
using namespace std;

const int MAXN=200; // larger than char.size() to finish initialize.
int _array1[MAXN],_array2[MAXN];
/*
    rank[pos] = k-th.  ==> the rank of this suffix is k-th.
       Ith[i] = pos.   ==> the i-th suffix is [pos...n]
      h[k-th] = len.   ==> the same length of (i)th suffix and (i-1)th suffix.
 */
int *rnk=_array1, *ranknew=_array2,Ith[MAXN],h[MAXN], n;
char *ss;

void _debugRank() {
    cout<<"\t";
    for (int i=0; i<n; i++)
        cout<<setw(2)<<ss[i]<<" ";
    cout<<endl<<"\t";
    for (int i=0; i<n; i++)
        cout<<setw(2)<<rnk[i]<<" ";
    cout<<endl;
}
void _debugIth (int skip=MAXN) {
    cout<<"**Debug Ith[]"<<endl;
    for (int i=0; i<n; i++) {
        cout<<"\t"<<std::right<<setw(2)<<i<<"-th h="<<std::left<<setw(2)<<h[i]<<"  ss["<<setw(2)<<Ith[i]<<"]: ";
        for (int k=Ith[i]; k<n && k<Ith[i]+skip; k++)
            cout<<ss[k];
        cout<<endl;
    }
}
void writeRank () {
    int *tmp=rnk; rnk=ranknew; ranknew=tmp;
    cout<<"-- Generate a new Rank[]:"<<endl;
    _debugRank();
}

void geneSA () {
    //initialization
    int initCnt[128]; memset(initCnt, 0, sizeof(initCnt));
    for (int pos=0; pos<n; pos++)
        initCnt[ss[pos]]++;
    for (int i=0; i<127; i++)
        initCnt[i+1] += initCnt[i];
    for (int i=0; i<n; i++)
        Ith[ --initCnt[ss[i]] ] = i;
    rnk[Ith[0]]=0;
    for (int i=1; i<n; i++)
        if (ss[Ith[i]] == ss[Ith[i-1]])
            rnk[Ith[i]] = rnk[Ith[i-1]];
        else
            rnk[Ith[i]] = rnk[Ith[i-1]]+1;
    _debugRank();
    //The first part : ss[i .. i+skip-1]
    //The second part : ss[i+skip .. i+skip+skip-1]
    //These two parts were presented with rank[i] and rank[i+skip].
    for (int skip=1; skip<n; skip*=2) {
        //Radix sort. Push by the sequence of second part.
        stack<int> st;
        for (int i=n-skip; i<n; i++) //cuz the second part of these combination is NULL.
            st.push(i);
        for (int i=0; i<n; i++) //Ith (also sa[]).
            if (Ith[i]-skip>=0)
                st.push( Ith[i]-skip );
        //generate the ranknew[];
        int cnt[MAXN]; memset(cnt, 0, sizeof(cnt));
        for (int pos=0; pos<n; pos++)
            cnt[ rnk[pos] ]++;
        for (int i=0; i+1<n; i++)
            cnt[i+1]+=cnt[i];
        while (!st.empty()) {
            int pos = st.top(); st.pop();
            Ith[ --cnt[rnk[pos]] ] = pos;
        }
        //regenerate the rank for the combination that have same rank[].
        ranknew[ Ith[0] ]=0;
        for (int i=1, id=0; i<n; i++) {
            int cmp1= (Ith[i]+skip)>=n? -1:rnk[Ith[i]+skip];
            int cmp2= (Ith[i-1]+skip)>=n? -1:rnk[Ith[i-1]+skip];
            if (rnk[Ith[i]]==rnk[Ith[i-1]] && cmp1 == cmp2)
                ranknew[Ith[i]] = ranknew[Ith[i-1]];
            else
                ranknew[Ith[i]] = ranknew[Ith[i-1]]+1;
        }
        writeRank();
        //_debugIth(skip);
    }
}

void geneHeight() {
    h[0]=0; //the minimum string has 0 length of common prefix than (-1)th smaller string;
    //for suffix s[pos...n], we assume that it's k-th suffix string,
    //so Ith[k]=pos and rank[pos]=k.
    //firstly we calculate the common prefix length of s[pos->n] (k-th) and the (k-1)th suffix.
    //we assume this length is 'varsame' and set variable lastpos = Ith[k-1];
    //When we drop the first char of this common prefix,
    //we could found that, for the position (pos+1), we know it's ranking is rank[pos+1],
    //and the common prefix length of ss[pos+1->n] and ss[lastpos+1->n] is at least 'varsame'-1
    for (int pos=0,same=0; pos<n; pos++) {
        if (rnk[pos] == 0)
            continue;
        int lastPos = Ith[rnk[pos]-1];
        while (ss[pos+same] == ss[lastPos+same])
            same++;
        h[rnk[pos]]=same;
        if (same>0)
            same--;
    }
    _debugIth();
}
int main() {
    char str[] = "ababcRP++";
    ss=str;
    n=strlen(ss);
    geneSA();
    geneHeight();
    return 0;
}

Q:A

Q1: 在上述代码段中,更新cnt数组(用于rank计数用的数组),在初始化和迭代两部分中有重复,能否省略一下。
A: 當然囖,可以用lambda function整理一下上述代碼。

int cnt[MAXN];
auto genecnt = [&]() {
    memset(cnt, 0, sizeof(cnt));
    for (int i=0; i<n; i++)
        cnt[rank[i]]++;
    for (int i=0; i+1<MAXN; i++)
        cnt[i+1] += cnt[i];
};

Q2: 上述代码geneSA()中,在for里面更新SA数组非要用stack吗?我用土办法行吗?
A: 當然可以,假如我們需要用Ith數組(sa數組)直接製造新的sa數組,那麼我們須要指定一個新的空間用以存放newIth[] (newsa[]) 下文newsa即上文樣例的Ith數組。

for (int idx : sa[]) {
    //for combination : {rank[idx-skip...idx-1], rank[idx...idx+skip-1]}
    if (idx-skip >= 0)
        newsa[ --cnt[rank[idx-skip]] ] = idx-skip;
}
for (int idx1=n-skip; idx1<n; idx1++) //{part1=rank[idx1...idx+skip-1], part2=NULL}
    newsa[ --cnt[rank[idx1]] ] = idx1;
swap(sa, newsa);

Q3: ith数组是什么,和其他帖子里写到的sa数组有何区别?
A: 一样的,为了读者易懂把名字换成了ith,自己还是习惯写sa。我是从一个noi集训队论文里面学的,论文带的标程里面就用的sa命名,当时学习的时候怕乱也就延续了下来。
贴一个链接吧 https://github.com/enkerewpo/OI-Public-Library/blob/master/%E5%9B%BD%E5%AE%B6%E9%9B%86%E8%AE%AD%E9%98%9F%E8%AE%BA%E6%96%871999-2017/2004/%E8%AE%B8%E6%99%BA%E7%A3%8A.pdf


添加新评论