samedi 18 juin 2016

Fastest Substring Counting

This question is in reference to the LightOj problem (1255 - Substring Frequency): http://lightoj.com/volume_showproblem.php?problem=1255 [You have to login to view the problem]

Basically, the problem is a substring matching and counting problem.

Here is the KMP code:

#include <iostream>
#include <cstdio>
#include <cmath>
#include <cstdlib>
#include <vector>
#include <string>
#define MOD 1000000007
#define ull unsigned long long int
#define dll int
#define dl long int
#define ul unsigned long int
#define gc getchar_unlocked
#define cn int
using namespace std;
template <class T> void scanNum(T &x)
{
    register T c = gc();
    x = 0;
    int neg = 0;
    for(;((c<48 || c>57) && c != '-');c = gc());
    if(c=='-') {neg=1;c=gc();}
    for(;c>47 && c<58;c = gc()) {x = (x<<1) + (x<<3) + c - 48;}
    if(neg) x=-x;
}

inline void scanString(string& str)
{
    register char c = 0;
    register int i = 0;
    while (c < 33)
    c = getchar_unlocked();
    while (c != 'n')
    {
        str =str + c;
        c = getchar_unlocked();
    }
    str = str + '�';
}

class KMP
{
string txt, pat;
dll M,N,c;
dll *lps;
public:
KMP(string t,string p)
{
    txt=t;
    pat=p;
    N=txt.length()-1;
    M=pat.length()-1;
    c=0;
}
void preprocess()
{

    dll len=0,i=1;
    lps=(dll *)malloc(M*sizeof(dll));
    lps[0]=0;

    while(i<M)
    {
        if(pat[i]==pat[len])
        {
            lps[i++]=++len; 
        }
        else
        {
            if(len!=0)
            {
                len=lps[len-1];
            }
            else
            {
                lps[i++]=0;
            }
        }
    }
}

dll KMPalgo()
{
    preprocess();
    dll j=0,i=0;
    while(i<N)
    {
        if(pat[j]==txt[i])
        {
            i++;
            j++;
        }
        if(j==M)
        {
            c++;
            j=lps[j-1];
        }
        else if((i<N) && (pat[j]!=txt[i]))
        {
            if(j!=0)
                j=lps[j-1];
            else
                i++;
        }
    }
    free(lps);
    return c;
}
};


int main()
{
cn t;
scanNum<cn>(t);
for(cn i=1;i<=t;i++)
{
    string txt,pat;
    scanString(txt);
    scanString(pat);
    KMP strMatch(txt,pat);
    cn v = strMatch.KMPalgo();
    printf("Case %d: %dn",i,v);
        //nl;
}
return 0;
}

Here is the Z-algorithm code:

#include <iostream>
#include <cstdio>
#include <cstdlib>
#include <string>
#include <algorithm>
#define ull unsigned long long int
#define dll long long int
#define dl long int
#define ul unsigned long int
#define gc getchar_unlocked
#define cn dll
#define nl printf("n");
using namespace std;

template <class T> void scanNum(T &x)
{
register T c = gc();
x = 0;
int neg = 0;
for(;((c<48 || c>57) && c != '-');c = gc());
if(c=='-') {neg=1;c=gc();}
for(;c>47 && c<58;c = gc()) {x = (x<<1) + (x<<3) + c - 48;}
if(neg) x=-x;
}
inline void scanString(string& str)
{
register char c = 0;
register int i = 0;
while (c < 33)
c = getchar_unlocked();
while (c != 'n')
{
    str =str + c;
    c = getchar_unlocked();
}
str = str + '�';
}
class ZAlgo
{
string txt,pat,fin;
dll *z,patl,txtl,n;
public:
ZAlgo(string txt,string pat)
{
    this->txt=txt;
    this->pat=pat;
    fin=pat+"@"+txt;
    patl=pat.length()-1;//cout<<patl<<endl;
    txtl=txt.length()-1;//cout<<txtl<<endl;
    n=patl+txtl+1;//cout<<fin<<" "<<fin.length()-1<<endl;
}
dll run()
{
    dll c=0;
    z=(dll *)malloc((n+1)*sizeof(dll));
    dll r=0,l=0;
    z[0]=0;
    for(dll k=1;k<n+1;k++)
    {
        if(k>r)
        {
            l=r=k;
            while((r<n+1)&&fin[r]==fin[r-l])
            {
                r++;
            }
            z[k]=(r--)-l;
        }
        else
        {
            dll k1=k-l;
            if(z[k1]<r-k+1)
            {
                z[k]=z[k1];
            }
            else
            {
                l=k;
                while((r<n+1)&&fin[r]==fin[r-l])
                {
                    r++;
                }
                z[k]=(r--)-l;
            }
        }
        if(z[k]==patl)
            c++;
    }
    return c;
}
};
int main()
{
dll t;
scanNum<dll>(t);
for (dll i=1;i<=t;i++)
{
    string txt;
    string pat;
    scanString(txt);
    scanString(pat);
    ZAlgo o(txt,pat);
    printf("Case %lld: %lldn",i,o.run());

}
return 0;
}

Both the solutions are giving TLE in the website.

Aho-Corsick algo can be used when there is a dictionary of words. Boyer-Moore has complexity of O(mn) in worst case i.e., case of something like txt=aaaaaaaaa and pat=aa which is present in the test cases of the problem.

Is there a better algorithm which can solve the problem? I couldn't find any proper solution, so I had to post it here.

Aucun commentaire:

Enregistrer un commentaire