2928 lines
71 KiB
C++
2928 lines
71 KiB
C++
|
|
/*
|
||
|
|
----
|
||
|
|
This file is part of SECONDO.
|
||
|
|
|
||
|
|
Copyright (C) 2009, University in Hagen, Faculty of Mathematics and
|
||
|
|
Computer Science, Database Systems for New Applications.
|
||
|
|
|
||
|
|
SECONDO is free software; you can redistribute it and/or modify
|
||
|
|
it under the terms of the GNU General Public License as published
|
||
|
|
by
|
||
|
|
the Free Software Foundation; either version 2 of the License, or
|
||
|
|
(at your option) any later version.
|
||
|
|
|
||
|
|
SECONDO is distributed in the hope that it will be useful,
|
||
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||
|
|
GNU General Public License for more details.
|
||
|
|
|
||
|
|
You should have received a copy of the GNU General Public License
|
||
|
|
along with SECONDO; if not, write to the Free Software
|
||
|
|
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||
|
|
----
|
||
|
|
|
||
|
|
01590 Fachpraktikum "Erweiterbare Datenbanksysteme"
|
||
|
|
WS 2011 / 2012
|
||
|
|
|
||
|
|
Svenja Fuhs
|
||
|
|
Regine Karg
|
||
|
|
Jan Kristof Nidzwetzki
|
||
|
|
Michael Teutsch
|
||
|
|
C[ue]neyt Uysal
|
||
|
|
|
||
|
|
|
||
|
|
//paragraph [1] Title: [{\Large \bf \begin{center}] [\end{center}}]
|
||
|
|
//paragraph [10] Footnote: [{\footnote{] [}}]
|
||
|
|
//[TOC] [\tableofcontents]
|
||
|
|
|
||
|
|
[TOC]
|
||
|
|
|
||
|
|
0 Overview
|
||
|
|
|
||
|
|
1 Includes and defines
|
||
|
|
|
||
|
|
*/
|
||
|
|
#include <cstdlib>
|
||
|
|
#include <queue>
|
||
|
|
#include <map>
|
||
|
|
|
||
|
|
#include <string>
|
||
|
|
|
||
|
|
#include "Algebra.h"
|
||
|
|
#include "NestedList.h"
|
||
|
|
#include "QueryProcessor.h"
|
||
|
|
#include "AlgebraManager.h"
|
||
|
|
#include "ListUtils.h"
|
||
|
|
#include "StandardTypes.h"
|
||
|
|
#include "Algebras/Relation-C++/RelationAlgebra.h"
|
||
|
|
#include "Attribute.h"
|
||
|
|
#include "Tools/Flob/Flob.h"
|
||
|
|
#include "SuffixTreeAlgebra.h"
|
||
|
|
#include "Algebras/FText/FTextAlgebra.h"
|
||
|
|
#include "Stream.h"
|
||
|
|
|
||
|
|
extern NestedList* nl;
|
||
|
|
extern QueryProcessor *qp;
|
||
|
|
extern AlgebraManager *am;
|
||
|
|
|
||
|
|
using namespace std;
|
||
|
|
|
||
|
|
|
||
|
|
/*
|
||
|
|
~A Macro useful for debugging ~
|
||
|
|
|
||
|
|
*/
|
||
|
|
|
||
|
|
//#define __TRACE__ cout << __FILE__ << "@" << __LINE__ << endl;
|
||
|
|
#define __TRACE__
|
||
|
|
|
||
|
|
|
||
|
|
|
||
|
|
//namespace to avoid name conflicts
|
||
|
|
namespace sta
|
||
|
|
{
|
||
|
|
/*
|
||
|
|
2.1 Constructor ~SuffixTree~
|
||
|
|
|
||
|
|
*/
|
||
|
|
SuffixTree::SuffixTree(SuffixTreeVertex* rootVertex) :
|
||
|
|
Attribute(true), mSuffixTree(0), mSuffixIndex(0), mInput(0), mMemoryTree(0)
|
||
|
|
{
|
||
|
|
SaveToPersistent(rootVertex);
|
||
|
|
}
|
||
|
|
|
||
|
|
SuffixTree::SuffixTree(bool def) :
|
||
|
|
Attribute(def), mSuffixTree(0), mSuffixIndex(0), mInput(0), mMemoryTree(0)
|
||
|
|
{
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
2.2 Destructor ~SuffixTree~
|
||
|
|
|
||
|
|
*/
|
||
|
|
SuffixTree::~SuffixTree()
|
||
|
|
{
|
||
|
|
if(mMemoryTree != NULL) {
|
||
|
|
delete mMemoryTree;
|
||
|
|
mMemoryTree = NULL;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
2.4 Destructor ~SuffixTree~
|
||
|
|
|
||
|
|
*/
|
||
|
|
SuffixTreeVertex* SuffixTree::GetInMemoryTree()
|
||
|
|
{
|
||
|
|
if( mMemoryTree == NULL) {
|
||
|
|
mMemoryTree = SuffixTree::LoadFromPersistent();
|
||
|
|
}
|
||
|
|
|
||
|
|
return mMemoryTree;
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
2.3 Equal Function
|
||
|
|
To compare two SuffixTrees.
|
||
|
|
The result is true if the text of two SuffixTrees is exactly the same.
|
||
|
|
|
||
|
|
*/
|
||
|
|
bool SuffixTree::Equal(const Attribute* arg) const
|
||
|
|
{
|
||
|
|
bool res = false;
|
||
|
|
const SuffixTree* argumentTree =
|
||
|
|
static_cast<const SuffixTree*> (arg);
|
||
|
|
|
||
|
|
if (!IsDefined() || !argumentTree->IsDefined())
|
||
|
|
{
|
||
|
|
return false;
|
||
|
|
}
|
||
|
|
//get the flob of the suffixtree argument which has the text
|
||
|
|
const Flob* textFlobArgument = &mInput;
|
||
|
|
size_t flobLengthArgument = textFlobArgument->getSize();
|
||
|
|
//read the flob and put the characters into an char array
|
||
|
|
char *charArrayArgument = (char *) malloc(flobLengthArgument);
|
||
|
|
textFlobArgument->read(charArrayArgument, flobLengthArgument, 0);
|
||
|
|
|
||
|
|
// current suffixtree
|
||
|
|
size_t flobLengthCurrent = mInput.getSize();
|
||
|
|
|
||
|
|
char *charArrayCurrent = (char *) malloc(flobLengthCurrent);
|
||
|
|
mInput.read(charArrayCurrent, flobLengthCurrent, 0);
|
||
|
|
|
||
|
|
//if both text legths are equal compare them by characters
|
||
|
|
if (flobLengthArgument == flobLengthCurrent)
|
||
|
|
{
|
||
|
|
for (size_t pos = 0; pos < flobLengthArgument - 1; pos++)
|
||
|
|
{
|
||
|
|
if (*(charArrayArgument + pos) == *(charArrayCurrent + pos))
|
||
|
|
{
|
||
|
|
res = true;
|
||
|
|
}
|
||
|
|
else
|
||
|
|
{
|
||
|
|
res = false;
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
free(charArrayArgument);
|
||
|
|
free(charArrayCurrent);
|
||
|
|
|
||
|
|
return res;
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
3. Attribute functions
|
||
|
|
|
||
|
|
*/
|
||
|
|
|
||
|
|
/*
|
||
|
|
3.1 ~HashValue~ function
|
||
|
|
Using the Algorithm idea from Robert Sedgwicks.
|
||
|
|
|
||
|
|
*/
|
||
|
|
size_t SuffixTree::HashValue() const
|
||
|
|
{
|
||
|
|
unsigned int b = 378551;
|
||
|
|
unsigned int a = 63689;
|
||
|
|
unsigned int hash = 0;
|
||
|
|
|
||
|
|
if (!IsDefined())
|
||
|
|
{
|
||
|
|
return hash;
|
||
|
|
}
|
||
|
|
size_t textLength = mInput.getSize();
|
||
|
|
char *charArray = (char *) malloc(textLength);
|
||
|
|
mInput.read(charArray, textLength, 0);
|
||
|
|
|
||
|
|
for (size_t pos = 0; pos < textLength - 1; pos++)
|
||
|
|
{
|
||
|
|
hash = hash * a + *(charArray + pos);
|
||
|
|
a = a * b;
|
||
|
|
}
|
||
|
|
|
||
|
|
free(charArray);
|
||
|
|
return hash;
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
3.2 Copy
|
||
|
|
|
||
|
|
*/
|
||
|
|
void SuffixTree::CopyFrom(const Attribute* arg)
|
||
|
|
{
|
||
|
|
const SuffixTree *st = static_cast<const SuffixTree*> (arg);
|
||
|
|
SetDefined(st->IsDefined());
|
||
|
|
this->mInput.copyFrom(st->mInput);
|
||
|
|
this->mSuffixTree.copyFrom(st->mSuffixTree);
|
||
|
|
this->mSuffixIndex.copyFrom(st->mSuffixIndex);
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
3.3 ~Compare~ function
|
||
|
|
Compares two SuffixTrees
|
||
|
|
Defines an order on SuffixTrees: shorter text is smaller than longer text,
|
||
|
|
if both texts have the same length, the text which is lexicographical first
|
||
|
|
is the smaller one.
|
||
|
|
|
||
|
|
*/
|
||
|
|
int SuffixTree::Compare(const Attribute* arg) const
|
||
|
|
{
|
||
|
|
int result = 0;
|
||
|
|
const SuffixTree* constArgumentSuffixTree =
|
||
|
|
static_cast<const SuffixTree*> (arg);
|
||
|
|
//const_cast to be able to use GetFLOB() function
|
||
|
|
SuffixTree* argumentSuffixTree =
|
||
|
|
const_cast<SuffixTree*> (constArgumentSuffixTree);
|
||
|
|
|
||
|
|
if (!IsDefined())
|
||
|
|
{
|
||
|
|
if (argumentSuffixTree->IsDefined())
|
||
|
|
{
|
||
|
|
result = -1;
|
||
|
|
}
|
||
|
|
else
|
||
|
|
{
|
||
|
|
result = 0;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
else if (argumentSuffixTree->IsDefined())
|
||
|
|
{
|
||
|
|
result = 1;
|
||
|
|
}
|
||
|
|
// both are defined
|
||
|
|
//both are equal
|
||
|
|
if (Equal(argumentSuffixTree))
|
||
|
|
{
|
||
|
|
result = 0;
|
||
|
|
}
|
||
|
|
else
|
||
|
|
{
|
||
|
|
//not equal, char Values equal
|
||
|
|
//the shorter text is smaller
|
||
|
|
|
||
|
|
//argument SuffixTree
|
||
|
|
Flob* argumentTextFlob = argumentSuffixTree->GetFLOB(2);
|
||
|
|
size_t argumentFlobLength = argumentTextFlob->getSize();
|
||
|
|
char *charArrayArgument = (char *) malloc(argumentFlobLength);
|
||
|
|
argumentTextFlob->read(charArrayArgument, argumentFlobLength, 0);
|
||
|
|
|
||
|
|
// current SuffixTree
|
||
|
|
size_t currentFlobLength = mInput.getSize();
|
||
|
|
char *charArrayCurrent = (char *) malloc(currentFlobLength);
|
||
|
|
mInput.read(charArrayCurrent, currentFlobLength, 0);
|
||
|
|
//add all char Values as integers
|
||
|
|
|
||
|
|
if (currentFlobLength < argumentFlobLength)
|
||
|
|
{
|
||
|
|
result = -1;
|
||
|
|
}
|
||
|
|
else if (currentFlobLength > argumentFlobLength)
|
||
|
|
{
|
||
|
|
result = 1;
|
||
|
|
}
|
||
|
|
else
|
||
|
|
{
|
||
|
|
//both text lengths are equal
|
||
|
|
//lexicographical
|
||
|
|
for (size_t pos = 0; pos < currentFlobLength; pos++)
|
||
|
|
{
|
||
|
|
if (*(charArrayCurrent + pos) != *(charArrayArgument + pos))
|
||
|
|
{
|
||
|
|
if (*(charArrayCurrent + pos) < *(charArrayArgument + pos))
|
||
|
|
{
|
||
|
|
result = -1;
|
||
|
|
}
|
||
|
|
else
|
||
|
|
{
|
||
|
|
result = 1;
|
||
|
|
}
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
free(charArrayCurrent);
|
||
|
|
free(charArrayArgument);
|
||
|
|
}
|
||
|
|
return result;
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
3.4 ~Adjacent~ function
|
||
|
|
|
||
|
|
*/
|
||
|
|
bool SuffixTree::Adjacent(const Attribute * arg) const
|
||
|
|
{
|
||
|
|
return false;
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
3.5 ~Sizeof~ functions
|
||
|
|
|
||
|
|
*/
|
||
|
|
size_t SuffixTree::Sizeof() const
|
||
|
|
{
|
||
|
|
return sizeof(*this);
|
||
|
|
}
|
||
|
|
|
||
|
|
int SuffixTree::SizeOfSuffixTree()
|
||
|
|
{
|
||
|
|
return sizeof(SuffixTree);
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
3.6 ~Clone~ functions
|
||
|
|
|
||
|
|
Returns a new created SuffixTree (clone) which is a
|
||
|
|
copy of ~this~.
|
||
|
|
|
||
|
|
*/
|
||
|
|
SuffixTree* SuffixTree::Clone() const
|
||
|
|
{
|
||
|
|
SuffixTree *newSuffixTree = new SuffixTree(true);
|
||
|
|
newSuffixTree->CopyFrom(this);
|
||
|
|
return newSuffixTree;
|
||
|
|
}
|
||
|
|
|
||
|
|
Word SuffixTree::CloneSuffixTree(const ListExpr typeInfo, const Word& w)
|
||
|
|
{
|
||
|
|
return SetWord(((SuffixTree*) w.addr)->Clone());
|
||
|
|
|
||
|
|
}
|
||
|
|
/*
|
||
|
|
3.7 ~Cast~ function
|
||
|
|
|
||
|
|
*/
|
||
|
|
void* SuffixTree::Cast(void* addr)
|
||
|
|
{
|
||
|
|
return new (addr) SuffixTree;
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
3.8 ~NumOfFlobs~
|
||
|
|
|
||
|
|
*/
|
||
|
|
inline int SuffixTree::NumOfFLOBs() const
|
||
|
|
{
|
||
|
|
return 3;
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
3.9 GetFlob
|
||
|
|
|
||
|
|
*/
|
||
|
|
inline Flob *SuffixTree::GetFLOB(const int i)
|
||
|
|
{
|
||
|
|
assert( i >= 0 && i < NumOfFLOBs() );
|
||
|
|
|
||
|
|
if (i == 0)
|
||
|
|
{
|
||
|
|
return &mSuffixTree;
|
||
|
|
} else if(i == 1) {
|
||
|
|
return &mSuffixIndex;
|
||
|
|
}
|
||
|
|
|
||
|
|
return &mInput;
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
4. Additional functions
|
||
|
|
|
||
|
|
*/
|
||
|
|
|
||
|
|
/*
|
||
|
|
4.1 Print
|
||
|
|
|
||
|
|
*/
|
||
|
|
ostream& SuffixTree::Print(ostream &os) const
|
||
|
|
{
|
||
|
|
os << "SuffixTree: ";
|
||
|
|
if (IsDefined())
|
||
|
|
{
|
||
|
|
os << "DEFINED" << endl;
|
||
|
|
}
|
||
|
|
else
|
||
|
|
{
|
||
|
|
os << "UNDEFINED." << endl;
|
||
|
|
}
|
||
|
|
return os;
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
5. List Representation
|
||
|
|
|
||
|
|
*/
|
||
|
|
|
||
|
|
/*
|
||
|
|
5.1 ~Out~-Function
|
||
|
|
|
||
|
|
*/
|
||
|
|
ListExpr SuffixTree::Out(ListExpr typeInfo, Word value)
|
||
|
|
{
|
||
|
|
SuffixTree* suffixtree = static_cast<SuffixTree*> (value.addr);
|
||
|
|
ListExpr result;
|
||
|
|
if (suffixtree == NULL)
|
||
|
|
{
|
||
|
|
return nl->SymbolAtom(Symbol::UNDEFINED());
|
||
|
|
}
|
||
|
|
if (!suffixtree->IsDefined())
|
||
|
|
{
|
||
|
|
result = nl->SymbolAtom(Symbol::UNDEFINED());
|
||
|
|
}
|
||
|
|
else
|
||
|
|
{
|
||
|
|
SuffixTreeVertex *vertex = suffixtree -> LoadFromPersistent();
|
||
|
|
ListExpr text = nl->TextAtom();
|
||
|
|
|
||
|
|
// force complete loading of our tree
|
||
|
|
vertex -> GetNumberOfLeaves();
|
||
|
|
|
||
|
|
nl->AppendText(text, *(vertex -> GetInput()));
|
||
|
|
ListExpr tree = SuffixTreeVertex::CreateListExprFromSuffixtree(vertex);
|
||
|
|
result = nl->TwoElemList(tree, text);
|
||
|
|
delete vertex;
|
||
|
|
}
|
||
|
|
return result;
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
5.2 ~In~-Function
|
||
|
|
|
||
|
|
*/
|
||
|
|
Word SuffixTree::In(const ListExpr typeInfo, const ListExpr instance,
|
||
|
|
const int errorPos, ListExpr& errorInfo, bool& correct)
|
||
|
|
{
|
||
|
|
Word w = SetWord(Address(0));
|
||
|
|
SuffixTree* suffixtree = new SuffixTree(false);
|
||
|
|
correct = false;
|
||
|
|
if (nl->ListLength(instance) == 2)
|
||
|
|
{
|
||
|
|
ListExpr treeLE = nl->First(instance);
|
||
|
|
ListExpr text = nl->Second(instance);
|
||
|
|
Word t = ftext::InFText(typeInfo, text, errorPos, errorInfo, correct);
|
||
|
|
FText* newFText = static_cast<FText*> (t.addr);
|
||
|
|
if (newFText->IsDefined())
|
||
|
|
{
|
||
|
|
try
|
||
|
|
{
|
||
|
|
// Create our RootVertex
|
||
|
|
SuffixTreeVertex* vertex =
|
||
|
|
SuffixTreeVertex::CreateSuffixtreeFromListExpr(treeLE,
|
||
|
|
new string(newFText->GetValue()));
|
||
|
|
|
||
|
|
delete suffixtree;
|
||
|
|
suffixtree = new SuffixTree(vertex);
|
||
|
|
|
||
|
|
// Delete our im memory tree
|
||
|
|
delete vertex;
|
||
|
|
vertex = NULL;
|
||
|
|
|
||
|
|
correct = true;
|
||
|
|
} catch (string& s)
|
||
|
|
{
|
||
|
|
cmsg.inFunError("The first element is not a suffixtree.");
|
||
|
|
}
|
||
|
|
}
|
||
|
|
else
|
||
|
|
{
|
||
|
|
cmsg.inFunError("Undefined ftext");
|
||
|
|
}
|
||
|
|
|
||
|
|
// cleanup
|
||
|
|
delete newFText;
|
||
|
|
newFText = NULL;
|
||
|
|
}
|
||
|
|
else
|
||
|
|
{
|
||
|
|
cmsg.inFunError(
|
||
|
|
"Expecting two elements! First a suffixtree and then a ftext");
|
||
|
|
}
|
||
|
|
w.addr = suffixtree;
|
||
|
|
return w;
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
5.3 ~Create~-function
|
||
|
|
|
||
|
|
*/
|
||
|
|
Word SuffixTree::Create(const ListExpr typeInfo)
|
||
|
|
{
|
||
|
|
return SetWord(new SuffixTree(false));
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
5.4 ~KindCheck~ Kind Checking Function
|
||
|
|
|
||
|
|
This function checks whether the type constructor is applied
|
||
|
|
correctly. Since type constructor ~SuffixTree~ does not have arguments,
|
||
|
|
this is trivial.
|
||
|
|
|
||
|
|
*/
|
||
|
|
bool SuffixTree::KindCheck(ListExpr type, ListExpr& errorInfo)
|
||
|
|
{
|
||
|
|
return (nl->IsEqual(type, SuffixTree::BasicType()));
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
5.5 ~Close~-function
|
||
|
|
|
||
|
|
*/
|
||
|
|
void SuffixTree::Close(const ListExpr typeInfo, Word& w)
|
||
|
|
{
|
||
|
|
delete static_cast<SuffixTree*> (w.addr);
|
||
|
|
w.addr = 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
5.6 ~Delete~-function
|
||
|
|
|
||
|
|
*/
|
||
|
|
void SuffixTree::Delete(const ListExpr typeInfo, Word& w)
|
||
|
|
{
|
||
|
|
delete static_cast<SuffixTree*> (w.addr);
|
||
|
|
w.addr = 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
5.7 ~Open~-function
|
||
|
|
|
||
|
|
*/
|
||
|
|
bool SuffixTree::Open( SmiRecord& valueRecord, size_t& offset,
|
||
|
|
const ListExpr typeInfo, Word& value )
|
||
|
|
{
|
||
|
|
SuffixTree *p = (SuffixTree*)Attribute::Open( valueRecord, offset, typeInfo );
|
||
|
|
value.setAddr( p );
|
||
|
|
return true;
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
5.5 Create a persistent representation of our
|
||
|
|
transient SuffixTree
|
||
|
|
|
||
|
|
*/
|
||
|
|
void SuffixTree::SaveToPersistent(SuffixTreeVertex* vertex)
|
||
|
|
{
|
||
|
|
// persist text
|
||
|
|
mInput.clean();
|
||
|
|
const char* newString = vertex -> GetInput() -> c_str();
|
||
|
|
if (newString != NULL)
|
||
|
|
{
|
||
|
|
size_t sz = strlen(newString) + 1;
|
||
|
|
if (sz > 0)
|
||
|
|
{
|
||
|
|
assert(newString[sz-1]==0);
|
||
|
|
mInput.write(newString, sz);
|
||
|
|
}
|
||
|
|
else
|
||
|
|
{
|
||
|
|
char d = 0;
|
||
|
|
mInput.write(&d, 1);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
// Persist vertex
|
||
|
|
mSuffixTree.clean();
|
||
|
|
mSuffixIndex.clean();
|
||
|
|
|
||
|
|
// The suffixtree is processed by breadth-first-traversal. If a node has not
|
||
|
|
// yet been processed, a pointer on this node is stored in vertexQueue.
|
||
|
|
queue<SuffixTreeVertex*> vertexQueue;
|
||
|
|
SuffixTreeVertex* currentVertex;
|
||
|
|
SuffixTreeEdge* currentEdge;
|
||
|
|
|
||
|
|
size_t vertexId = 0;
|
||
|
|
size_t edgeCounter = 0;
|
||
|
|
|
||
|
|
// calculate vertexIds
|
||
|
|
vertexQueue.push(vertex);
|
||
|
|
while (!vertexQueue.empty())
|
||
|
|
{
|
||
|
|
currentVertex = vertexQueue.front();
|
||
|
|
vertexQueue.pop();
|
||
|
|
|
||
|
|
currentVertex -> SetVertexId(vertexId);
|
||
|
|
|
||
|
|
for (size_t edgeNo = 0; edgeNo < currentVertex->GetEdgeCount(); edgeNo++)
|
||
|
|
{
|
||
|
|
currentEdge = currentVertex->GetEdgeAt(edgeNo);
|
||
|
|
|
||
|
|
if (currentEdge->GetChild() != NULL)
|
||
|
|
{
|
||
|
|
vertexQueue.push(currentEdge->GetChild());
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
vertexId++;
|
||
|
|
}
|
||
|
|
|
||
|
|
// Persist data
|
||
|
|
vertexQueue.push(vertex);
|
||
|
|
while (!vertexQueue.empty())
|
||
|
|
{
|
||
|
|
currentVertex = vertexQueue.front();
|
||
|
|
vertexQueue.pop();
|
||
|
|
// For each node, first the number of outgoing edges is stored,
|
||
|
|
// followed by the respective edges' start and end values.
|
||
|
|
mSuffixTree.Append(currentVertex -> GetVertexId());
|
||
|
|
mSuffixTree.Append(currentVertex->GetEdgeCount());
|
||
|
|
|
||
|
|
for (size_t edgeNo = 0; edgeNo < currentVertex->GetEdgeCount(); edgeNo++)
|
||
|
|
{
|
||
|
|
currentEdge = currentVertex->GetEdgeAt(edgeNo);
|
||
|
|
mSuffixTree.Append(currentEdge->GetStartIndex());
|
||
|
|
mSuffixTree.Append(currentEdge->GetEndIndex());
|
||
|
|
|
||
|
|
if (currentEdge->GetChild() != NULL)
|
||
|
|
{
|
||
|
|
// The currently processed edge has a subsequent node. A pointer
|
||
|
|
// on this node is stored in vertexQueue.
|
||
|
|
vertexQueue.push(currentEdge->GetChild());
|
||
|
|
mSuffixTree.Append(currentEdge->GetChild()->GetVertexId());
|
||
|
|
} else {
|
||
|
|
// Leaf, so our vertex id is 0
|
||
|
|
mSuffixTree.Append(0);
|
||
|
|
}
|
||
|
|
|
||
|
|
edgeCounter++;
|
||
|
|
}
|
||
|
|
|
||
|
|
// build index
|
||
|
|
mSuffixIndex.Append(edgeCounter);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
|
||
|
|
/*
|
||
|
|
5.6 Create a transient representation of our
|
||
|
|
persistent SuffixTree
|
||
|
|
|
||
|
|
*/
|
||
|
|
SuffixTreeVertex* SuffixTree::LoadFromPersistent()
|
||
|
|
{
|
||
|
|
string *OurText = new string();
|
||
|
|
OurText -> resize(mInput.getSize() + 2, '\0');
|
||
|
|
|
||
|
|
//delete[] s;
|
||
|
|
//s = NULL;
|
||
|
|
|
||
|
|
// Load only child of the root node
|
||
|
|
// the next elements are loaded on demand through our
|
||
|
|
// SuffixTreeLoader
|
||
|
|
SuffixTreeVertex *vertex = new SuffixTreeVertex(OurText);
|
||
|
|
SuffixTreeLoader *loader =
|
||
|
|
new SuffixTreeLoader(&mSuffixTree, &mSuffixIndex, &mInput);
|
||
|
|
vertex -> SetLoader(loader);
|
||
|
|
|
||
|
|
// load persisted vertex
|
||
|
|
SuffixTreeVertex* currentVertex = vertex;
|
||
|
|
size_t offset = 0, vertexId = 0, edgeNo = 0;
|
||
|
|
|
||
|
|
// offset stores the index on the next node to be processed. The
|
||
|
|
// element in mSuffixtree[offset] stores the node's number of
|
||
|
|
// outgoing edges.
|
||
|
|
mSuffixTree.Get(offset++, vertexId);
|
||
|
|
mSuffixTree.Get(offset++, edgeNo);
|
||
|
|
|
||
|
|
currentVertex -> SetVertexId(vertexId);
|
||
|
|
|
||
|
|
for (size_t edgeIndex = 0; edgeIndex < edgeNo; edgeIndex++)
|
||
|
|
{
|
||
|
|
size_t start, end, destVertexId;
|
||
|
|
//mSuffixTree.Get((int) offset + (3 * edgeIndex) + 2, start);
|
||
|
|
mSuffixTree.Get((int) offset++, start);
|
||
|
|
mSuffixTree.Get((int) offset++, end);
|
||
|
|
mSuffixTree.Get((int) offset++, destVertexId);
|
||
|
|
|
||
|
|
loader -> LoadTextForIndex(vertex, start, start);
|
||
|
|
|
||
|
|
SuffixTreeEdge* newEdge = new SuffixTreeEdge(start, end);
|
||
|
|
currentVertex->InsertEdge(newEdge);
|
||
|
|
|
||
|
|
// Inner edge?
|
||
|
|
if (destVertexId != 0)
|
||
|
|
{
|
||
|
|
SuffixTreeVertex* newVertex = new SuffixTreeVertex(OurText);
|
||
|
|
newVertex -> SetVertexId(destVertexId);
|
||
|
|
newEdge->SetChild(newVertex);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
return vertex;
|
||
|
|
}
|
||
|
|
/*
|
||
|
|
5.7 ~SuffixTreeInfo~ Type Description
|
||
|
|
done by implementing a subclass of ~ConstructorInfo~.
|
||
|
|
|
||
|
|
*/
|
||
|
|
struct SuffixTreeInfo: ConstructorInfo
|
||
|
|
{
|
||
|
|
|
||
|
|
SuffixTreeInfo()
|
||
|
|
{
|
||
|
|
//example babac$
|
||
|
|
name = SuffixTree::BasicType();
|
||
|
|
signature = "-> " + Kind::DATA();
|
||
|
|
typeExample = SuffixTree::BasicType();
|
||
|
|
listRep = "(<suffixtree><text>)";
|
||
|
|
// example text is babac
|
||
|
|
valueExample = "(((5 5 ())(1 1 (( 2 5 ())(4 5 ())))"
|
||
|
|
"(0 1 (( 2 5 ())(4 5 ())))(4 5 ()))"
|
||
|
|
"(4 5 ()))text)";
|
||
|
|
remarks = "first value has to be a SuffixTree, second value"
|
||
|
|
" from type Text.";
|
||
|
|
}
|
||
|
|
};
|
||
|
|
|
||
|
|
/*
|
||
|
|
5.8 Creation of the Type Constructor Instance
|
||
|
|
|
||
|
|
template class
|
||
|
|
~ConstructorFunctions~ which will create many default implementations of
|
||
|
|
functions used by a Secondo type.
|
||
|
|
|
||
|
|
*/
|
||
|
|
struct SuffixTreeFunctions: ConstructorFunctions<SuffixTree>
|
||
|
|
{
|
||
|
|
|
||
|
|
SuffixTreeFunctions()
|
||
|
|
{
|
||
|
|
// re-assign some function pointers
|
||
|
|
create = SuffixTree::Create;
|
||
|
|
deletion = SuffixTree::Delete;
|
||
|
|
open = SuffixTree::Open;
|
||
|
|
save = SaveAttribute<SuffixTree> ;
|
||
|
|
in = SuffixTree::In;
|
||
|
|
out = SuffixTree::Out;
|
||
|
|
close = SuffixTree::Close;
|
||
|
|
clone = SuffixTree::CloneSuffixTree;
|
||
|
|
cast = SuffixTree::Cast;
|
||
|
|
sizeOf = SuffixTree::SizeOfSuffixTree;
|
||
|
|
kindCheck = SuffixTree::KindCheck;
|
||
|
|
// the default implementations for open and save are only
|
||
|
|
// suitable for a class which is derived from class ~Attribute~, hence
|
||
|
|
// open and save functions must be overwritten here.
|
||
|
|
}
|
||
|
|
};
|
||
|
|
|
||
|
|
SuffixTreeInfo suffInfo;
|
||
|
|
SuffixTreeFunctions suffFunctions;
|
||
|
|
TypeConstructor SuffixTreeTC(suffInfo, suffFunctions);
|
||
|
|
|
||
|
|
/*
|
||
|
|
6 Creating Operators
|
||
|
|
|
||
|
|
6.1 Type Mapping Functions
|
||
|
|
|
||
|
|
A type mapping function checks whether the correct argument types are supplied
|
||
|
|
for an operator; if so, it returns a list expression for the result type,
|
||
|
|
otherwise the symbol ~typeerror~.
|
||
|
|
|
||
|
|
6.1.1 ~createSuffixTreeTypeMap~ type mapping for create function
|
||
|
|
|
||
|
|
*/
|
||
|
|
ListExpr createSuffixTreeTypeMap(ListExpr args)
|
||
|
|
{
|
||
|
|
ListExpr arg;
|
||
|
|
string nlchrs;
|
||
|
|
if (nl->ListLength(args) == 1)
|
||
|
|
{
|
||
|
|
arg = nl->First(args);
|
||
|
|
if (nl->IsEqual(arg, typeName))
|
||
|
|
{
|
||
|
|
return NList(SuffixTree::BasicType()).listExpr();
|
||
|
|
}
|
||
|
|
else
|
||
|
|
{
|
||
|
|
return NList::typeError("Expecting a text from type FText");
|
||
|
|
}
|
||
|
|
}
|
||
|
|
else
|
||
|
|
{
|
||
|
|
return NList::typeError("One argument expected.");
|
||
|
|
}
|
||
|
|
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
6.1.2 ~patternOccursTypeMap~
|
||
|
|
|
||
|
|
*/
|
||
|
|
ListExpr patternOccursTypeMap(ListExpr args)
|
||
|
|
{
|
||
|
|
NList type(args);
|
||
|
|
const string errMsg = "Expecting a SuffixTree and a text from type FText ";
|
||
|
|
|
||
|
|
// SuffixTree x text -> bool
|
||
|
|
if (type == NList(SuffixTree::BasicType(), FText::BasicType()))
|
||
|
|
{
|
||
|
|
return NList(CcBool::BasicType()).listExpr();
|
||
|
|
}
|
||
|
|
return NList::typeError(errMsg);
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
6.1.3 ~patternPositionsTypeMap~
|
||
|
|
|
||
|
|
*/
|
||
|
|
ListExpr patternPositionsTypeMap(ListExpr args)
|
||
|
|
{
|
||
|
|
NList type(args);
|
||
|
|
const string errMsg = "Expecting a SuffixTree and a text from type FText ";
|
||
|
|
|
||
|
|
// SuffixTree x text -> stream(int)
|
||
|
|
if (type == NList(SuffixTree::BasicType(), FText::BasicType()))
|
||
|
|
{
|
||
|
|
return nl->TwoElemList(nl->SymbolAtom(Stream<CcInt>::BasicType()),
|
||
|
|
nl->SymbolAtom(CcInt::BasicType()));
|
||
|
|
}
|
||
|
|
return NList::typeError(errMsg);
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
6.1.4 ~patternCountTypeMap~
|
||
|
|
|
||
|
|
*/
|
||
|
|
ListExpr patternCountTypeMap(ListExpr args)
|
||
|
|
{
|
||
|
|
NList type(args);
|
||
|
|
const string errMsg = "Expecting a SuffixTree and a text from type FText ";
|
||
|
|
|
||
|
|
// SuffixTree x text -> int
|
||
|
|
if (type == NList(SuffixTree::BasicType(), FText::BasicType()))
|
||
|
|
{
|
||
|
|
return NList(CcInt::BasicType()).listExpr();
|
||
|
|
}
|
||
|
|
return NList::typeError(errMsg);
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
|
||
|
|
6.1.5 ~longestRepeatedSubstringTypeMap~
|
||
|
|
|
||
|
|
*/
|
||
|
|
ListExpr suffixtree_textstreamTypeMap(ListExpr args)
|
||
|
|
{
|
||
|
|
ListExpr resultType;
|
||
|
|
if (nl->ListLength(args) == 1)
|
||
|
|
{
|
||
|
|
if (nl->IsEqual(nl->First(args), SuffixTree::BasicType()))
|
||
|
|
{
|
||
|
|
resultType = nl->TwoElemList(nl->SymbolAtom(Symbol::STREAM()),
|
||
|
|
nl->SymbolAtom(FText::BasicType()));
|
||
|
|
}
|
||
|
|
else
|
||
|
|
{
|
||
|
|
resultType = NList::typeError("Expecting a SuffixTree");
|
||
|
|
}
|
||
|
|
}
|
||
|
|
else
|
||
|
|
{
|
||
|
|
resultType = NList::typeError("One argument expected.");
|
||
|
|
}
|
||
|
|
return resultType;
|
||
|
|
}
|
||
|
|
|
||
|
|
|
||
|
|
|
||
|
|
/*
|
||
|
|
6.1.7 ~longestCommonSubstringTypeMap~
|
||
|
|
|
||
|
|
*/
|
||
|
|
ListExpr longestCommonSubstringTypeMap(ListExpr args)
|
||
|
|
{
|
||
|
|
ListExpr resultType;
|
||
|
|
if (nl->ListLength(args) == 2)
|
||
|
|
{
|
||
|
|
if (nl->IsEqual(nl->First(args), FText::BasicType()) &&
|
||
|
|
nl->IsEqual(nl->Second(args), FText::BasicType()))
|
||
|
|
{
|
||
|
|
resultType = nl->TwoElemList(nl->SymbolAtom(Symbol::STREAM()),
|
||
|
|
nl->SymbolAtom(FText::BasicType()));
|
||
|
|
}
|
||
|
|
else
|
||
|
|
{
|
||
|
|
resultType = NList::typeError("two arguments of type FText expected.");
|
||
|
|
}
|
||
|
|
}
|
||
|
|
else
|
||
|
|
{
|
||
|
|
resultType = NList::typeError("two arguments of type FText expected.");
|
||
|
|
}
|
||
|
|
return resultType;
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
6.1.8 ~maximalUniqueMatchesTypeMap~
|
||
|
|
|
||
|
|
*/
|
||
|
|
ListExpr maximalUniqueMatchesTypeMap(ListExpr args)
|
||
|
|
{
|
||
|
|
ListExpr resultType;
|
||
|
|
if (nl->ListLength(args) == 2)
|
||
|
|
{
|
||
|
|
if (nl->IsEqual(nl->First(args), FText::BasicType()) &&
|
||
|
|
nl->IsEqual(nl->Second(args), FText::BasicType()))
|
||
|
|
{
|
||
|
|
resultType = nl->TwoElemList(nl->SymbolAtom(Symbol::STREAM()),
|
||
|
|
nl->SymbolAtom(FText::BasicType()));
|
||
|
|
}
|
||
|
|
else
|
||
|
|
{
|
||
|
|
resultType = NList::typeError("two arguments of type FText expected.");
|
||
|
|
}
|
||
|
|
}
|
||
|
|
else
|
||
|
|
{
|
||
|
|
resultType = NList::typeError("two arguments of type FText expected.");
|
||
|
|
}
|
||
|
|
return resultType;
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
6.1.9 ~circularStringLinearisationTypeMap~
|
||
|
|
|
||
|
|
*/
|
||
|
|
ListExpr circularStringLinearizationTypeMap(ListExpr args)
|
||
|
|
{
|
||
|
|
ListExpr resultType;
|
||
|
|
if (nl->ListLength(args) == 1)
|
||
|
|
{
|
||
|
|
if (nl->IsEqual(nl->First(args), FText::BasicType()))
|
||
|
|
{
|
||
|
|
|
||
|
|
|
||
|
|
// text -> stream(tuple(linstr: text, pos: int))
|
||
|
|
return nl->TwoElemList(nl->SymbolAtom(Stream<Tuple>::BasicType()),
|
||
|
|
nl->TwoElemList(nl->SymbolAtom(Tuple::BasicType()),
|
||
|
|
nl->TwoElemList(
|
||
|
|
nl->TwoElemList(nl->SymbolAtom("linstr"),
|
||
|
|
nl->SymbolAtom(FText::BasicType())),
|
||
|
|
nl->TwoElemList(nl->SymbolAtom("pos"),
|
||
|
|
nl->SymbolAtom(CcInt::BasicType()))
|
||
|
|
)));
|
||
|
|
|
||
|
|
}
|
||
|
|
else
|
||
|
|
{
|
||
|
|
resultType = NList::typeError("one argument of type FText expected.");
|
||
|
|
}
|
||
|
|
}
|
||
|
|
else
|
||
|
|
{
|
||
|
|
resultType = NList::typeError("one argument of type FText expected.");
|
||
|
|
}
|
||
|
|
return resultType;
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
6.1.10 ~kMismatchTypeMap~
|
||
|
|
|
||
|
|
*/
|
||
|
|
ListExpr kMismatchTypeMap(ListExpr args)
|
||
|
|
{
|
||
|
|
NList type(args);
|
||
|
|
const string errMsg = "Expecting a SuffixTree, a text from type FText"
|
||
|
|
"and an int";
|
||
|
|
|
||
|
|
// SuffixTree x text x int -> stream(text)
|
||
|
|
if (type == NList(SuffixTree::BasicType(),FText::BasicType(),
|
||
|
|
CcInt::BasicType()))
|
||
|
|
{
|
||
|
|
return nl->TwoElemList(nl->SymbolAtom(Symbol::STREAM()),
|
||
|
|
nl->SymbolAtom(FText::BasicType()));
|
||
|
|
}
|
||
|
|
return NList::typeError(errMsg);
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
6.1.10 ~DeleteTerminalSymbolFromText~
|
||
|
|
|
||
|
|
*/
|
||
|
|
void DeleteTerminalSymbolFromText(string& text)
|
||
|
|
{
|
||
|
|
size_t pos = 0;
|
||
|
|
pos = text.find(terminationCharacter);
|
||
|
|
while (pos != string::npos)
|
||
|
|
{
|
||
|
|
text.erase(pos, 1);
|
||
|
|
pos = text.find(terminationCharacter, pos);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
6.2 Value Mapping Functions
|
||
|
|
|
||
|
|
6.2.1 ~createSuffixTreeFunction~ value mapping for create function
|
||
|
|
|
||
|
|
This function creates a suffixtree from a text with the Ukkonen algorithm
|
||
|
|
in linear time.
|
||
|
|
|
||
|
|
*/
|
||
|
|
int createSuffixTreeFunction(Word* args, Word& result, int message,
|
||
|
|
Word& local, Supplier s)
|
||
|
|
{
|
||
|
|
result = qp->ResultStorage(s);
|
||
|
|
|
||
|
|
FText* ftext = static_cast<FText*> (args[0].addr);
|
||
|
|
string text = ftext->GetValue();
|
||
|
|
DeleteTerminalSymbolFromText(text);
|
||
|
|
text = text + terminationCharacter;
|
||
|
|
|
||
|
|
SuffixTreeVertex* tempTree =
|
||
|
|
UkkonenTreeBuilder::CreateSuffixTree(new string(text));
|
||
|
|
|
||
|
|
if (tempTree != NULL)
|
||
|
|
{
|
||
|
|
SuffixTree *res = static_cast<SuffixTree*> (result.addr);
|
||
|
|
res -> SaveToPersistent(tempTree);
|
||
|
|
res -> SetDefined(true);
|
||
|
|
|
||
|
|
// Clean up
|
||
|
|
delete tempTree;
|
||
|
|
tempTree = NULL;
|
||
|
|
}
|
||
|
|
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
6.2.1 ~createSuffixTreeFunction~ value mapping for create function
|
||
|
|
with quadratic running time
|
||
|
|
|
||
|
|
*/
|
||
|
|
int createSuffixTreeFunction_quadratic(Word* args, Word& result,
|
||
|
|
int message, Word& local, Supplier s)
|
||
|
|
{
|
||
|
|
result = qp->ResultStorage(s);
|
||
|
|
|
||
|
|
FText* ftext = static_cast<FText*> (args[0].addr);
|
||
|
|
string text = ftext->GetValue();
|
||
|
|
DeleteTerminalSymbolFromText(text);
|
||
|
|
text = text + terminationCharacter;
|
||
|
|
SuffixTreeVertex* tempTree =
|
||
|
|
SimpleTreeBuilder::CreateSuffixTree(new string(text));
|
||
|
|
|
||
|
|
if (tempTree != NULL)
|
||
|
|
{
|
||
|
|
SuffixTree *res = static_cast<SuffixTree*> (result.addr);
|
||
|
|
res -> SaveToPersistent(tempTree);
|
||
|
|
res -> SetDefined(true);
|
||
|
|
|
||
|
|
// Clean up
|
||
|
|
delete tempTree;
|
||
|
|
tempTree = NULL;
|
||
|
|
}
|
||
|
|
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
6.2.2 ~patternOccursFunction~
|
||
|
|
Returns true if the pattern p occurs in text s, else returns false.
|
||
|
|
The function decides the result in O(|p|).
|
||
|
|
|
||
|
|
*/
|
||
|
|
int patternOccursFunction(Word* args, Word& result, int message, Word& local,
|
||
|
|
Supplier s)
|
||
|
|
{
|
||
|
|
bool defined = true;
|
||
|
|
bool occurs = false;
|
||
|
|
|
||
|
|
SuffixTree* tree = static_cast<SuffixTree*>( args[0].addr );
|
||
|
|
|
||
|
|
FText *text = static_cast<FText*>( args[1].addr );
|
||
|
|
string searchPattern = text -> GetValue();
|
||
|
|
|
||
|
|
if( ! tree->IsDefined() || ! text->IsDefined() )
|
||
|
|
{
|
||
|
|
defined = false;
|
||
|
|
}
|
||
|
|
else
|
||
|
|
{
|
||
|
|
if(searchPattern.length() > 0)
|
||
|
|
{
|
||
|
|
const SuffixTreeEdge *ourEdge = NULL;
|
||
|
|
int offset = -1;
|
||
|
|
|
||
|
|
bool patternFound = tree -> GetInMemoryTree() -> FindEdgeForSearchPattern
|
||
|
|
(searchPattern, &ourEdge, &offset);
|
||
|
|
|
||
|
|
if( patternFound )
|
||
|
|
{
|
||
|
|
occurs = true;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
result = qp->ResultStorage(s);
|
||
|
|
//query processor has provided
|
||
|
|
//a CcBool instance for the result
|
||
|
|
|
||
|
|
CcBool* b = static_cast<CcBool*> (result.addr);
|
||
|
|
|
||
|
|
b->Set(defined, occurs); //the first argument says the boolean
|
||
|
|
//value is defined, the second is the
|
||
|
|
//real boolean value)
|
||
|
|
|
||
|
|
return 0;
|
||
|
|
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
6.2.3 ~patternPositionsFunction~
|
||
|
|
|
||
|
|
Find the positions for a given search pattern in our suffixtree
|
||
|
|
|
||
|
|
1. Search the given pattern in our SuffixTree.
|
||
|
|
2. Every vertex below this position is a position of our pattern
|
||
|
|
|
||
|
|
*/
|
||
|
|
int patternPositionsFunction(Word* args, Word& result, int message,
|
||
|
|
Word& local, Supplier s)
|
||
|
|
{
|
||
|
|
|
||
|
|
switch( message )
|
||
|
|
|
||
|
|
{
|
||
|
|
case OPEN: {
|
||
|
|
list<int> *positions = new list<int> ();
|
||
|
|
|
||
|
|
map<SuffixTreeVertex*, int> vertexDepthMap;
|
||
|
|
|
||
|
|
SuffixTree* p = static_cast<SuffixTree*>( args[0].addr );
|
||
|
|
FText *text = static_cast<FText*>( args[1].addr );
|
||
|
|
string searchPattern = text -> GetValue();
|
||
|
|
|
||
|
|
if( p->IsDefined() && text->IsDefined() )
|
||
|
|
{
|
||
|
|
if(searchPattern.length() > 0)
|
||
|
|
{
|
||
|
|
// search position in suffix tree
|
||
|
|
const SuffixTreeEdge *ourEdge = NULL;
|
||
|
|
int offset = -1;
|
||
|
|
|
||
|
|
bool patternFound = p -> GetInMemoryTree() -> FindEdgeForSearchPattern
|
||
|
|
(searchPattern, &ourEdge, &offset);
|
||
|
|
|
||
|
|
if( patternFound )
|
||
|
|
{
|
||
|
|
// Find all edges below our position
|
||
|
|
queue<SuffixTreeVertex*> vertices;
|
||
|
|
|
||
|
|
// Is searchPattern not unique?
|
||
|
|
if( ourEdge -> HasVertex() )
|
||
|
|
{
|
||
|
|
// Calculate depth for ChildVertex
|
||
|
|
int vertexDepth = searchPattern.length()
|
||
|
|
+ (ourEdge -> GetLength() - offset);
|
||
|
|
|
||
|
|
vertices.push( ourEdge -> GetChild() );
|
||
|
|
|
||
|
|
// Save vertex depth
|
||
|
|
vertexDepthMap[ourEdge -> GetChild()] = vertexDepth;
|
||
|
|
}
|
||
|
|
else
|
||
|
|
{
|
||
|
|
// Add our edge to result
|
||
|
|
positions -> push_front(ourEdge -> GetStartIndex()
|
||
|
|
+ offset - searchPattern.length());
|
||
|
|
}
|
||
|
|
|
||
|
|
// Process all vertices and caclculate depth of the vertex
|
||
|
|
while(! vertices.empty())
|
||
|
|
{
|
||
|
|
SuffixTreeVertex *ourVertex = vertices.front();
|
||
|
|
vertices.pop();
|
||
|
|
|
||
|
|
// Read depth of our vertex
|
||
|
|
int depthOfOurVertex = vertexDepthMap[ourVertex];
|
||
|
|
|
||
|
|
size_t edges = ourVertex -> GetEdgeCount();
|
||
|
|
|
||
|
|
for(size_t i = 0; i < edges; i++)
|
||
|
|
{
|
||
|
|
SuffixTreeEdge *aEdge = ourVertex -> GetEdgeAt(i);
|
||
|
|
|
||
|
|
// Append position to result set
|
||
|
|
positions -> push_front(aEdge -> GetStartIndex()
|
||
|
|
- depthOfOurVertex);
|
||
|
|
|
||
|
|
if(aEdge -> HasVertex())
|
||
|
|
{
|
||
|
|
// Save depth for child vertices
|
||
|
|
vertexDepthMap[ aEdge -> GetChild() ]
|
||
|
|
= depthOfOurVertex + aEdge -> GetLength();
|
||
|
|
|
||
|
|
vertices.push( aEdge -> GetChild() );
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
positions -> sort();
|
||
|
|
positions -> unique();
|
||
|
|
|
||
|
|
local.setAddr(positions);
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
case REQUEST: {
|
||
|
|
|
||
|
|
list<int> *positions = static_cast< list<int>* >(local.addr);
|
||
|
|
|
||
|
|
// Are elements in our list?
|
||
|
|
if(! positions->empty())
|
||
|
|
{
|
||
|
|
// Return next position
|
||
|
|
int firstValue = positions -> front();
|
||
|
|
positions -> pop_front();
|
||
|
|
|
||
|
|
CcInt* elem = new CcInt(true, firstValue);
|
||
|
|
result.addr = elem;
|
||
|
|
return YIELD;
|
||
|
|
}
|
||
|
|
|
||
|
|
// End of list reached
|
||
|
|
result.addr = 0;
|
||
|
|
return CANCEL;
|
||
|
|
}
|
||
|
|
|
||
|
|
case CLOSE: {
|
||
|
|
|
||
|
|
// Clean up
|
||
|
|
if(local.addr != NULL)
|
||
|
|
{
|
||
|
|
list<int> *positions = static_cast< list<int>* >(local.addr);
|
||
|
|
delete positions;
|
||
|
|
local.setAddr(NULL);
|
||
|
|
}
|
||
|
|
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
default: {
|
||
|
|
/* should not happen */
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
6.2.4 ~patternCountFunction~
|
||
|
|
|
||
|
|
Count the appearing of a given pattern in our Suffixtree:
|
||
|
|
|
||
|
|
* search this pattern in our suffixtree
|
||
|
|
* count all leafes below this position. Every leaf is a
|
||
|
|
appearing
|
||
|
|
|
||
|
|
*/
|
||
|
|
int patternCountFunction(Word* args, Word& result, int message, Word& local,
|
||
|
|
Supplier s)
|
||
|
|
{
|
||
|
|
|
||
|
|
bool defined = true;
|
||
|
|
int count = 0;
|
||
|
|
|
||
|
|
SuffixTree* p = static_cast<SuffixTree*> (args[0].addr);
|
||
|
|
FText *text = static_cast<FText*> (args[1].addr);
|
||
|
|
string searchPattern = text -> GetValue();
|
||
|
|
|
||
|
|
if (!p->IsDefined() || !text->IsDefined())
|
||
|
|
{
|
||
|
|
defined = false;
|
||
|
|
}
|
||
|
|
else
|
||
|
|
{
|
||
|
|
if (searchPattern.length() > 0)
|
||
|
|
{
|
||
|
|
// search position in suffix tree
|
||
|
|
const SuffixTreeEdge *ourEdge = NULL;
|
||
|
|
int offset = -1;
|
||
|
|
|
||
|
|
// Count leaves
|
||
|
|
bool patternFound = p -> GetInMemoryTree() -> FindEdgeForSearchPattern(
|
||
|
|
searchPattern, &ourEdge, &offset);
|
||
|
|
|
||
|
|
if (patternFound)
|
||
|
|
{
|
||
|
|
// Result is unique
|
||
|
|
if (!ourEdge -> HasVertex())
|
||
|
|
{
|
||
|
|
count = 1;
|
||
|
|
}
|
||
|
|
else
|
||
|
|
{
|
||
|
|
count = (ourEdge -> GetChild()) -> GetNumberOfLeaves();
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
// Write result
|
||
|
|
result = qp->ResultStorage(s);
|
||
|
|
CcInt* res = static_cast<CcInt*> (result.addr);
|
||
|
|
|
||
|
|
res->Set(defined, count);
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
This function searches for the longest repeated substrings in vertex and
|
||
|
|
write it in curStringList.
|
||
|
|
|
||
|
|
*/
|
||
|
|
void SearchLongestRepeatedSubstring(SuffixTreeVertex* vertex,
|
||
|
|
list<string>* curStringList, string curString)
|
||
|
|
{
|
||
|
|
vertex -> TriggerLoadNextData();
|
||
|
|
for (size_t edgeNo = 0; edgeNo < vertex->GetEdgeCount(); edgeNo++)
|
||
|
|
{
|
||
|
|
SuffixTreeEdge* edge = vertex->GetEdgeAt(edgeNo);
|
||
|
|
if (edge->HasVertex())
|
||
|
|
{
|
||
|
|
edge ->GetChild() -> TriggerLoadNextData();
|
||
|
|
// there is another node which lies lower in the tree
|
||
|
|
SearchLongestRepeatedSubstring(
|
||
|
|
edge->GetChild(),
|
||
|
|
curStringList,
|
||
|
|
curString + vertex->GetInput()->substr(edge->GetStartIndex(),
|
||
|
|
edge->GetLength()));
|
||
|
|
}
|
||
|
|
else
|
||
|
|
{
|
||
|
|
// vertex is relating to edge the 'final' inner node before the leaf node
|
||
|
|
if (curString.length() == curStringList->front().length())
|
||
|
|
{
|
||
|
|
// curStringVector contains a string of the length curString.length().
|
||
|
|
// If curString is not yet included in the vector, it will be added
|
||
|
|
// to the vector.
|
||
|
|
if (curStringList->back()!=curString)
|
||
|
|
{
|
||
|
|
curStringList->push_back(curString);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
if (curString.length() > curStringList->front().length())
|
||
|
|
{
|
||
|
|
// curString is a new 'longest repeated substring'. The previously
|
||
|
|
// identified shorter repeated substrings will be deleted.
|
||
|
|
curStringList -> clear();
|
||
|
|
curStringList->push_front(curString);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
6.2.5.1 ~longestRepeatedSubstringFunction~
|
||
|
|
The function returns a stream of the longest repeated substrings.
|
||
|
|
They will be find by searching the deepest inner node before the leaf
|
||
|
|
with the greatest distance from the root.
|
||
|
|
|
||
|
|
*/
|
||
|
|
int longestRepeatedSubstringFunction(Word* args, Word& result, int message,
|
||
|
|
Word& local, Supplier s)
|
||
|
|
{
|
||
|
|
list<string>* repeatedStringList = static_cast<list<string>*> (local.addr);
|
||
|
|
|
||
|
|
switch (message)
|
||
|
|
{
|
||
|
|
case OPEN:
|
||
|
|
{ // initialize the local storage
|
||
|
|
SuffixTree* suffixtree = static_cast<SuffixTree*> (args[0].addr);
|
||
|
|
list<string>* curStringList = new list<string> ();
|
||
|
|
curStringList->push_front("");
|
||
|
|
string curString = "";
|
||
|
|
SearchLongestRepeatedSubstring(suffixtree -> GetInMemoryTree(),
|
||
|
|
curStringList, curString);
|
||
|
|
|
||
|
|
local.addr = curStringList;
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
case REQUEST:
|
||
|
|
{ // return the next stream element
|
||
|
|
|
||
|
|
if (!repeatedStringList->empty() && repeatedStringList->front().length()
|
||
|
|
> 0)
|
||
|
|
{
|
||
|
|
FText* elem = new FText(true, repeatedStringList->front());
|
||
|
|
repeatedStringList->pop_front();
|
||
|
|
result.addr = elem;
|
||
|
|
return YIELD;
|
||
|
|
}
|
||
|
|
else
|
||
|
|
{
|
||
|
|
result.addr = 0;
|
||
|
|
return CANCEL;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
case CLOSE:
|
||
|
|
{ // free the local storage
|
||
|
|
|
||
|
|
if (repeatedStringList != 0)
|
||
|
|
{
|
||
|
|
delete repeatedStringList;
|
||
|
|
local.addr = 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
default:
|
||
|
|
{
|
||
|
|
/* should never happen */
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
|
||
|
|
void SearchShortestUniqueSubstring(SuffixTreeVertex* vertex,
|
||
|
|
list<string>* curStringList, string curString)
|
||
|
|
{
|
||
|
|
for (size_t edgeNo = 0; edgeNo < vertex->GetEdgeCount(); edgeNo++)
|
||
|
|
{
|
||
|
|
SuffixTreeEdge* edge = vertex->GetEdgeAt(edgeNo);
|
||
|
|
if (edge->HasVertex())
|
||
|
|
{
|
||
|
|
// There is another node which lies lower in the tree.
|
||
|
|
SearchShortestUniqueSubstring(
|
||
|
|
edge->GetChild(),
|
||
|
|
curStringList,
|
||
|
|
curString + vertex->GetInput()->substr(edge->GetStartIndex(),
|
||
|
|
edge->GetLength()));
|
||
|
|
}
|
||
|
|
else
|
||
|
|
{
|
||
|
|
// vertex is relating to edge, the 'final' inner node before the
|
||
|
|
// leaf node
|
||
|
|
if (edge->GetStartIndex() < vertex->GetInput()->length() )
|
||
|
|
{
|
||
|
|
if((vertex->GetInput()->at(edge->GetStartIndex())) != '\0')
|
||
|
|
{
|
||
|
|
// curString is the prefix of the shortest unique substring, since
|
||
|
|
// the edge is labeled not only with the index for the guide.
|
||
|
|
string newUniqueString = curString + vertex->GetInput()->at(
|
||
|
|
edge->GetStartIndex());
|
||
|
|
|
||
|
|
if (newUniqueString.length() == curStringList->front().length())
|
||
|
|
{
|
||
|
|
if (curStringList->back()!=newUniqueString){
|
||
|
|
curStringList->push_back(newUniqueString);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
if (newUniqueString.length() < curStringList->front().length())
|
||
|
|
{
|
||
|
|
curStringList-> clear();
|
||
|
|
curStringList->push_front(newUniqueString);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
6.2.6 ~shortestUniqueSubstringFunction~
|
||
|
|
The function returns a stream of the shortes unique substrings.
|
||
|
|
They will be find by looking for the last inner node v before a leaf
|
||
|
|
fullfilling two conditions:
|
||
|
|
a) v has minimal stringlength
|
||
|
|
b) the outgoing edge is not only labled with the termination character.
|
||
|
|
The string spelled until v concatenated with the first character on the
|
||
|
|
outgoing edge is a unique substring.
|
||
|
|
|
||
|
|
*/
|
||
|
|
int shortestUniqueSubstringFunction(Word* args, Word& result, int message,
|
||
|
|
Word& local, Supplier s)
|
||
|
|
{
|
||
|
|
list<string>* repeatedStringList = static_cast<list<string>*> (local.addr);
|
||
|
|
|
||
|
|
switch (message)
|
||
|
|
{
|
||
|
|
case OPEN:
|
||
|
|
{ // initialize the local storage
|
||
|
|
SuffixTree* suffixtree = static_cast<SuffixTree*> (args[0].addr);
|
||
|
|
list<string>* curStringList = new list<string> ();
|
||
|
|
|
||
|
|
string data = *(suffixtree -> GetInMemoryTree() ->GetInput());
|
||
|
|
|
||
|
|
if(data[0] != '\0')
|
||
|
|
{
|
||
|
|
curStringList->push_front(data);
|
||
|
|
string curString = "";
|
||
|
|
|
||
|
|
SearchShortestUniqueSubstring(suffixtree -> GetInMemoryTree(),
|
||
|
|
curStringList, curString);
|
||
|
|
}
|
||
|
|
|
||
|
|
local.addr = curStringList;
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
case REQUEST:
|
||
|
|
{ // return the next stream element
|
||
|
|
|
||
|
|
if (!repeatedStringList->empty() && repeatedStringList->front().length()
|
||
|
|
> 0)
|
||
|
|
{
|
||
|
|
FText* elem = new FText(true, repeatedStringList->front());
|
||
|
|
repeatedStringList->pop_front();
|
||
|
|
result.addr = elem;
|
||
|
|
return YIELD;
|
||
|
|
}
|
||
|
|
else
|
||
|
|
{
|
||
|
|
result.addr = 0;
|
||
|
|
return CANCEL;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
case CLOSE:
|
||
|
|
{ // free the local storage
|
||
|
|
|
||
|
|
if (repeatedStringList != 0)
|
||
|
|
{
|
||
|
|
delete repeatedStringList;
|
||
|
|
local.addr = 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
default:
|
||
|
|
{
|
||
|
|
/* should never happen */
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
6.2.7 ~longestCommonSubstringFunction~
|
||
|
|
This function delivers the longest common substring of two strings.
|
||
|
|
Therefore it first construct a generalized suffix tree with additional
|
||
|
|
informations in the vertices. The additional informations are the current
|
||
|
|
string depth and whether the the node covers suffixes of string one or two
|
||
|
|
or both function AlterToGeneralizedST.
|
||
|
|
Then the algorithm search the node with the greatest string depth that is
|
||
|
|
marked with the information that it covers suffixes from both strings.
|
||
|
|
|
||
|
|
*/
|
||
|
|
void AlterToGeneralizedST(SuffixTreeVertex *st,size_t endS1)
|
||
|
|
{
|
||
|
|
for (size_t edge=0; edge<=st->GetEdgeCount()-1; edge++)
|
||
|
|
{
|
||
|
|
|
||
|
|
SuffixTreeEdge *e = st->GetEdgeAt(edge);
|
||
|
|
size_t ESI=e->GetStartIndex();
|
||
|
|
size_t EEI=e->GetEndIndex();
|
||
|
|
|
||
|
|
if (e->GetChild() != NULL)
|
||
|
|
{
|
||
|
|
SuffixTreeVertex *child = e->GetChild();
|
||
|
|
child->SetSDepth(st->GetSDepth()+e->GetLength());
|
||
|
|
|
||
|
|
|
||
|
|
AlterToGeneralizedST( child, endS1 );
|
||
|
|
|
||
|
|
SuffixTreeVertex *parent = e->GetParent();
|
||
|
|
parent->SetCov1(child->GetCov1() || parent->GetCov1());
|
||
|
|
parent->SetCov2(child->GetCov2() || parent->GetCov2());
|
||
|
|
}
|
||
|
|
else //edge to a leaf
|
||
|
|
{
|
||
|
|
// reduce 2nd edge index on leaf edges
|
||
|
|
if (ESI<=endS1 && EEI>endS1) e->SetEndIndex(endS1);
|
||
|
|
if (e->GetEndIndex()<=endS1) e->GetParent()->SetCov1(true);
|
||
|
|
if (e->GetEndIndex()>endS1) e->GetParent()->SetCov2(true);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
|
||
|
|
void lcs(SuffixTreeVertex *st,queue<string>*LCSqueue,string curPathLabel="")
|
||
|
|
{
|
||
|
|
for (size_t edge=0; edge<=st->GetEdgeCount()-1; edge++)
|
||
|
|
{
|
||
|
|
SuffixTreeEdge *e = st->GetEdgeAt(edge);
|
||
|
|
if (e->GetChild()!=NULL && e->GetChild()->GetCov1()==true &&
|
||
|
|
e->GetChild()->GetCov2()==true)
|
||
|
|
lcs(e->GetChild(), LCSqueue,
|
||
|
|
curPathLabel+
|
||
|
|
st->GetInput()->substr(e->GetStartIndex(),e->GetLength()));
|
||
|
|
else
|
||
|
|
{
|
||
|
|
string last = "";
|
||
|
|
if (!LCSqueue->empty())
|
||
|
|
last = LCSqueue->back();
|
||
|
|
|
||
|
|
if (last.length()<st->GetSDepth())
|
||
|
|
{
|
||
|
|
while (!LCSqueue->empty()) LCSqueue->pop();
|
||
|
|
LCSqueue->push(curPathLabel);
|
||
|
|
}
|
||
|
|
|
||
|
|
if (last.length() == st->GetSDepth() && last!=curPathLabel)
|
||
|
|
LCSqueue->push(curPathLabel);
|
||
|
|
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
|
||
|
|
int longestCommonSubstringFunction(Word* args, Word& result, int message,
|
||
|
|
Word& local, Supplier s)
|
||
|
|
{
|
||
|
|
queue<string> *LCSqueue = static_cast<queue<string>*> (local.addr);
|
||
|
|
|
||
|
|
switch (message)
|
||
|
|
{
|
||
|
|
case OPEN:
|
||
|
|
{
|
||
|
|
FText *t1 = static_cast<FText*> (args[0].addr);
|
||
|
|
FText *t2 = static_cast<FText*> (args[1].addr);
|
||
|
|
queue<string> *q = new queue<string> ();
|
||
|
|
|
||
|
|
if (t1->IsDefined() && t2->IsDefined())
|
||
|
|
{
|
||
|
|
string s1 = t1->GetValue();
|
||
|
|
string s2 = t2->GetValue();
|
||
|
|
string mergedText = s1+textSeparator+s2+terminationCharacter;
|
||
|
|
|
||
|
|
SuffixTreeVertex *root =
|
||
|
|
SimpleTreeBuilder::CreateSuffixTree(new string(mergedText));
|
||
|
|
AlterToGeneralizedST(root,s1.length());
|
||
|
|
|
||
|
|
lcs (root, q);
|
||
|
|
delete root;
|
||
|
|
}
|
||
|
|
local.addr = q;
|
||
|
|
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
case REQUEST:
|
||
|
|
{
|
||
|
|
// return next stream elemt
|
||
|
|
if (!LCSqueue->empty())
|
||
|
|
{
|
||
|
|
FText *elem = new FText(true, LCSqueue->front());
|
||
|
|
LCSqueue->pop();
|
||
|
|
result.addr = elem;
|
||
|
|
return YIELD;
|
||
|
|
}
|
||
|
|
else
|
||
|
|
{
|
||
|
|
result.addr = 0;
|
||
|
|
return CANCEL;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
case CLOSE:
|
||
|
|
{
|
||
|
|
if (LCSqueue != NULL)
|
||
|
|
{
|
||
|
|
delete LCSqueue;
|
||
|
|
LCSqueue = NULL;
|
||
|
|
local.addr = 0;
|
||
|
|
}
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
default:
|
||
|
|
{
|
||
|
|
// should never happen
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
|
||
|
|
void SearchMaximalUniqueMatches(SuffixTreeVertex* vertex,
|
||
|
|
vector<string>* curStringVector, string curString, size_t posSeparator)
|
||
|
|
{
|
||
|
|
if (vertex->GetEdgeCount()==2 && !vertex->GetEdgeAt(0)->HasVertex()
|
||
|
|
&& !vertex->GetEdgeAt(1)->HasVertex() && curString.length()>0)
|
||
|
|
{
|
||
|
|
// inner node
|
||
|
|
SuffixTreeEdge* edge1 = vertex->GetEdgeAt(0);
|
||
|
|
SuffixTreeEdge* edge2 = vertex->GetEdgeAt(1);
|
||
|
|
bool charExistsLeftText = true;
|
||
|
|
bool charExistsRightText = true;
|
||
|
|
size_t index1 = 0; // index of the char before curString in the left Text
|
||
|
|
size_t index2 = 0; // index of the char before curString in the right text
|
||
|
|
// If exact one of the two startIndexes of the two edges is lower than
|
||
|
|
// the index of the separator, we have a MUM and curString may not
|
||
|
|
// be extended at the right end.
|
||
|
|
if (edge1->GetStartIndex()<=posSeparator &&
|
||
|
|
edge2->GetStartIndex()>posSeparator){
|
||
|
|
// If there is another char left of curString in the left text and a
|
||
|
|
// char left of curString in the right text, we compare these chars. If
|
||
|
|
// they are different we have another MUM. If there is no other char
|
||
|
|
// left of curString in the right text or in the left text, curString is
|
||
|
|
// a MUM too.
|
||
|
|
if (edge1->GetStartIndex()>curString.length()){
|
||
|
|
index1 = edge1->GetStartIndex() - curString.length() -1;
|
||
|
|
} else {
|
||
|
|
charExistsLeftText = false;
|
||
|
|
}
|
||
|
|
if (( edge2->GetStartIndex() - curString.length() - 1 ) > posSeparator){
|
||
|
|
index2 = edge2->GetStartIndex() - curString.length() -1;
|
||
|
|
} else {
|
||
|
|
charExistsRightText = false;
|
||
|
|
}
|
||
|
|
if (charExistsLeftText && charExistsRightText){
|
||
|
|
if ( vertex->GetInput()->at(index1) != vertex->GetInput()->at(index2)){
|
||
|
|
curStringVector->push_back(curString);
|
||
|
|
}
|
||
|
|
} else {
|
||
|
|
curStringVector->push_back(curString);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
if (edge2->GetStartIndex()<=posSeparator &&
|
||
|
|
edge1->GetStartIndex()>posSeparator){
|
||
|
|
// see the last comment
|
||
|
|
if (edge2->GetStartIndex()>curString.length()){
|
||
|
|
index2 = edge2->GetStartIndex() - curString.length() -1;
|
||
|
|
} else {
|
||
|
|
charExistsLeftText = false;
|
||
|
|
}
|
||
|
|
if (( edge1->GetStartIndex() - curString.length() - 1 ) > posSeparator){
|
||
|
|
index1 = edge1->GetStartIndex() - curString.length() -1;
|
||
|
|
} else {
|
||
|
|
charExistsRightText = false;
|
||
|
|
}
|
||
|
|
if (charExistsLeftText && charExistsRightText){
|
||
|
|
if ( vertex->GetInput()->at(index1) != vertex->GetInput()->at(index2)){
|
||
|
|
curStringVector->push_back(curString);
|
||
|
|
}
|
||
|
|
} else {
|
||
|
|
curStringVector->push_back(curString);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
for (size_t edgeNo = 0; edgeNo < vertex->GetEdgeCount(); edgeNo++)
|
||
|
|
{
|
||
|
|
SuffixTreeEdge* edge = vertex->GetEdgeAt(edgeNo);
|
||
|
|
if (edge->HasVertex())
|
||
|
|
{
|
||
|
|
// There is another node which lies lower in the tree.
|
||
|
|
SearchMaximalUniqueMatches(
|
||
|
|
edge->GetChild(),
|
||
|
|
curStringVector,
|
||
|
|
curString + vertex->GetInput()->substr(edge->GetStartIndex(),
|
||
|
|
edge->GetLength()),
|
||
|
|
posSeparator);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
6.2.8 ~maximalUniqueMatchesFunction~
|
||
|
|
The function returns a stream of the substring of 2 texts which occur in each
|
||
|
|
text exactly once and have maximal length.
|
||
|
|
They will be find by searching the deepest inner node with exactly 2 edges.
|
||
|
|
One of the edges has to be a leaf of the first text and the other has to be a
|
||
|
|
leaf of the second text. Thats ensure the maximality on the right side of
|
||
|
|
the substring. The maximality on the left side will be proofed by comparing
|
||
|
|
the character before the substring.
|
||
|
|
|
||
|
|
*/
|
||
|
|
int maximalUniqueMatchesFunction(Word* args, Word& result, int message,
|
||
|
|
Word& local, Supplier s)
|
||
|
|
{
|
||
|
|
vector<string>* MUMVector = static_cast<vector<string>*> (local.addr);
|
||
|
|
|
||
|
|
switch (message)
|
||
|
|
{
|
||
|
|
case OPEN:
|
||
|
|
{ // initialize the local storage
|
||
|
|
FText *ftext1 = static_cast<FText*> (args[0].addr);
|
||
|
|
string firstText = ftext1 -> GetValue();
|
||
|
|
|
||
|
|
FText *ftext2 = static_cast<FText*> (args[1].addr);
|
||
|
|
string secondText = ftext2 -> GetValue();
|
||
|
|
|
||
|
|
vector<string>* curStringVector = new vector<string> ();
|
||
|
|
|
||
|
|
if(ftext1 -> IsDefined() && ftext2 -> IsDefined())
|
||
|
|
{
|
||
|
|
string mergedText = firstText + textSeparator + secondText
|
||
|
|
+ terminationCharacter;
|
||
|
|
|
||
|
|
SuffixTreeVertex* root =
|
||
|
|
UkkonenTreeBuilder::CreateSuffixTree(new string(mergedText));
|
||
|
|
|
||
|
|
string curString = "";
|
||
|
|
SearchMaximalUniqueMatches(root, curStringVector, curString,
|
||
|
|
firstText.length());
|
||
|
|
|
||
|
|
delete root;
|
||
|
|
}
|
||
|
|
|
||
|
|
local.addr = curStringVector;
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
case REQUEST:
|
||
|
|
{ // return the next stream element
|
||
|
|
|
||
|
|
if (!MUMVector->empty())
|
||
|
|
{
|
||
|
|
FText* elem = new FText(true, MUMVector->front());
|
||
|
|
MUMVector->erase(MUMVector->begin());
|
||
|
|
result.addr = elem;
|
||
|
|
return YIELD;
|
||
|
|
}
|
||
|
|
else
|
||
|
|
{
|
||
|
|
result.addr = 0;
|
||
|
|
return CANCEL;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
case CLOSE:
|
||
|
|
{ // free the local storage
|
||
|
|
|
||
|
|
if (MUMVector != NULL)
|
||
|
|
{
|
||
|
|
delete MUMVector;
|
||
|
|
local.addr = 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
default:
|
||
|
|
{
|
||
|
|
/* should never happen */
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
|
||
|
|
|
||
|
|
/*
|
||
|
|
6.2.9 ~circularStringLinearisationFunction~
|
||
|
|
This function return a stream of tuple with the lexically smallest linear
|
||
|
|
string representation of a circular string with the position(s) to cut the
|
||
|
|
circular string.
|
||
|
|
Given a string T of length n, it constructs a suffix tree off SS and
|
||
|
|
then traverse the tree with the rule that at every node the traversal
|
||
|
|
follows the edge whose first character is the lexically smallest over all
|
||
|
|
first characters(except the termination character) until the path has at
|
||
|
|
least a string depth of n. Any leaf in the subtree at that point
|
||
|
|
delivers a cut-position.
|
||
|
|
|
||
|
|
*/
|
||
|
|
|
||
|
|
|
||
|
|
void FindLeaves (SuffixTreeEdge *e, queue<int> *LeafQueue, int aktSDepth,
|
||
|
|
int lengthString)
|
||
|
|
{
|
||
|
|
|
||
|
|
if (e->GetChild()==NULL) // start position of suffix is cut position
|
||
|
|
{
|
||
|
|
if (lengthString-aktSDepth!=(lengthString-1)/2)
|
||
|
|
LeafQueue->push(lengthString-aktSDepth);
|
||
|
|
}
|
||
|
|
else
|
||
|
|
{
|
||
|
|
SuffixTreeVertex *st = e->GetChild();
|
||
|
|
for (size_t edge=0; edge<=st->GetEdgeCount()-1; edge++)
|
||
|
|
{
|
||
|
|
e=st->GetEdgeAt(edge);
|
||
|
|
FindLeaves(e, LeafQueue, aktSDepth+e->GetLength(), lengthString);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
}
|
||
|
|
|
||
|
|
|
||
|
|
void CSL(SuffixTreeVertex *st, queue<int> *pos, string curPathLabel="",
|
||
|
|
size_t aktSDepth=0)
|
||
|
|
{
|
||
|
|
size_t lengthString = st->GetInput()->length();
|
||
|
|
SuffixTreeEdge *e=NULL;
|
||
|
|
|
||
|
|
// traverse the tree until string-depth n is achieved
|
||
|
|
|
||
|
|
while (aktSDepth<(lengthString-1) / 2)
|
||
|
|
{
|
||
|
|
e=st->GetEdgeAt(0);
|
||
|
|
if (e->GetStartIndex()==lengthString-1)
|
||
|
|
{
|
||
|
|
e=st->GetEdgeAt(1);
|
||
|
|
}
|
||
|
|
aktSDepth+=e->GetLength();
|
||
|
|
curPathLabel = curPathLabel+st->GetInput()->substr(e->GetStartIndex(),
|
||
|
|
e->GetLength());
|
||
|
|
if (e->GetChild()!=NULL && aktSDepth<lengthString)
|
||
|
|
{
|
||
|
|
st=e->GetChild();
|
||
|
|
}
|
||
|
|
|
||
|
|
}
|
||
|
|
// string-depth n is achieved find all leaves
|
||
|
|
FindLeaves(e, pos, aktSDepth,lengthString);
|
||
|
|
}
|
||
|
|
|
||
|
|
int circularStringLinearizationFunction(Word* args, Word& result, int message,
|
||
|
|
Word& local, Supplier s)
|
||
|
|
{
|
||
|
|
switch (message)
|
||
|
|
{
|
||
|
|
case OPEN:
|
||
|
|
{
|
||
|
|
|
||
|
|
ListExpr resultType = GetTupleResultType(s);
|
||
|
|
TupleType *resultTupleType = new TupleType(nl->Second(resultType));
|
||
|
|
|
||
|
|
FText *t = static_cast<FText*> (args[0].addr);
|
||
|
|
queue<int> *q = new queue<int> ();
|
||
|
|
|
||
|
|
|
||
|
|
if (t->IsDefined())
|
||
|
|
{
|
||
|
|
string s = t->GetValue();
|
||
|
|
string mergedText = s+s+terminationCharacter;
|
||
|
|
SuffixTreeVertex *root =
|
||
|
|
UkkonenTreeBuilder::CreateSuffixTree(new string(mergedText));
|
||
|
|
if (mergedText.length()>1)
|
||
|
|
{
|
||
|
|
CSL(root,q);
|
||
|
|
}
|
||
|
|
delete root;
|
||
|
|
}
|
||
|
|
// store local data
|
||
|
|
CslLocalData *cslLocalData = new CslLocalData();
|
||
|
|
cslLocalData->posQueue = q;
|
||
|
|
cslLocalData->tupleType = resultTupleType;
|
||
|
|
|
||
|
|
local.addr = cslLocalData;
|
||
|
|
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
case REQUEST:
|
||
|
|
{
|
||
|
|
CslLocalData *cslLocalData = static_cast<CslLocalData*> (local.addr);
|
||
|
|
queue<int> *posQueue = cslLocalData->posQueue;
|
||
|
|
TupleType *resultTupleType = cslLocalData->tupleType;
|
||
|
|
|
||
|
|
if (!posQueue->empty())
|
||
|
|
{
|
||
|
|
size_t nextElem = posQueue->front(); posQueue->pop();
|
||
|
|
CcInt *elem = new CcInt(true,nextElem);
|
||
|
|
|
||
|
|
string erg=static_cast<FText*> (args[0].addr)->GetValue();
|
||
|
|
if (nextElem>0 && nextElem<erg.length())
|
||
|
|
{
|
||
|
|
erg=erg.substr(nextElem)+erg.substr(0,nextElem);
|
||
|
|
}
|
||
|
|
FText *linstr= new FText(true,erg);
|
||
|
|
|
||
|
|
|
||
|
|
Tuple *newTuple = new Tuple (resultTupleType);
|
||
|
|
newTuple->PutAttribute(0,linstr);
|
||
|
|
newTuple->PutAttribute(1,elem);
|
||
|
|
|
||
|
|
result.setAddr(newTuple);
|
||
|
|
return YIELD;
|
||
|
|
}
|
||
|
|
else
|
||
|
|
{
|
||
|
|
result.addr = 0;
|
||
|
|
return CANCEL;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
case CLOSE:
|
||
|
|
{
|
||
|
|
if (local.addr != NULL)
|
||
|
|
{
|
||
|
|
CslLocalData *cslLocalData = static_cast<CslLocalData*> (local.addr);
|
||
|
|
queue<int> *posQueue = cslLocalData->posQueue;
|
||
|
|
TupleType *resultTupleType = cslLocalData->tupleType;
|
||
|
|
|
||
|
|
delete posQueue;
|
||
|
|
resultTupleType->DeleteIfAllowed();
|
||
|
|
delete cslLocalData;
|
||
|
|
|
||
|
|
local.addr = 0;
|
||
|
|
}
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
default:
|
||
|
|
{
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
Creates a list of all patterns |p| of a text s,
|
||
|
|
which have at most k mismatches.
|
||
|
|
|
||
|
|
*/
|
||
|
|
void createkmismatchList(SuffixTreeVertex* vertex, list<string>* curStringList,
|
||
|
|
int numReadChars, int numAllowedMismatches, int curFoundMismatches,
|
||
|
|
string curString, string pattern)
|
||
|
|
{
|
||
|
|
|
||
|
|
//number of edges from this vertex
|
||
|
|
size_t numEdgesOfVertex = vertex->GetEdgeCount();
|
||
|
|
|
||
|
|
//for all suffixtreeEdges in suffixtreeVertex
|
||
|
|
for (size_t curNumEdges = 0; curNumEdges < numEdgesOfVertex; curNumEdges++)
|
||
|
|
{
|
||
|
|
//SuffixtreeEdge on position curNumEdge
|
||
|
|
SuffixTreeEdge* edge = vertex->GetEdgeAt(curNumEdges, false);
|
||
|
|
|
||
|
|
//actually found mismatches until now
|
||
|
|
int newFoundMismatches = curFoundMismatches;
|
||
|
|
|
||
|
|
//number characters on edge
|
||
|
|
size_t numCharsOnEdge = edge->GetLength();
|
||
|
|
|
||
|
|
string currentStringOnEdge = string(curString);
|
||
|
|
int readCharsTotal = numReadChars;
|
||
|
|
|
||
|
|
//for characters on edge
|
||
|
|
for(size_t curCharNumOnEdge = 0; curCharNumOnEdge<= numCharsOnEdge;
|
||
|
|
curCharNumOnEdge++)
|
||
|
|
{
|
||
|
|
|
||
|
|
//no more chars on edge start recursion
|
||
|
|
if(curCharNumOnEdge >= edge->GetLength())
|
||
|
|
{
|
||
|
|
if(edge->HasVertex()){
|
||
|
|
createkmismatchList(edge->GetChild(), curStringList, readCharsTotal,
|
||
|
|
numAllowedMismatches, newFoundMismatches,
|
||
|
|
currentStringOnEdge, pattern);
|
||
|
|
}
|
||
|
|
}else{
|
||
|
|
|
||
|
|
size_t offset = edge->GetStartIndex()+curCharNumOnEdge;
|
||
|
|
|
||
|
|
//lese ein zeichen auf kante
|
||
|
|
vertex -> LoadTextForIndex(offset, offset);
|
||
|
|
|
||
|
|
const string *input = vertex -> GetInput();
|
||
|
|
char curChar = (*input)[offset];
|
||
|
|
|
||
|
|
|
||
|
|
if(curChar == '\0') {
|
||
|
|
continue;
|
||
|
|
}
|
||
|
|
|
||
|
|
if(curChar != pattern[readCharsTotal])
|
||
|
|
{
|
||
|
|
newFoundMismatches++;
|
||
|
|
}
|
||
|
|
|
||
|
|
currentStringOnEdge.push_back(curChar);
|
||
|
|
readCharsTotal++;
|
||
|
|
}
|
||
|
|
|
||
|
|
if(newFoundMismatches > numAllowedMismatches)
|
||
|
|
{
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
|
||
|
|
if(currentStringOnEdge.length() == pattern.length())
|
||
|
|
{
|
||
|
|
curStringList->push_front(currentStringOnEdge);
|
||
|
|
// curStringList->push_back(currentStringOnEdge);
|
||
|
|
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
|
||
|
|
}//end for char on edge
|
||
|
|
|
||
|
|
}//end for num edges
|
||
|
|
|
||
|
|
curStringList->unique();
|
||
|
|
//curStringList->reverse();
|
||
|
|
|
||
|
|
}//end function
|
||
|
|
|
||
|
|
/*
|
||
|
|
6.2.10 ~kMismatchFunction~
|
||
|
|
Returns an integer stream containing all patterns with at most k mismatches.
|
||
|
|
|
||
|
|
*/
|
||
|
|
int kMismatchFunction(Word* args, Word& result, int message, Word& local,
|
||
|
|
Supplier s)
|
||
|
|
{
|
||
|
|
|
||
|
|
list<string>* mismatchList = static_cast<list<string>*> (local.addr);
|
||
|
|
|
||
|
|
switch (message)
|
||
|
|
{
|
||
|
|
case OPEN:
|
||
|
|
{
|
||
|
|
//suffixtree
|
||
|
|
SuffixTree* suffixtree = static_cast<SuffixTree*> (args[0].addr);
|
||
|
|
|
||
|
|
//textpattern
|
||
|
|
FText* pattern = static_cast<FText*> (args[1].addr);
|
||
|
|
string patternString = pattern->GetValue();
|
||
|
|
|
||
|
|
//number of allowed mismatches
|
||
|
|
CcInt* numOfMis = static_cast<CcInt*> (args[2].addr);
|
||
|
|
int numOfMisInt = numOfMis->GetValue();
|
||
|
|
size_t num = numOfMisInt;
|
||
|
|
|
||
|
|
//list for output
|
||
|
|
list<string>* curStringList = new list<string> ();
|
||
|
|
curStringList->push_front("");
|
||
|
|
string curString = "";
|
||
|
|
|
||
|
|
//if all parameters are defined call createkmismatchList function
|
||
|
|
//to fill the curStringList
|
||
|
|
if(suffixtree->IsDefined() && pattern->IsDefined() &&
|
||
|
|
numOfMis->IsDefined()){
|
||
|
|
//rootvertex of suffixtree
|
||
|
|
SuffixTreeVertex* root = suffixtree->GetInMemoryTree();
|
||
|
|
|
||
|
|
int numofReadChars = 0;
|
||
|
|
int curFoundMismatches = 0;
|
||
|
|
|
||
|
|
//pattern mustn`t be longer than the text into the suffixtree
|
||
|
|
//and number of mismatches mustn`t be longer than pattern length
|
||
|
|
if((patternString.length() <= root->GetInput()->length())&&
|
||
|
|
(num <= patternString.length())){
|
||
|
|
|
||
|
|
createkmismatchList(root, curStringList, numofReadChars, numOfMisInt,
|
||
|
|
curFoundMismatches, curString, patternString);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
local.addr = curStringList;
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
case REQUEST:
|
||
|
|
{ // return the next stream element
|
||
|
|
|
||
|
|
if (!mismatchList->empty() && mismatchList->front().length()
|
||
|
|
> 0)
|
||
|
|
{
|
||
|
|
FText* elem = new FText(true, mismatchList->front());
|
||
|
|
|
||
|
|
//removes first element
|
||
|
|
mismatchList->pop_front();
|
||
|
|
result.addr = elem;
|
||
|
|
|
||
|
|
return YIELD;
|
||
|
|
}
|
||
|
|
else
|
||
|
|
{
|
||
|
|
result.addr = 0;
|
||
|
|
return CANCEL;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
case CLOSE:
|
||
|
|
{ // free the local storage
|
||
|
|
|
||
|
|
if (mismatchList != 0)
|
||
|
|
{
|
||
|
|
delete mismatchList;
|
||
|
|
local.addr = 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
default:
|
||
|
|
{
|
||
|
|
/* should never happen */
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
6.3 Operator Descriptions
|
||
|
|
|
||
|
|
Similar to the ~property~ function of a type constructor, an operator needs to
|
||
|
|
be described, e.g. for the ~list operators~ command. This is now done by
|
||
|
|
creating a subclass of class ~OperatorInfo~.
|
||
|
|
|
||
|
|
6.3.1 ~createSuffixTreeInfo~ operator info for create function
|
||
|
|
|
||
|
|
*/
|
||
|
|
struct createSuffixTreeInfo: OperatorInfo
|
||
|
|
{
|
||
|
|
|
||
|
|
createSuffixTreeInfo() :
|
||
|
|
OperatorInfo()
|
||
|
|
{
|
||
|
|
name = "createsuffixtree";
|
||
|
|
signature = FText::BasicType() + " -> " + SuffixTree::BasicType();
|
||
|
|
syntax = "createsuffixtree (_)";
|
||
|
|
meaning = "Creates a SuffixTree from a text in O(n) time.";
|
||
|
|
}
|
||
|
|
|
||
|
|
};
|
||
|
|
|
||
|
|
/*
|
||
|
|
6.3.1 ~createSuffixTreeQuadraticInfo~ operator info for create function
|
||
|
|
with quadratic running time
|
||
|
|
|
||
|
|
*/
|
||
|
|
struct createSuffixTreeQuadraticInfo: OperatorInfo
|
||
|
|
{
|
||
|
|
|
||
|
|
createSuffixTreeQuadraticInfo() :
|
||
|
|
OperatorInfo()
|
||
|
|
{
|
||
|
|
name = "createsuffixtree_quadratic";
|
||
|
|
signature = FText::BasicType() + " -> " + SuffixTree::BasicType();
|
||
|
|
syntax = "createsuffixtree_quadratic (_)";
|
||
|
|
meaning = "Creates a SuffixTree from a text with quadratic running time.";
|
||
|
|
}
|
||
|
|
|
||
|
|
};
|
||
|
|
|
||
|
|
/*
|
||
|
|
6.3.2 ~patternOccursInfo~
|
||
|
|
|
||
|
|
*/
|
||
|
|
struct patternOccursInfo: OperatorInfo
|
||
|
|
{
|
||
|
|
|
||
|
|
patternOccursInfo() :
|
||
|
|
OperatorInfo()
|
||
|
|
{
|
||
|
|
name = "patternoccurs";
|
||
|
|
|
||
|
|
signature = SuffixTree::BasicType() + " x " + FText::BasicType()
|
||
|
|
+ " -> " + CcBool::BasicType();
|
||
|
|
syntax = "_ patternoccurs _";
|
||
|
|
meaning = "Decides if a pattern is into the given text.";
|
||
|
|
}
|
||
|
|
|
||
|
|
};
|
||
|
|
|
||
|
|
/*
|
||
|
|
6.3.3 ~patternPositionInfo~
|
||
|
|
|
||
|
|
*/
|
||
|
|
struct patternPositionsInfo: OperatorInfo
|
||
|
|
{
|
||
|
|
|
||
|
|
patternPositionsInfo() :
|
||
|
|
OperatorInfo()
|
||
|
|
{
|
||
|
|
name = "patternpositions";
|
||
|
|
signature = SuffixTree::BasicType() + " x " + FText::BasicType()
|
||
|
|
+ " -> stream(int)";
|
||
|
|
syntax = "_ patternpositions _";
|
||
|
|
meaning = "Return the positions of a given pattern";
|
||
|
|
}
|
||
|
|
|
||
|
|
};
|
||
|
|
|
||
|
|
/*
|
||
|
|
6.3.4 ~patternCountInfo~
|
||
|
|
|
||
|
|
*/
|
||
|
|
struct patternCountInfo: OperatorInfo
|
||
|
|
{
|
||
|
|
|
||
|
|
patternCountInfo() :
|
||
|
|
OperatorInfo()
|
||
|
|
{
|
||
|
|
name = "patterncount";
|
||
|
|
signature = SuffixTree::BasicType() + FText::BasicType() + " -> "
|
||
|
|
+ CcInt::BasicType();
|
||
|
|
syntax = "_ patterncount _";
|
||
|
|
meaning = "Count a the given pattern in our SuffixTree";
|
||
|
|
}
|
||
|
|
|
||
|
|
};
|
||
|
|
|
||
|
|
/*
|
||
|
|
6.3.5 ~longestRepeatedSubstringInfo~
|
||
|
|
|
||
|
|
*/
|
||
|
|
struct longestRepeatedSubstringInfo: OperatorInfo
|
||
|
|
{
|
||
|
|
|
||
|
|
longestRepeatedSubstringInfo() :
|
||
|
|
OperatorInfo()
|
||
|
|
{
|
||
|
|
name = "longestrepeatedsubstring";
|
||
|
|
signature = SuffixTree::BasicType() + " -> (stream text)";
|
||
|
|
syntax = "longestrepeatedsubstring (_)";
|
||
|
|
meaning = "Returns the longest string which occurs more than once"
|
||
|
|
" into the text.";
|
||
|
|
}
|
||
|
|
|
||
|
|
};
|
||
|
|
|
||
|
|
/*
|
||
|
|
6.3.6 ~shortestUniqueSubstringInfo~
|
||
|
|
|
||
|
|
*/
|
||
|
|
struct shortestUniqueSubstringInfo: OperatorInfo
|
||
|
|
{
|
||
|
|
|
||
|
|
shortestUniqueSubstringInfo() :
|
||
|
|
OperatorInfo()
|
||
|
|
{
|
||
|
|
name = "shortestuniquesubstring";
|
||
|
|
signature = SuffixTree::BasicType() + " -> stream(text)";
|
||
|
|
syntax = "shortestuniquesubstring _";
|
||
|
|
meaning = "returns the shortest not empty string which occurs"
|
||
|
|
" only once.";
|
||
|
|
}
|
||
|
|
|
||
|
|
};
|
||
|
|
|
||
|
|
/*
|
||
|
|
6.3.7 ~longestCommonSubstringInfo~
|
||
|
|
|
||
|
|
*/
|
||
|
|
struct longestCommonSubstringInfo: OperatorInfo
|
||
|
|
{
|
||
|
|
|
||
|
|
longestCommonSubstringInfo() :
|
||
|
|
OperatorInfo()
|
||
|
|
{
|
||
|
|
name = "longestcommonsubstring";
|
||
|
|
signature = FText::BasicType() +" x " + FText::BasicType() + " -> "
|
||
|
|
+ "(stream (text)";
|
||
|
|
// stream(text)
|
||
|
|
syntax = "longestcommonsubstring _ _";
|
||
|
|
meaning = "returns the longest string which occurs in both texts.";
|
||
|
|
}
|
||
|
|
|
||
|
|
};
|
||
|
|
|
||
|
|
/*
|
||
|
|
6.3.8 ~maximalUniqueMatchesInfo~
|
||
|
|
|
||
|
|
*/
|
||
|
|
struct maximalUniqueMatchesInfo: OperatorInfo
|
||
|
|
{
|
||
|
|
|
||
|
|
maximalUniqueMatchesInfo() :
|
||
|
|
OperatorInfo()
|
||
|
|
{
|
||
|
|
name = "maximaluniquematches";
|
||
|
|
signature = FText::BasicType() + FText::BasicType() + " -> stream(text)";
|
||
|
|
syntax = "maximaluniquematches (_, _)";
|
||
|
|
meaning = "returns all max unique matches of two strings.";
|
||
|
|
}
|
||
|
|
|
||
|
|
};
|
||
|
|
|
||
|
|
/*
|
||
|
|
6.3.9 ~circularStringLinearisationInfo~
|
||
|
|
|
||
|
|
*/
|
||
|
|
struct circularStringLinearizationInfo: OperatorInfo
|
||
|
|
{
|
||
|
|
|
||
|
|
circularStringLinearizationInfo() :
|
||
|
|
OperatorInfo()
|
||
|
|
{
|
||
|
|
name = "circularstringlinearization";
|
||
|
|
signature = FText::BasicType() + " -> "
|
||
|
|
"stream(tuple(linstr: text, pos: int))";
|
||
|
|
syntax = "circularstringlinearization _ ";
|
||
|
|
|
||
|
|
meaning = "returns the lexicographical smallest of all strings and"
|
||
|
|
"their start positions.";
|
||
|
|
}
|
||
|
|
|
||
|
|
};
|
||
|
|
|
||
|
|
/*
|
||
|
|
6.3.10 ~kMismatchInfo~
|
||
|
|
|
||
|
|
*/
|
||
|
|
struct kMismatchInfo: OperatorInfo
|
||
|
|
{
|
||
|
|
|
||
|
|
kMismatchInfo() :
|
||
|
|
OperatorInfo()
|
||
|
|
{
|
||
|
|
name = "kmismatch";
|
||
|
|
signature = SuffixTree::BasicType() + FText::BasicType()
|
||
|
|
+ CcInt::BasicType() + " -> (stream text)";
|
||
|
|
syntax = "kmismatch (_ _ _)";
|
||
|
|
meaning = "returns all patterns of length p into a text s "
|
||
|
|
"which have at most k errors in comperisson to pattern p .";
|
||
|
|
}
|
||
|
|
|
||
|
|
};
|
||
|
|
|
||
|
|
/*
|
||
|
|
6.4 Additional Operators
|
||
|
|
|
||
|
|
6.4.1 Equals
|
||
|
|
compares two SuffixTrees using the text
|
||
|
|
|
||
|
|
*/
|
||
|
|
ListExpr equalSuffixTreeTypeMap(ListExpr args)
|
||
|
|
{
|
||
|
|
NList type(args);
|
||
|
|
const string errMsg = "Expecting two SuffixTrees";
|
||
|
|
|
||
|
|
// two SuffixTrees
|
||
|
|
if (type == NList(SuffixTree::BasicType(), SuffixTree::BasicType()))
|
||
|
|
{
|
||
|
|
return NList(CcBool::BasicType()).listExpr();
|
||
|
|
}
|
||
|
|
return NList::typeError(errMsg);
|
||
|
|
}
|
||
|
|
|
||
|
|
|
||
|
|
int equalSuffixTreeFun(Word* args, Word& result, int message, Word& local,
|
||
|
|
Supplier s)
|
||
|
|
{
|
||
|
|
|
||
|
|
SuffixTree* tree1 = static_cast<SuffixTree*> (args[0].addr);
|
||
|
|
SuffixTree* tree2 = static_cast<SuffixTree*> (args[1].addr);
|
||
|
|
|
||
|
|
result = qp->ResultStorage(s);
|
||
|
|
//query processor has provided
|
||
|
|
//a CcBool instance for the result
|
||
|
|
|
||
|
|
CcBool* b = static_cast<CcBool*> (result.addr);
|
||
|
|
|
||
|
|
bool res = tree1->Equal(tree2);
|
||
|
|
|
||
|
|
b->Set(true, res); //the first argument says the boolean
|
||
|
|
//value is defined, the second is the
|
||
|
|
//real boolean value)
|
||
|
|
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
struct equalSuffixTreeInfo: OperatorInfo
|
||
|
|
{
|
||
|
|
|
||
|
|
equalSuffixTreeInfo()
|
||
|
|
{
|
||
|
|
name = "=";
|
||
|
|
|
||
|
|
signature = SuffixTree::BasicType() + " x " + SuffixTree::BasicType()
|
||
|
|
+ " -> " + CcBool::BasicType();
|
||
|
|
syntax = "_ = _";
|
||
|
|
meaning = "Equal predicate.";
|
||
|
|
}
|
||
|
|
};
|
||
|
|
|
||
|
|
|
||
|
|
/*
|
||
|
|
6.5 Operator patternFilter.
|
||
|
|
|
||
|
|
This operator processes a stream of texts and selects
|
||
|
|
such texts contained in a SuffixTree
|
||
|
|
|
||
|
|
6.5.1 Type Mapping
|
||
|
|
|
||
|
|
Signature stream(string) x suffixtree -> stream(string)
|
||
|
|
stream(text) x suffixtree -> stream(text)
|
||
|
|
stream(tuple) x attrName x suffixTree -> stream(tuple)
|
||
|
|
|
||
|
|
*/
|
||
|
|
|
||
|
|
ListExpr patternFilterTM(ListExpr args){
|
||
|
|
|
||
|
|
string err = " stream( {text, string}) x {suffixtree, text, string} or "
|
||
|
|
"stream(tuple) x attrName x {suffixtree, text, string} expected";
|
||
|
|
int len = nl->ListLength(args);
|
||
|
|
if((len!=2) && len!=3){
|
||
|
|
return listutils::typeError(err);
|
||
|
|
}
|
||
|
|
if(len==2){
|
||
|
|
ListExpr first = nl->First(args);
|
||
|
|
if( !Stream<CcString>::checkType(first) &&
|
||
|
|
!Stream<FText>::checkType(first)){
|
||
|
|
return listutils::typeError(err);
|
||
|
|
}
|
||
|
|
if(!SuffixTree::checkType(nl->Second(args))&&
|
||
|
|
!CcString::checkType(nl->Second(args)) &&
|
||
|
|
!FText::checkType(nl->Second(args))){
|
||
|
|
return listutils::typeError(err);
|
||
|
|
}
|
||
|
|
return first;
|
||
|
|
}
|
||
|
|
// len = 3
|
||
|
|
ListExpr tstream = nl->First(args);
|
||
|
|
ListExpr attrName = nl->Second(args);
|
||
|
|
ListExpr sufftree = nl->Third(args);
|
||
|
|
if(!Stream<Tuple>::checkType(tstream) ||
|
||
|
|
!listutils::isSymbol(attrName) ){
|
||
|
|
return listutils::typeError(err);
|
||
|
|
}
|
||
|
|
|
||
|
|
if(!SuffixTree::checkType(sufftree) &&
|
||
|
|
!FText::checkType(sufftree) &&
|
||
|
|
!CcString::checkType(sufftree)){
|
||
|
|
return listutils::typeError(err);
|
||
|
|
}
|
||
|
|
|
||
|
|
|
||
|
|
string name = nl->SymbolValue(attrName);
|
||
|
|
ListExpr attrList = nl->Second(nl->Second(tstream));
|
||
|
|
ListExpr type;
|
||
|
|
int index = listutils::findAttribute(attrList,name,type);
|
||
|
|
if(index == 0){
|
||
|
|
return listutils::typeError(" attribute " + name + " unknown");
|
||
|
|
}
|
||
|
|
if(!CcString::checkType(type) &&
|
||
|
|
!FText::checkType(type)){
|
||
|
|
return listutils::typeError("Attribute " + name +
|
||
|
|
" not of type string or text");
|
||
|
|
}
|
||
|
|
return nl->ThreeElemList(
|
||
|
|
nl->SymbolAtom(Symbols::APPEND()),
|
||
|
|
nl->OneElemList(nl->IntAtom(index-1)),
|
||
|
|
tstream);
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
6.5.2 LocalInfo
|
||
|
|
|
||
|
|
*/
|
||
|
|
template<class T>
|
||
|
|
class patternFilterInfoAttr{
|
||
|
|
|
||
|
|
public:
|
||
|
|
patternFilterInfoAttr(Word& _stream, SuffixTree* t) :
|
||
|
|
stream(_stream), sufftree(0){
|
||
|
|
stream.open();
|
||
|
|
sufftree = t->LoadFromPersistent();
|
||
|
|
}
|
||
|
|
|
||
|
|
patternFilterInfoAttr(Word& _stream, string t) :
|
||
|
|
stream(_stream), sufftree(0){
|
||
|
|
stream.open();
|
||
|
|
|
||
|
|
DeleteTerminalSymbolFromText(t);
|
||
|
|
t = t + terminationCharacter;
|
||
|
|
|
||
|
|
sufftree = UkkonenTreeBuilder::CreateSuffixTree(new string(t));
|
||
|
|
}
|
||
|
|
|
||
|
|
~patternFilterInfoAttr(){
|
||
|
|
stream.close();
|
||
|
|
delete sufftree;
|
||
|
|
}
|
||
|
|
|
||
|
|
T* next(){
|
||
|
|
T* res;
|
||
|
|
while( (res=stream.request()) !=0){
|
||
|
|
if(!res->IsDefined()){
|
||
|
|
res->DeleteIfAllowed();
|
||
|
|
} else {
|
||
|
|
const SuffixTreeEdge* edge=0;
|
||
|
|
int offset = -1;
|
||
|
|
if(sufftree->FindEdgeForSearchPattern(
|
||
|
|
res->GetValue(),&edge,&offset)){
|
||
|
|
return res;
|
||
|
|
}
|
||
|
|
res->DeleteIfAllowed();
|
||
|
|
}
|
||
|
|
}
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
private:
|
||
|
|
Stream<T> stream;
|
||
|
|
SuffixTreeVertex* sufftree;
|
||
|
|
};
|
||
|
|
|
||
|
|
template<class T>
|
||
|
|
class patternFilterInfoTuple{
|
||
|
|
|
||
|
|
public:
|
||
|
|
patternFilterInfoTuple(Word& _stream, SuffixTree* st,
|
||
|
|
int _attrPos):
|
||
|
|
stream(_stream),sufftree(0), attrPos(_attrPos){
|
||
|
|
stream.open();
|
||
|
|
sufftree = st->LoadFromPersistent();
|
||
|
|
}
|
||
|
|
|
||
|
|
patternFilterInfoTuple(Word& _stream, string st,
|
||
|
|
int _attrPos):
|
||
|
|
stream(_stream),sufftree(0), attrPos(_attrPos){
|
||
|
|
stream.open();
|
||
|
|
DeleteTerminalSymbolFromText(st);
|
||
|
|
st = st + terminationCharacter;
|
||
|
|
sufftree = UkkonenTreeBuilder::CreateSuffixTree(new string(st));
|
||
|
|
}
|
||
|
|
|
||
|
|
~patternFilterInfoTuple(){
|
||
|
|
stream.close();
|
||
|
|
delete sufftree;
|
||
|
|
}
|
||
|
|
|
||
|
|
Tuple* next(){
|
||
|
|
Tuple* res;
|
||
|
|
while( (res=stream.request())!=0){
|
||
|
|
T* attr = (T*) res->GetAttribute(attrPos);
|
||
|
|
if(!attr->IsDefined()){
|
||
|
|
res->DeleteIfAllowed();
|
||
|
|
} else {
|
||
|
|
const SuffixTreeEdge* edge=0;
|
||
|
|
int offset = -1;
|
||
|
|
if(sufftree->FindEdgeForSearchPattern(attr->GetValue(),
|
||
|
|
&edge,&offset)){
|
||
|
|
return res;
|
||
|
|
}
|
||
|
|
res->DeleteIfAllowed();
|
||
|
|
}
|
||
|
|
}
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
private:
|
||
|
|
Stream<Tuple> stream;
|
||
|
|
SuffixTreeVertex* sufftree;
|
||
|
|
int attrPos;
|
||
|
|
|
||
|
|
};
|
||
|
|
|
||
|
|
/*
|
||
|
|
6.5.3 Value Mapping
|
||
|
|
|
||
|
|
*/
|
||
|
|
string getValue(CcString* st){
|
||
|
|
return st->GetValue();
|
||
|
|
}
|
||
|
|
|
||
|
|
string getValue(FText* st){
|
||
|
|
return st->GetValue();
|
||
|
|
}
|
||
|
|
|
||
|
|
SuffixTree* getValue(SuffixTree* st){
|
||
|
|
return st;
|
||
|
|
}
|
||
|
|
|
||
|
|
template<class T, class TreeRep>
|
||
|
|
int patternFilterVM1(Word* args, Word& result, int message, Word& local,
|
||
|
|
Supplier s){
|
||
|
|
|
||
|
|
|
||
|
|
patternFilterInfoAttr<T>* li = (patternFilterInfoAttr<T>*) local.addr;
|
||
|
|
|
||
|
|
switch(message){
|
||
|
|
case OPEN: {
|
||
|
|
if(li){
|
||
|
|
delete li;
|
||
|
|
local.addr = 0;
|
||
|
|
}
|
||
|
|
TreeRep* st = (TreeRep*) args[1].addr;
|
||
|
|
if(st->IsDefined()){
|
||
|
|
local.addr =
|
||
|
|
new patternFilterInfoAttr<T>(args[0],getValue(st));
|
||
|
|
}
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
case REQUEST: {
|
||
|
|
if(!li){
|
||
|
|
return CANCEL;
|
||
|
|
}
|
||
|
|
result.addr = li->next();
|
||
|
|
return result.addr?YIELD:CANCEL;
|
||
|
|
}
|
||
|
|
|
||
|
|
case CLOSE: {
|
||
|
|
if(li){
|
||
|
|
delete li;
|
||
|
|
local.addr = 0;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
|
||
|
|
|
||
|
|
template<class T, class TreeRep>
|
||
|
|
int patternFilterVM2(Word* args, Word& result, int message, Word& local,
|
||
|
|
Supplier s){
|
||
|
|
|
||
|
|
|
||
|
|
patternFilterInfoTuple<T>* li = (patternFilterInfoTuple<T>*) local.addr;
|
||
|
|
|
||
|
|
switch(message){
|
||
|
|
case OPEN: {
|
||
|
|
if(li){
|
||
|
|
delete li;
|
||
|
|
local.addr = 0;
|
||
|
|
}
|
||
|
|
TreeRep* st = (TreeRep*) args[2].addr;
|
||
|
|
if(st->IsDefined()){
|
||
|
|
local.addr =
|
||
|
|
new patternFilterInfoTuple<T>(args[0],getValue(st),
|
||
|
|
( (CcInt*)args[3].addr)->GetValue());
|
||
|
|
}
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
case REQUEST: {
|
||
|
|
if(!li){
|
||
|
|
return CANCEL;
|
||
|
|
}
|
||
|
|
result.addr = li->next();
|
||
|
|
return result.addr?YIELD:CANCEL;
|
||
|
|
}
|
||
|
|
|
||
|
|
case CLOSE: {
|
||
|
|
if(li){
|
||
|
|
delete li;
|
||
|
|
local.addr = 0;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
ValueMapping Array and Selection Function
|
||
|
|
|
||
|
|
*/
|
||
|
|
|
||
|
|
ValueMapping patternFilterVM[] = {
|
||
|
|
patternFilterVM1<CcString, SuffixTree>,
|
||
|
|
patternFilterVM1<FText, SuffixTree>,
|
||
|
|
patternFilterVM2<CcString, SuffixTree>,
|
||
|
|
patternFilterVM2<FText, SuffixTree>,
|
||
|
|
|
||
|
|
patternFilterVM1<CcString, CcString>,
|
||
|
|
patternFilterVM1<FText, CcString>,
|
||
|
|
patternFilterVM2<CcString, CcString>,
|
||
|
|
patternFilterVM2<FText, CcString>,
|
||
|
|
|
||
|
|
patternFilterVM1<CcString, FText>,
|
||
|
|
patternFilterVM1<FText, FText>,
|
||
|
|
patternFilterVM2<CcString, FText>,
|
||
|
|
patternFilterVM2<FText, FText>
|
||
|
|
};
|
||
|
|
|
||
|
|
int patternFilterSelect(ListExpr args){
|
||
|
|
|
||
|
|
int len = nl->ListLength(args);
|
||
|
|
ListExpr sufftree;
|
||
|
|
if(len==2){
|
||
|
|
sufftree = nl->Second(args);
|
||
|
|
} else {
|
||
|
|
sufftree = nl->Third(args);
|
||
|
|
}
|
||
|
|
int offset = 0;
|
||
|
|
if(CcString::checkType(sufftree)){
|
||
|
|
offset = 4;
|
||
|
|
} else if(FText::checkType(sufftree)){
|
||
|
|
offset = 8;
|
||
|
|
}
|
||
|
|
|
||
|
|
|
||
|
|
|
||
|
|
if(Stream<CcString>::checkType(nl->First(args))){
|
||
|
|
return offset;
|
||
|
|
}
|
||
|
|
if(Stream<FText>::checkType(nl->First(args))){
|
||
|
|
return offset + 1;
|
||
|
|
}
|
||
|
|
ListExpr attrList = nl->Second(nl->Second(nl->First(args)));
|
||
|
|
string name = nl->SymbolValue(nl->Second(args));
|
||
|
|
ListExpr type;
|
||
|
|
listutils::findAttribute(attrList,name,type);
|
||
|
|
if(CcString::checkType(type)){
|
||
|
|
return offset + 2;
|
||
|
|
}
|
||
|
|
return offset + 3;
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
Specification
|
||
|
|
|
||
|
|
*/
|
||
|
|
|
||
|
|
|
||
|
|
OperatorSpec patternFilterSpec(
|
||
|
|
"stream(T) x suffixtree -> stream(T) , t in {string,text}",
|
||
|
|
" or stream(tuple) x attrName x suffixtree -> stream(tuple) "
|
||
|
|
" _ patternFilter [_ ]",
|
||
|
|
"selects all stream elements whose text is inside the suffixtree",
|
||
|
|
" query tokenize('aa bb') patternFilter[st] count " );
|
||
|
|
|
||
|
|
/*
|
||
|
|
Operator instance
|
||
|
|
|
||
|
|
*/
|
||
|
|
|
||
|
|
Operator patternFilter
|
||
|
|
(
|
||
|
|
"patternFilter", //name
|
||
|
|
patternFilterSpec.getStr(), //specification
|
||
|
|
12, // no of VM functions
|
||
|
|
patternFilterVM, //value mapping
|
||
|
|
patternFilterSelect, //trivial selection function
|
||
|
|
patternFilterTM //type mapping
|
||
|
|
);
|
||
|
|
|
||
|
|
|
||
|
|
|
||
|
|
/*
|
||
|
|
7 Creating the Algebra
|
||
|
|
|
||
|
|
*/
|
||
|
|
class SuffixTreeAlgebra: public Algebra
|
||
|
|
{
|
||
|
|
public:
|
||
|
|
SuffixTreeAlgebra() :
|
||
|
|
Algebra()
|
||
|
|
{
|
||
|
|
/*
|
||
|
|
7.1 Registration of Types
|
||
|
|
|
||
|
|
*/
|
||
|
|
|
||
|
|
AddTypeConstructor(&SuffixTreeTC);
|
||
|
|
|
||
|
|
//the lines below define that SuffixTree
|
||
|
|
//can be used in places where types of kind DATA are expected
|
||
|
|
SuffixTreeTC.AssociateKind(Kind::DATA());
|
||
|
|
|
||
|
|
/*
|
||
|
|
7.2 Registration of Operators
|
||
|
|
|
||
|
|
*/
|
||
|
|
//addOperator
|
||
|
|
AddOperator(createSuffixTreeInfo(), createSuffixTreeFunction,
|
||
|
|
createSuffixTreeTypeMap);
|
||
|
|
|
||
|
|
AddOperator(createSuffixTreeQuadraticInfo(),
|
||
|
|
createSuffixTreeFunction_quadratic, createSuffixTreeTypeMap);
|
||
|
|
|
||
|
|
AddOperator(patternOccursInfo(), patternOccursFunction,
|
||
|
|
patternOccursTypeMap);
|
||
|
|
|
||
|
|
AddOperator(patternPositionsInfo(), patternPositionsFunction,
|
||
|
|
patternPositionsTypeMap);
|
||
|
|
|
||
|
|
AddOperator(patternCountInfo(), patternCountFunction, patternCountTypeMap);
|
||
|
|
//addOperator
|
||
|
|
AddOperator(longestRepeatedSubstringInfo(),
|
||
|
|
longestRepeatedSubstringFunction, suffixtree_textstreamTypeMap);
|
||
|
|
|
||
|
|
AddOperator(shortestUniqueSubstringInfo(), shortestUniqueSubstringFunction,
|
||
|
|
suffixtree_textstreamTypeMap);
|
||
|
|
|
||
|
|
AddOperator(longestCommonSubstringInfo(), longestCommonSubstringFunction,
|
||
|
|
longestCommonSubstringTypeMap);
|
||
|
|
|
||
|
|
AddOperator(maximalUniqueMatchesInfo(), maximalUniqueMatchesFunction,
|
||
|
|
maximalUniqueMatchesTypeMap);
|
||
|
|
//addOperator
|
||
|
|
AddOperator(circularStringLinearizationInfo(),
|
||
|
|
circularStringLinearizationFunction, circularStringLinearizationTypeMap);
|
||
|
|
|
||
|
|
AddOperator(kMismatchInfo(), kMismatchFunction, kMismatchTypeMap);
|
||
|
|
|
||
|
|
AddOperator(equalSuffixTreeInfo(), equalSuffixTreeFun,
|
||
|
|
equalSuffixTreeTypeMap);
|
||
|
|
|
||
|
|
AddOperator(&patternFilter);
|
||
|
|
|
||
|
|
}
|
||
|
|
~SuffixTreeAlgebra()
|
||
|
|
{
|
||
|
|
}
|
||
|
|
;
|
||
|
|
};
|
||
|
|
|
||
|
|
/*
|
||
|
|
8 Initialization
|
||
|
|
|
||
|
|
Each algebra module needs an initialization function. The algebra
|
||
|
|
manager has a reference to this function if this algebra is
|
||
|
|
included in the list of required algebras, thus forcing the linker
|
||
|
|
to include this module.
|
||
|
|
|
||
|
|
The algebra manager invokes this function to get a reference to the
|
||
|
|
instance of the algebra class and to provide references to the
|
||
|
|
global nested list container (used to store constructor, type,
|
||
|
|
operator and object information) and to the query processor.
|
||
|
|
|
||
|
|
The function has a C interface to make it possible to load the
|
||
|
|
algebra dynamically at runtime.
|
||
|
|
|
||
|
|
*/
|
||
|
|
|
||
|
|
} // end of namespace ~sta~
|
||
|
|
|
||
|
|
extern "C" Algebra*
|
||
|
|
InitializeSuffixTreeAlgebra(NestedList* nlRef, QueryProcessor* qpRef,
|
||
|
|
AlgebraManager* amRef)
|
||
|
|
{
|
||
|
|
nl = nlRef;
|
||
|
|
qp = qpRef;
|
||
|
|
am = amRef;
|
||
|
|
|
||
|
|
return (new sta::SuffixTreeAlgebra());
|
||
|
|
|
||
|
|
}
|
||
|
|
|