Files
secondo/Algebras/SuffixTree/UkkonenTreeBuilder.cpp
2026-01-23 17:03:45 +08:00

213 lines
6.0 KiB
C++

/*
----
This file is part of SECONDO.
Copyright (C) 2004, University in Hagen, Department of Computer Science,
Database Systems for New Applications.
SECONDO is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
SECONDO is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with SECONDO; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
----
01590 Fachpraktikum "Erweiterbare Datenbanksysteme"
WS 2011 / 2012
Svenja Fuhs
Regine Karg
Jan Kristof Nidzwetzki
Michael Teutsch
C[ue]neyt Uysal
----
//paragraph [1] Title: [{\Large \bf \begin{center}] [\end{center}}]
//paragraph [10] Footnote: [{\footnote{] [}}]
//[TOC] [\tableofcontents]
[1] Implementation of the Ukkonen algorithm to create suffix trees
[TOC]
1 Includes and Defines
*/
#include <iostream>
#include "UkkonenTreeBuilder.h"
using namespace std;
/*
1.1 Function ~CreateSuffixTree~
Creates a suffix tree using the Ukkonen algorithm.
Uses the following tree extension rules:
Rule 1: insertion point is at a leaf
-> update end index
Rule 2: character at the insertion point does not match
-> create new edge / subdivide edge
Rule 3: suffix is already present
-> do nothing
*/
SuffixTreeVertex* UkkonenTreeBuilder::CreateSuffixTree(std::string *nText)
{
//phase 0: initialize tree
SuffixTreeVertex *root = new SuffixTreeVertex(nText);
SuffixTreeEdge *edge = new SuffixTreeEdge(0);
root->InsertEdge(edge);
//initialize edge vector to properly set end indexes
vector<SuffixTreeEdge*> edgeVector;
edgeVector.push_back(edge);
SuffixTreeVertex *curVertex = root;
size_t curVertexDepth = 0;
SuffixTreeEdge *curEdge = nullptr;
SuffixTreeVertex *lastVertex = NULL;
//start at extension 1, as extension 0 is always present
size_t curExtension = 1;
for (size_t curPhase = 1; curPhase <= nText->length(); curPhase++)
{
//rule 1 (implicit)
SuffixTreeEdge::SetGlobalEndIndex(curPhase);
bool isSuffixPresent;
do
{
isSuffixPresent = false;
//traverse the tree until the update index lies on the current edge
while (curExtension + curVertexDepth < curPhase)
{
curEdge = curVertex->GetEdge((*nText)[curExtension + curVertexDepth]);
curVertexDepth += curEdge->GetLength();
curVertex = curEdge->GetChild();
}
if (curExtension + curVertexDepth == curPhase)
{
//update at a vertex / edge start --------------------------------
curEdge = curVertex->GetEdge((*nText)[curExtension + curVertexDepth]);
if (curEdge != NULL)
{
//rule 3
isSuffixPresent = true;
}
else
{
//rule 2
edge = new SuffixTreeEdge(curPhase);
edgeVector.push_back(edge);
curVertex->InsertEdge(edge);
}
//set suffix link
if (lastVertex != NULL)
{
lastVertex->SetSuffixLink(curVertex);
lastVertex = NULL;
}
}
else
{
//update inside an edge ----------------------------------------
//set current vertex back to edge start
curVertex = curEdge->GetParent();
curVertexDepth -= curEdge->GetLength();
//calculate inter-edge update index
size_t interEdgeIdx = curPhase - curExtension - curVertexDepth;
size_t absUpdateIdx = curEdge->GetStartIndex() + interEdgeIdx;
if ((*nText)[curPhase] == (*nText)[absUpdateIdx])
{
//rule 3
isSuffixPresent = true;
}
else
{
//rule 2
//create new vertex and edges
curVertex = new SuffixTreeVertex(nText);
//edge leading to leaf -> no preset end index
if (curEdge->GetChild() == NULL)
{
edge = new SuffixTreeEdge(
absUpdateIdx,
curEdge->GetChild());
}
else
{
edge = new SuffixTreeEdge(
absUpdateIdx,
curEdge->GetEndIndex(),
curEdge->GetChild());
}
edgeVector.push_back(edge);
curVertex->InsertEdge(edge);
edge = new SuffixTreeEdge(curPhase);
edgeVector.push_back(edge);
curVertex->InsertEdge(edge);
//update current edge
curEdge->SetEndIndex(absUpdateIdx - 1);
curEdge->SetChild(curVertex);
//set suffix link
if (lastVertex != NULL)
{
lastVertex->SetSuffixLink(curVertex);
}
lastVertex = curVertex;
//update depth in tree
curVertexDepth += interEdgeIdx;
}
}
//compute extensions as long as rule 3 does not apply
if (!isSuffixPresent)
{
//if no suffix link exists from the current vertex (not root),
//the parent vertex must have it
if ((curVertex->GetSuffixLink() == NULL) && (curVertex != root))
{
SuffixTreeEdge *parentEdge = curVertex->GetParentEdge();
curVertex = parentEdge->GetParent();
curVertexDepth -= parentEdge->GetLength();
}
if (curVertex->GetSuffixLink() != NULL)
{
curVertex = curVertex->GetSuffixLink();
curVertexDepth--;
}
curExtension++;
}
} while (!isSuffixPresent && (curExtension <= curPhase));
}
//set absolute end indexes to leaf edges
for (vector<SuffixTreeEdge*>::iterator iter = edgeVector.begin();
iter != edgeVector.end(); iter++)
{
if ((*iter)->GetChild() == NULL)
{
(*iter)->SetEndIndex(nText->length()-1);
}
}
return root;
}