669 lines
15 KiB
C++
669 lines
15 KiB
C++
|
|
/*
|
|
----
|
|
This file is part of SECONDO.
|
|
|
|
Copyright (C) 2014, University in Hagen, Department of Computer Science,
|
|
Database Systems for New Applications.
|
|
|
|
SECONDO is free software; you can redistribute it and/or modify
|
|
it under the terms of the GNU General Public License as published by
|
|
the Free Software Foundation; either version 2 of the License, or
|
|
(at your option) any later version.
|
|
|
|
SECONDO is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
GNU General Public License for more details.
|
|
|
|
You should have received a copy of the GNU General Public License
|
|
along with SECONDO; if not, write to the Free Software
|
|
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
----
|
|
|
|
*/
|
|
|
|
#include <map>
|
|
#include <vector>
|
|
#include <sstream>
|
|
|
|
namespace regexcreator{
|
|
|
|
/*
Representation of a regular expression as a binary tree.

A node is either a leaf (EPSILON, or SIMPLE carrying one symbol in ~value~)
or an operator node. OR and CONCAT use both children, STAR and PLUS use only
~part1~. Every node owns its children: copying clones the complete subtree
and destruction releases it.

The factory functions ~makeor~, ~concat~, ~star~ and ~plus~ never modify
their arguments. They return a new tree and apply a few algebraic
simplifications on the way.

*/
class RegEx{

   enum RegExType {EPSILON,SIMPLE,OR,CONCAT,STAR,PLUS};

 public:

/*
Creates an epsilon transition.

*/
   RegEx(): type(EPSILON), part1(0), part2(0), value(0){ }

/*
Creates a transition using ~symbol~.

*/
   explicit RegEx(const char symbol):
       type(SIMPLE), part1(0), part2(0), value(symbol){ }

/*
The copy constructor. Both subtrees of ~src~ are cloned.

*/
   RegEx(const RegEx& src):
       type(src.type), part1(0), part2(0), value(src.value){
      if(src.part1){
         part1 = new RegEx(*src.part1);
      }
      if(src.part2){
         part2 = new RegEx(*src.part2);
      }
   }

/*
The assignment operator.

~src~ is cloned completely before anything owned by this node is released.
This makes self assignment and assignment from one of the own subtrees
(e.g. r = [*]r.part1) safe; releasing first would destroy ~src~ while it is
still being read.

*/
   RegEx& operator=(const RegEx& src){
      if(this != &src){
         RegEx tmp(src);
         type = tmp.type;
         value = tmp.value;
         // hand the old children over to tmp, its destructor releases them
         RegEx* old1 = part1;
         RegEx* old2 = part2;
         part1 = tmp.part1;
         part2 = tmp.part2;
         tmp.part1 = old1;
         tmp.part2 = old2;
      }
      return *this;
   }

/*
~Destructor~

Releases both subtrees.

*/
   ~RegEx(){
      delete part1;
      delete part2;
   }

/*
Checks whether this reg ex is an epsilon transition.

*/
   bool isEpsilon() const{
      return type==EPSILON;
   }

/*
Checks for structural equality: same node type, same symbol and pairwise
equal subtrees. Two expressions describing the same language by different
trees are not equal.

*/
   bool equals(const RegEx& r) const{
      if(type!=r.type || value!=r.value){
         return false;
      }
      return sameTree(part1,r.part1) && sameTree(part2,r.part2);
   }

/*
Creates a new reg ex by connecting two reg ex by or.

The simplification rules are tried for (r1,r2) first and for (r2,r1)
afterwards. If no rule fits, a plain OR node with r1 as left and r2 as
right child is returned.

*/
   static RegEx makeor(const RegEx& r1, const RegEx& r2){
      // A | A = A
      if(r1.equals(r2)){
         return RegEx(r1);
      }
      RegEx simplified;
      if(simplifyOr(r1,r2,simplified) || simplifyOr(r2,r1,simplified)){
         return simplified;
      }
      // no rule fits
      return RegEx(OR, new RegEx(r1), new RegEx(r2));
   }

/*
Builds the concatenation of r1 and r2.

*/
   static RegEx concat(const RegEx& r1, const RegEx& r2){
      // €A = A
      if(r1.type==EPSILON){
         return RegEx(r2);
      }
      // A€ = A
      if(r2.type==EPSILON){
         return RegEx(r1);
      }
      // A*A* = A*
      if(r1.type==STAR && r1.equals(r2)){
         return RegEx(r1);
      }
      // AA* = A+
      if(r2.type==STAR && r2.part1->equals(r1)){
         return plus(r1);
      }
      // A*A = A+
      // the operand of the star has to be compared with r2; comparing it
      // with r1 (a tree with one of its own subtrees) can never succeed
      if(r1.type==STAR && r1.part1->equals(r2)){
         return plus(r2);
      }
      return RegEx(CONCAT, new RegEx(r1), new RegEx(r2));
   }

/*
Builds the Kleene star of ~r~.

Epsilon stays epsilon. For an argument that already is a STAR or PLUS node
the star is built over its operand, i.e. (A[*])[*] = (A+)[*] = A[*].
~r~ itself is left unchanged.

*/
   static RegEx star(const RegEx& r){
      if(r.type==EPSILON){
         return RegEx(r);
      }
      const RegEx& operand = (r.type==STAR || r.type==PLUS) ? *r.part1 : r;
      return RegEx(STAR, new RegEx(operand), 0);
   }

/*
Builds the positive closure of ~r~.

Epsilon, STAR and PLUS arguments are returned as a copy because
€+ = €, (A[*])+ = A[*] and (A+)+ = A+.

*/
   static RegEx plus(const RegEx& r){
      if(r.type==EPSILON || r.type==STAR || r.type==PLUS){
         return RegEx(r);
      }
      return RegEx(PLUS, new RegEx(r), 0);
   }

/*
Appends the textual form of this expression to ~ss~ and returns ~ss~.

Brackets are written around an OR operand of a concatenation and around
every operand of a star or plus that is not a single symbol or epsilon.

*/
   std::stringstream& printTo(std::stringstream& ss) const{
      switch(type){
         case EPSILON : ss << "€";
                        break;
         case SIMPLE  : ss << value;
                        break;
         case OR      : part1->printTo(ss) << "|";
                        part2->printTo(ss);
                        break;
         case CONCAT  : printOperand(ss, *part1, part1->type==OR);
                        printOperand(ss, *part2, part2->type==OR);
                        break;
         case STAR    : printOperand(ss, *part1, !part1->isLeaf());
                        ss << "*";
                        break;
         case PLUS    : printOperand(ss, *part1, !part1->isLeaf());
                        ss << "+";
                        break;
      }
      return ss;
   }

 private:

   RegExType type;
   RegEx* part1;
   RegEx* part2;
   char value;

/*
Creates an operator node that takes ownership of the given children.

*/
   RegEx(RegExType t, RegEx* p1, RegEx* p2):
       type(t), part1(p1), part2(p2), value(0){ }

/*
Returns true for nodes without children (epsilon or a single symbol).

*/
   bool isLeaf() const{
      return type==EPSILON || type==SIMPLE;
   }

/*
Compares two possibly missing subtrees.

*/
   static bool sameTree(const RegEx* a, const RegEx* b){
      if(a==0 || b==0){
         return a==b;
      }
      return a->equals(*b);
   }

/*
Writes ~sub~ to ~ss~, enclosed in brackets if ~brackets~ is set.

*/
   static void printOperand(std::stringstream& ss, const RegEx& sub,
                            const bool brackets){
      if(brackets) ss << "(";
      sub.printTo(ss);
      if(brackets) ss << ")";
   }

/*
Tries to simplify a | b using the rules below in exactly this operand
order. Stores the simplified expression in ~result~ and returns true if a
rule fits; otherwise ~result~ is left untouched and false is returned.

*/
   static bool simplifyOr(const RegEx& a, const RegEx& b, RegEx& result){
      // A+ | € = A*
      if(a.type==PLUS && b.type==EPSILON){
         result = star(*a.part1);
         return true;
      }
      // all remaining rules require a concatenation on the left side
      if(a.type!=CONCAT){
         return false;
      }
      const RegEx& head = *a.part1;
      const RegEx& tail = *a.part2;
      // A*B | B = A*B
      if(tail.equals(b) && head.type==STAR){
         result = a;
         return true;
      }
      // AB* | A = AB*
      if(head.equals(b) && tail.type==STAR){
         result = a;
         return true;
      }
      // A+B | B = A*B
      if(head.type==PLUS && tail.equals(b)){
         result = concat(star(*head.part1), b);
         return true;
      }
      // AB+ | A = AB*
      if(head.equals(b) && tail.type==PLUS){
         result = concat(b, star(*tail.part1));
         return true;
      }
      if(b.type==CONCAT){
         // AB | CB = (A | C)B
         if(tail.equals(*b.part2)){
            result = concat(makeor(head,*b.part1), tail);
            return true;
         }
         // AB | AC = A(B | C)
         if(head.equals(*b.part1)){
            result = concat(head, makeor(tail,*b.part2));
            return true;
         }
      }
      return false;
   }

};
|
|
|
|
|
|
|
|
class Edge{
|
|
|
|
public:
|
|
|
|
Edge(int _target, std::string label):
|
|
target(_target), regex() {
|
|
if(label.size()>0){
|
|
regex = RegEx(label[0]);
|
|
}
|
|
}
|
|
|
|
Edge(const Edge& e): target(e.target), regex(e.regex) {}
|
|
|
|
Edge(const int _target, const RegEx& _regex):
|
|
target(_target), regex(_regex){
|
|
}
|
|
|
|
Edge& operator=(const Edge& e){
|
|
target = e.target;
|
|
regex = e.regex;
|
|
return *this;
|
|
}
|
|
|
|
~Edge(){}
|
|
|
|
|
|
RegEx getLabel() const{
|
|
return regex;
|
|
}
|
|
|
|
int getTarget() const {
|
|
return target;
|
|
}
|
|
|
|
private:
|
|
|
|
int target;
|
|
RegEx regex;
|
|
};
|
|
|
|
|
|
|
|
class DeaRegEx{
|
|
|
|
|
|
public:
|
|
|
|
|
|
/*
|
|
Creates an empty DEA
|
|
|
|
*/
|
|
DeaRegEx(): successors(), predecessors(), loops(), start(1), finals() {}
|
|
|
|
/*
|
|
Copy constructor
|
|
|
|
*/
|
|
DeaRegEx(const DeaRegEx& src):
|
|
successors(src.successors), predecessors(src.predecessors),
|
|
loops(src.loops),
|
|
start(src.start), finals(src.finals){}
|
|
|
|
/*
|
|
Destructor
|
|
|
|
*/
|
|
|
|
~DeaRegEx(){
|
|
for(size_t i=0;i<successors.size();i++){
|
|
if(successors[i]){
|
|
delete successors[i];
|
|
delete predecessors[i];
|
|
delete loops[i];
|
|
}
|
|
}
|
|
}
|
|
|
|
/*
|
|
Assigment operator
|
|
|
|
*/
|
|
DeaRegEx& operator=(const DeaRegEx& src){
|
|
successors = src.successors;
|
|
predecessors = src.predecessors;
|
|
loops = src.loops;
|
|
start = src.start;
|
|
finals = src.finals;
|
|
return *this;
|
|
}
|
|
|
|
/*
|
|
Sets the start state of this automaton
|
|
|
|
*/
|
|
void setStart(const int s){
|
|
start = s;
|
|
}
|
|
|
|
|
|
/*
|
|
Adds a final state to this automaton
|
|
|
|
*/
|
|
void addFinal(const int i){
|
|
finals.push_back(i);
|
|
}
|
|
|
|
|
|
/*
|
|
Add a transition to this automaton.
|
|
|
|
*/
|
|
|
|
void addEdge(int source, int target, char label){
|
|
addEdge(source,target, RegEx(label));
|
|
}
|
|
|
|
void addEdge(int source, int target, RegEx label){
|
|
|
|
// extend vectors for non existing source
|
|
for(int i=successors.size();i<=source;i++){
|
|
successors.push_back(0);
|
|
predecessors.push_back(0);
|
|
loops.push_back(0); // epsilon loop
|
|
}
|
|
// extend vectors for non existing target
|
|
for(int i=successors.size();i<=target;i++){
|
|
successors.push_back(0);
|
|
predecessors.push_back(0);
|
|
loops.push_back(0);
|
|
}
|
|
if(!successors[source]){
|
|
successors[source] = new std::vector<Edge>();
|
|
predecessors[source] = new std::vector<Edge>();
|
|
loops[source] = new RegEx();
|
|
}
|
|
if(!successors[target]){
|
|
successors[target] = new std::vector<Edge>();
|
|
predecessors[target] = new std::vector<Edge>();
|
|
loops[target] = new RegEx();
|
|
}
|
|
if(source!=target){
|
|
successors[source]->push_back(Edge(target,label));
|
|
predecessors[target]->push_back(Edge(source,label));
|
|
} else {
|
|
RegEx* l = loops[source];
|
|
if(l->isEpsilon()){
|
|
delete loops[source];
|
|
loops[source] = new RegEx(label);
|
|
} else {
|
|
RegEx l2 = RegEx::makeor(*l,label);
|
|
delete loops[source];
|
|
loops[source] = new RegEx(l2);
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
RegEx* computeRegEx(const bool usePrio){
|
|
|
|
if(successors.size()==0u || finals.size()==0u || start<0
|
|
|| (uint32_t)start >= successors.size()
|
|
|| successors[start]==0u){
|
|
return 0;
|
|
}
|
|
|
|
// add new start and new end if required
|
|
if( predecessors[start]->size()>0 // link to start
|
|
|| finals.size()>0 // more than one final
|
|
|| successors[finals[0]]->size()>0 // final has successors
|
|
|| !loops[start]->isEpsilon() // start contains a loop
|
|
|| !loops[finals[0]]->isEpsilon()){ // final contains a loop
|
|
extend();
|
|
}
|
|
|
|
|
|
// now, the automaton has exactly one start state without back edge to it
|
|
// and exactly one final state having no back edges from it
|
|
|
|
// replace parallel edges by regular expressions
|
|
|
|
uint32_t f = finals[0];
|
|
const uint32_t start = (uint32_t) this->start;
|
|
|
|
for(size_t i=0;i<successors.size();i++){
|
|
removeParallel(i);
|
|
}
|
|
|
|
|
|
// remove all nodes except s and f from the automaton
|
|
if(!usePrio){
|
|
for(size_t i=0;i<successors.size();i++){
|
|
if(i!=(uint32_t)start && i!=f){
|
|
removeNode(i);
|
|
}
|
|
}
|
|
} else {
|
|
int nodes = successors.size()-2; // remove all nodes excpet start and end
|
|
while(nodes > 0){
|
|
int index = -1;
|
|
int min = -1;
|
|
// search node where successors * predecsssors + loops is minimal
|
|
|
|
for(size_t n=0;n<successors.size();n++){
|
|
if(n!=start && n != f && successors[n]!=0){
|
|
int ne = successors[n]->size() * predecessors[n]->size();
|
|
if(!loops[n]->isEpsilon()){
|
|
ne++;
|
|
}
|
|
if(index < 0 || ne < min){
|
|
index = n;
|
|
min = ne;
|
|
}
|
|
}
|
|
}
|
|
removeNode(index);
|
|
nodes--;
|
|
}
|
|
}
|
|
|
|
// now, all nodes except start and final are removed
|
|
|
|
removeParallel(start);
|
|
|
|
|
|
return new RegEx(successors[start]->at(0).getLabel());
|
|
}
|
|
|
|
|
|
private:
|
|
|
|
std::vector<std::vector<Edge>* > successors;
|
|
std::vector<std::vector<Edge>* > predecessors;
|
|
std::vector<RegEx*> loops;
|
|
int start;
|
|
std::vector<int> finals;
|
|
|
|
|
|
/*
|
|
Adds a new start and a new final state to this automaton.
|
|
|
|
*/
|
|
void extend(){
|
|
int s = successors.size();
|
|
int f = s+1;
|
|
RegEx eps;
|
|
addEdge(s,start,eps);
|
|
|
|
for(size_t i=0;i<finals.size();i++) {
|
|
int of = finals[i];
|
|
addEdge(of,f,eps);
|
|
}
|
|
start = f-1;
|
|
finals.clear();
|
|
finals.push_back(f);
|
|
}
|
|
|
|
/*
|
|
Merges parallel edges in the automaton
|
|
|
|
*/
|
|
void removeParallel(){
|
|
for(size_t i=0;i<successors.size();i++){
|
|
removeParallel(i);
|
|
}
|
|
}
|
|
|
|
|
|
/*
|
|
Removes node ~node~ from the automaton
|
|
|
|
*/
|
|
void removeNode(int node){
|
|
std::vector<Edge>* suc = successors[node];
|
|
std::vector<Edge>* pred = predecessors[node];
|
|
RegEx* loop1 = loops[node];
|
|
RegEx loop = RegEx::star(*loop1);
|
|
successors[node] = 0;
|
|
predecessors[node] = 0;
|
|
loops[node] = 0;
|
|
|
|
for(size_t p=0;p<pred->size();p++){
|
|
Edge pe = pred->at(p);
|
|
removeTarget(successors[pe.getTarget()],node);
|
|
for(size_t s=0;s<suc->size();s++){
|
|
Edge su = suc->at(s);
|
|
if(p==0){
|
|
removeTarget(predecessors[su.getTarget()],node);
|
|
}
|
|
int src = pe.getTarget();
|
|
int target = su.getTarget();
|
|
RegEx peLab = pe.getLabel();
|
|
RegEx suLab = su.getLabel();
|
|
RegEx complete = RegEx::concat(RegEx::concat(peLab,loop),suLab);
|
|
addEdge(src,target,complete);
|
|
removeParallel(src);
|
|
}
|
|
}
|
|
delete suc;
|
|
delete pred;
|
|
delete loop1;
|
|
}
|
|
|
|
void removeTarget(std::vector<Edge>* v, int target){
|
|
for(int i=v->size()-1;i>=0;i--){
|
|
if(v->at(i).getTarget()==target){
|
|
v->erase(v->begin()+i);
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
// merges edges starting at src and
|
|
void removeParallel(int src){
|
|
std::vector<Edge>* suc = successors[src];
|
|
std::map<int,RegEx> amap;
|
|
for(size_t i=0;i<suc->size();i++){
|
|
Edge e = suc->at(i);
|
|
int t = e.getTarget();
|
|
if(amap.find(t) == amap.end()){
|
|
amap[t] = e.getLabel();
|
|
} else {
|
|
RegEx label = e.getLabel();
|
|
amap[t] = RegEx::makeor(amap[t],e.getLabel());
|
|
}
|
|
}
|
|
std::vector<Edge>* nsuc = new std::vector<Edge>();
|
|
std::map<int,RegEx>::iterator it;
|
|
for(it = amap.begin();it!=amap.end();it++){
|
|
nsuc->push_back(Edge(it->first,it->second));
|
|
int target = it->first;
|
|
removeTarget(predecessors[target],src);
|
|
predecessors[target]->push_back(Edge(src,it->second));
|
|
}
|
|
delete successors[src];
|
|
successors[src] = nsuc;
|
|
}
|
|
|
|
};
|
|
|
|
|
|
} // end of namespace regexcreator
|