/*
----
This file is part of SECONDO.

Copyright (C) 2004, University in Hagen, Department of Computer Science,
Database Systems for New Applications.

SECONDO is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

SECONDO is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with SECONDO; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
----

//paragraph [1] Title: [{\Large \bf \begin{center}] [\end{center}}]
//paragraph [10] Footnote: [{\footnote{] [}}]
//[TOC] [\tableofcontents]
//[_] [\_]

[1] Implementation of the Cluster Algebra

June, 2006. Basic functionality, one operator with default values and one
with maximal distance and minimal number of points as values. Only the type
'points' has been implemented so far.

[TOC]

1 Overview

This implementation file essentially contains the implementation of the
classes ~ClusterAlgebra~ and ~DBscan~, the latter of which contains the
actual cluster algorithm.

2 Defines and Includes

Eps is the maximal distance used by the cluster algorithm, i.e. the distance
by which the minimum number of points (MinPts) of a cluster may be apart. If
there are further points within the Eps-range of one of the points in the
cluster, these points (and further points reachable from them) belong to the
same cluster.

*/ #include "Algebra.h" #include "NestedList.h" #include "QueryProcessor.h" #include "StandardTypes.h" #include "Algebras/Spatial/SpatialAlgebra.h" #include "LogMsg.h" #include "MMRTree.h" #include #include #include #include extern NestedList* nl; extern QueryProcessor* qp; using namespace std; namespace clusteralg{ #define MINIMUMPTS_DEF 4 // default min points - MinPts #define EPS_DEF 400 // default max distance - Eps class DBscan; class DBscan { public: DBscan(); DBscan(Word*, Word&, int, Word&, Supplier, double**); int Parameter_Standard(double**,int); int Parameter_UserDefined(double**, int, int, int);//MinPts(int), Eps(int) void CopyToResult(Word*, Word&, int, Word&, Supplier, double**); private: int MinPts;//minimum number of points to be a cluster int Eps;//max distance for MinPts and further points in cluster int FindClusters(double**, int); // main method bool ExpandCluster(double**, int,int); void Search(double**, int,int, int*); }; /* 3.1 Type mapping function ~PointsTypeMapA~. Used for the ~cluster\_a~ operator with one argument (points object). Type mapping for ~cluster\_a~ is ---- points [->] points ---- */ static ListExpr PointsTypeMapA( ListExpr args ) { if ( nl->ListLength(args) == 1 ) { ListExpr arg1 = nl->First(args); if ( nl->IsEqual(arg1, Points::BasicType()) ) return nl->SymbolAtom(Points::BasicType()); } return nl->SymbolAtom(Symbol::TYPEERROR()); } /* 3.2 Type mapping function ~PointsTypeMapB~. Used for the ~cluster\_b~ operator with three arguments (points object, Eps). 
Type mapping for ~cluster\_b~ is ---- points[MinPts, Eps] [->] points ---- */ static ListExpr PointsTypeMapB( ListExpr args) { ListExpr arg1, arg2, arg3; if ( nl->ListLength(args) == 3 ) { arg1 = nl->First(args); // points arg2 = nl->Second(args); // MinPts --> int arg3 = nl->Third(args); // Eps --> int if ( ( nl->IsEqual(arg1, Points::BasicType())) && ( nl->IsEqual(arg2, CcInt::BasicType())) && ( nl->IsEqual(arg3, CcInt::BasicType()))) return nl->SymbolAtom(Points::BasicType()); } return nl->SymbolAtom(Symbol::TYPEERROR()); } static ListExpr cluster_c_TM(ListExpr args){ if(nl->ListLength(args)!=3){ ErrorReporter::ReportError("points x int x real expected"); return nl->TypeError(); } if(nl->IsEqual(nl->First(args),Points::BasicType()) && nl->IsEqual(nl->Second(args),CcInt::BasicType()) && nl->IsEqual(nl->Third(args),CcReal::BasicType())){ return nl->TwoElemList(nl->SymbolAtom(Symbol::STREAM()), nl->SymbolAtom(Points::BasicType())); } ErrorReporter::ReportError("points x int x real expected"); return nl->TypeError(); } static ListExpr cluster_d_TM(ListExpr args){ if((nl->ListLength(args)==2) && (nl->IsEqual(nl->First(args),Points::BasicType())) && (nl->IsEqual(nl->Second(args),CcReal::BasicType()))){ return nl->TwoElemList(nl->SymbolAtom(Symbol::STREAM()), nl->SymbolAtom(Points::BasicType())); } ErrorReporter::ReportError("points x real expected"); return nl->TypeError(); } /* 5.1 Value mapping function for operator ~cluster\_a~. Predefined values for Eps (distance) and MinPts (minimum number of points) for the cluster algorithm are used. First an array with four columns is set up, a pointer array is being used in all the DBscan-class functions for access to this array. 
*/ int cluster_aFun (Word* args, Word& result, int message, Word& local, Supplier s) { Points* ps = ((Points*)args[0].addr); int cpoints_size = ps->Size(); double** cpoints; // pointer-array to cpoints double* rcpoints; // real cpoints int nrows = ps->Size(); int ncols = 4; // cpoints: 0:x 1:y 2:border point 3:core point int a; // DATA array setup // allocate memory for array 'rcpoints' rcpoints = (double*) malloc(nrows * ncols * sizeof(double)); if (rcpoints == NULL) { printf("\nFailure to allocate room for the array\n"); exit(0); } // allocate memory for pointers to rows cpoints = (double**) malloc(nrows * sizeof(double *)); if (cpoints == NULL) { printf("\nFailure to allocate room for the pointers\n"); exit(0);} // point the pointers for (a = 0; a < nrows; a++) cpoints[a] = rcpoints + (a * ncols); // preset NOISE (0) and CORECHECK (0) for(a=0; a < nrows; a++){ cpoints[a][2] = 0.0; cpoints[a][3] = 0.0; } // copy x/y from input into cluster array 'cpoints' ps->StartBulkLoad(); // relax ordering if(ps->IsEmpty()) { ((Points*)result.addr)->SetDefined(false); free(rcpoints); free(cpoints); return 1; } for(int a = 0; a < ps->Size();a++) // transfer x/y-values { Point p; // to cluster array ps->Get(a, p); cpoints[a][0] = p.GetX(); cpoints[a][1] = p.GetY();} // end for ps->EndBulkLoad(true, false); // for testing copy input to output //Points* ps2; //ps2 = ps->Clone(); //(Points*)result.addr = ps2; // comment rest of function, if used /* Create an instance of DBscan. */ DBscan cluster; /* Here the no-parameter default setup function is being called, which itself calls the actual cluster algorithm. */ a = cluster.Parameter_Standard(cpoints ,cpoints_size); // debugging if ( RTFlag::isActive("ClusterText:Trace") ) {cmsg.file() << "Cluster:" " " "cluster_aFun Ergebnis: " << a << endl; cmsg.send();} /* Copy the result from the internal array 'cpoints' back into the result 'points' memory location. 
*/ cluster.CopyToResult(args,result, message, local, s, cpoints); free(rcpoints); return 0; } /* 5.2 Value mapping function for operator ~cluster\_b~. This function receives tweo arguments: Eps and MinPts, which are used for the cluster algorithm. The first part ist identical to operator ~cluster\_a~. */ int cluster_bFun (Word* args, Word& result, int message, Word& local, Supplier s) { Points* ps = ((Points*)args[0].addr); int cpoints_size = ps->Size(); double** cpoints; // pointer-array to cpoints double* rcpoints; // real cpoints int nrows = ps->Size(); int ncols = 4; // cpoints: 0:x 1:y 2:border point 3:core point int a; // DATA array setup // allocate memory for array 'rcpoints' rcpoints = (double*) malloc(nrows * ncols * sizeof(double)); if (rcpoints == NULL) { printf("\nFailure to allocate room for the array\n"); exit(0); } // allocate memory for pointers to rows cpoints = (double**) malloc(nrows * sizeof(double *)); if (cpoints == NULL) { printf("\nFailure to allocate room for the pointers\n"); exit(0);} // point the pointers for (a = 0; a < nrows; a++) cpoints[a] = rcpoints + (a * ncols); // preset NOISE (0) and CORECHECK (0) for(a=0; a < nrows; a++){ cpoints[a][2] = 0.0; cpoints[a][3] = 0.0;} // debugging if ( RTFlag::isActive("ClusterText:B") ) { cmsg.file() << "Cluster: cluster_bFun: " << endl; cmsg.send();} // copy x/y from input into cluster array 'cpoints' ps->StartBulkLoad(); // relax ordering if(ps->IsEmpty()) { ((Points*)result.addr)->SetDefined(false); free(rcpoints); free(cpoints); return 1; } for(int a = 0; a < ps->Size();a++) // transfer x/y-values { Point p; // to cluster array ps->Get(a, p); cpoints[a][0] = p.GetX(); cpoints[a][1] = p.GetY();} // end for ps->EndBulkLoad(true, false); // for testing copy input to output //Points* ps2; //ps2 = ps->Clone(); //(Points*)result.addr = ps2; // comment rest of function, if used DBscan cluster; // create DBscan object /* The following part is different from ~cluster\_a~. 
The parameters are eing used for the cluster algorithm. 1: MinPts (int) 2: Eps (int) */ CcInt* i1; CcInt* i2; i1 = ((CcInt*)args[1].addr); i2 = ((CcInt*)args[2].addr); int cMinPts = i1->GetIntval(); int cEps = i2->GetIntval(); //debugging if ( RTFlag::isActive("ClusterText:Trace") ) { cmsg.file() << "Cluster: cMinPts: ---------" << cMinPts << endl; cmsg.file() << "Cluster: cEps -------------" << cEps << endl; cmsg.send();} a = cluster.Parameter_UserDefined(cpoints, cpoints_size, cMinPts, cEps); // find cluster-points with user-defined parameters // returns number of clusters found if ( RTFlag::isActive("ClusterText:Trace") ) {cmsg.file() << "Cluster: cluster_bFun Ergebnis: " << a << endl; cmsg.send();} cluster.CopyToResult(args, result, message, local, s, cpoints); free(rcpoints); return 0; } /* 5.3 Value Mapping function for cluster[_]c */ class ClusterC_LocalInfo{ public: /* ~Constructor~ Creates a new local info from the value coming from the value mapping. */ ClusterC_LocalInfo(Points* pts, CcInt* minPts, CcReal* eps){ if(!pts->IsDefined() || !minPts->IsDefined() || !eps->IsDefined()){ defined = false; return; } this->pts = pts; this->minPts = max(0,minPts->GetIntval()); this->eps = eps->GetRealval(); this->eps2 = this->eps*this->eps; defined = true; size = pts->Size(); no = new int[size]; env1 = new set*[size]; pos = 0; // set all points to be UNCLASSIFIED // and clean all sets for(int i=0;i(); } computeEnv(); pos = 0; clusterId = 1; } /* ~Destructor~ */ ~ClusterC_LocalInfo(){ if(defined){ for(int i=0;i= 0 ){ // point already classified pos++; } else if ( env1[pos]->size()** env1; // environments; int size; // number of points int pos; // current position int clusterId; // current cluster id static const int UNCLASSIFIED = -1; static const int NOISE = -2; /* ~computeEnv~ This function computes the epsilon environment for each point contained in pts; */ void computeEnv(){ mmrtree::Rtree<2> tree(10,30); Point p; double min1[2]; double max1[2]; /* insert all 
contained points into an R- tree */ for(int i=0;iSize();i++){ pts->Get(i,p); double x = p.GetX(); double y = p.GetY(); min1[0] = x - FACTOR; min1[1] = y - FACTOR; max1[0] = x + FACTOR; max1[1] = y + FACTOR; /* upper y bound of the point's box; it used to be y - FACTOR, which made the box degenerate and placed it below the point */ Rectangle<2> box(true,min1,max1); tree.insert(box, i); } /* compute environments using filter /refine */ for(int i=0;iSize();i++){ pts->Get(i,p); set cands; double x = p.GetX(); double y = p.GetY(); min1[0] = x-eps; min1[1] = y-eps; max1[0] = x+eps; max1[1] = y+eps; Rectangle<2> searchbox(true, min1,max1); tree.findAll(searchbox,cands); set::iterator it; for(it = cands.begin(); it!=cands.end(); it++){ Point p2; int cand = static_cast(*it); pts->Get(cand,p2); if(qdist(p,p2)insert(cand); } } } } /* ~qdist~ This function computes the square of the distance between two point values. */ double qdist(Point& p1, Point& p2){ double x1 = p1.GetX(); double x2 = p2.GetX(); double y1 = p1.GetY(); double y2 = p2.GetY(); double dx = x1-x2; double dy = y1-y2; return dx*dx + dy*dy; } /* ~expand~ This function implements the expand algorithm of dbscan. 
*/ Points* expand(int pos){ Points* result = new Points(minPts); result->StartBulkLoad(); set seeds = *env1[pos]; no[pos] = clusterId; Point p; pts->Get(pos,p); (*result) += (p); seeds.erase(pos); while(!seeds.empty()){ int cpos = *(seeds.begin()); if(no[cpos]<0){ // not classified by another cluster no[cpos] = clusterId; pts->Get(cpos,p); (*result) += (p); set::iterator it; for(it = env1[cpos]->begin();it!=env1[cpos]->end(); it++){ if(no[*it]<0){ if(env1[*it]->size()>=minPts){ // a core point seeds.insert(*it); } else { // border point no[*it] = clusterId; pts->Get(*it,p); (*result) += p; } } } } seeds.erase(cpos); } result->EndBulkLoad(); return result; } }; int cluster_cFun (Word* args, Word& result, int message, Word& local, Supplier s) { switch(message){ case OPEN : { Points* pts = static_cast(args[0].addr); CcInt* minPts = static_cast(args[1].addr); CcReal* eps = static_cast(args[2].addr); local.setAddr(new ClusterC_LocalInfo(pts,minPts,eps)); return 0; } case REQUEST : { if(local.addr==0){ return CANCEL; } ClusterC_LocalInfo* linfo = static_cast(local.addr); Points* hasNext = linfo->getNext(); result.setAddr(hasNext); if(hasNext){ return YIELD; } else { return CANCEL; } } case CLOSE : { if(local.addr!=0){ delete static_cast(local.addr); local.setAddr(0); } return 0; } } return -1; // should never be reached } class ClusterG_LocalInfo{ public: /* ~Constructor~ Creates a new local info from the value coming from the value mapping. 
*/ ClusterG_LocalInfo(Points* pts, CcInt* minPts, CcReal* eps){ if(!pts->IsDefined() || !minPts->IsDefined() || !eps->IsDefined()){ defined = false; return; } this->pts = pts; this->minPts = max(0,minPts->GetIntval()); this->eps = eps->GetRealval(); this->eps2 = this->eps*this->eps; defined = true; size = pts->Size(); no = new int[size]; pos = 0; // set all points to be UNCLASSIFIED // and clean all sets for(int i=0;i= 0 ){ // point already classified pos++; } else { set* env = getEnv(pos); unsigned int size = env->size(); delete env; if ( size* tree; static const int UNCLASSIFIED = -1; static const int NOISE = -2; void createTree(){ tree = new mmrtree::Rtree<2>(10,30); Point p; double min1[2]; double max1[2]; /* insert all contained points into an R- tree */ for(int i=0;iSize();i++){ pts->Get(i,p); double x = p.GetX(); double y = p.GetY(); min1[0] = x - FACTOR; min1[1] = y - FACTOR; max1[0] = x + FACTOR; max1[1] = y + FACTOR; /* upper y bound of the point's box; it used to be y - FACTOR, which made the box degenerate and placed it below the point */ Rectangle<2> box(true,min1,max1); tree->insert(box, i); } } set* getEnv(int pos){ set* res = new set(); Point p; pts->Get(pos,p); set cands; double x = p.GetX(); double y = p.GetY(); double min1[2]; double max1[2]; min1[0] = x-eps; min1[1] = y-eps; max1[0] = x+eps; max1[1] = y+eps; Rectangle<2> searchbox(true, min1,max1); tree->findAll(searchbox,cands); set::iterator it; for(it = cands.begin(); it!=cands.end(); it++){ Point p2; int cand = static_cast(*it); pts->Get(cand,p2); if(qdist(p,p2)insert(cand); } } return res; } unsigned int getEnvSize(int pos){ int res = 0; Point p; double min1[2]; double max1[2]; pts->Get(pos,p); set cands; double x = p.GetX(); double y = p.GetY(); min1[0] = x-eps; min1[1] = y-eps; max1[0] = x+eps; max1[1] = y+eps; Rectangle<2> searchbox(true, min1,max1); tree->findAll(searchbox,cands); set::iterator it; for(it = cands.begin(); it!=cands.end(); it++){ Point p2; int cand = static_cast(*it); pts->Get(cand,p2); if(qdist(p,p2)StartBulkLoad(); set* seeds = getEnv(pos); no[pos] = clusterId; Point p; pts->Get(pos,p); 
(*result) += (p); seeds->erase(pos); while(!seeds->empty()){ int cpos = *(seeds->begin()); if(no[cpos]<0){ // not classified by another cluster no[cpos] = clusterId; pts->Get(cpos,p); (*result) += (p); tree->erase(p.BoundingBox(),cpos); set::iterator it; set* env = getEnv(cpos); for(it = env->begin();it!=env->end(); it++){ if(no[*it]<0){ if(getEnvSize(*it)>=minPts){ // a core point seeds->insert(*it); } else { // border point no[*it] = clusterId; pts->Get(*it,p); (*result) += p; tree->erase(p.BoundingBox(),*it); } } } delete env; } seeds->erase(cpos); } result->EndBulkLoad(); delete seeds; return result; } }; int cluster_gFun (Word* args, Word& result, int message, Word& local, Supplier s) { switch(message){ case OPEN : { Points* pts = static_cast(args[0].addr); CcInt* minPts = static_cast(args[1].addr); CcReal* eps = static_cast(args[2].addr); local.setAddr(new ClusterG_LocalInfo(pts,minPts,eps)); return 0; } case REQUEST : { if(local.addr==0){ return CANCEL; } ClusterG_LocalInfo* linfo = static_cast(local.addr); Points* hasNext = linfo->getNext(); result.setAddr(hasNext); if(hasNext){ return YIELD; } else { return CANCEL; } } case CLOSE : { if(local.addr!=0){ delete static_cast(local.addr); local.setAddr(0); } return 0; } } return -1; // should never be reached } /* ClusterD ValueMapping */ class Edge{ public: int src; int dest; double cost; Edge(int src, int dest, double cost){ this->src = src; this->dest = dest; this->cost = cost; } Edge(const Edge& edge){ equalize(edge); } ~Edge(){} Edge& operator=(const Edge& src){ equalize(src); return *this; } bool operator<(const Edge& e)const{ if(cost < e.cost) return true; if(cost > e.cost) return false; if(src < e.src) return true; if(src > e.src) return false; return dest < e.dest; } bool operator>(const Edge& e) const{ if(cost > e.cost) return true; if(cost < e.cost) return false; if(src > e.src) return true; if(src < e.src) return false; return dest > e.dest; } bool operator==(const Edge& e)const{ return cost == 
e.cost && src == e.src && dest == e.dest; } ostream& printTo(ostream& o)const{ return (o << "(" << src << " -> " << dest << ", " << cost <<")"); } private: void equalize(const Edge& src){ this->cost = src.cost; this->dest = src.dest; this->src = src.src; } }; ostream& operator<<(ostream& o, const Edge& e){ return e.printTo(o); } ostream& operator<<(ostream& o, const set& e){ set::iterator it; o << "{"; for(it = e.begin(); it!=e.end();it++){ if(it!=e.begin()){ o << ", "; } o << *it; } o << "}"; return o; } struct intset{ intset():member(),refs(1){} void deleteIfAllowed(){ refs--; if(refs<1){ delete this; } } set member; int refs; }; struct cCluster{ cCluster(){ cx = 0.0; cy = 0.0; member = new intset(); forbidden = false; } cCluster(const cCluster& c){ cx = c.cx; cy = c.cy; forbidden = c.forbidden; member = c.member; member->refs++; } cCluster& operator=(const cCluster& c){ cx = c.cx; cy = c.cy; forbidden = c.forbidden; c.member->refs++; /* take the new reference first, so that self-assignment is safe */ member->deleteIfAllowed(); /* release the set held so far; it used to be overwritten and leaked */ member = c.member; return *this; } ~cCluster(){ member->deleteIfAllowed(); } set::iterator begin(){ return member->member.begin(); } set::iterator end(){ return member->member.end(); } size_t size(){ return member->member.size(); } void insert(int i){ member->member.insert(i); } void erase(int i){ member->member.erase(i); } void clear(){ member->member.clear(); } double cx; double cy; intset* member; // avoid copying of this set !!!! bool forbidden; }; class ClusterD_LocalInfo{ public: /* ~Constructor~ Creates a new localinfo for the cluster[_]d operator. The complete clustering is done here. */ ClusterD_LocalInfo(Points* pts, CcReal* eps){ env = 0; currentInitialCluster = 0; currentInitialPos = 0; if(pts->IsDefined() && eps->IsDefined()){ this->defined = true; this->eps = eps->GetRealval(); this->eps2 = this->eps*this->eps; this->pts = static_cast<Points*>(pts->Copy()); /* keep the pointer returned by Copy(); the reference is taken only here because only this branch stores a pointer that the destructor releases */ size = pts->Size(); icluster = new int[size]; for(int i=0;idefined = 0; this->pts = 0; this->eps = 0.0; } } /* ~Destructor~ Destroys this instance. 
*/ ~ClusterD_LocalInfo(){ pts->DeleteIfAllowed(); if(icluster){ delete[] icluster; icluster=0; } if(fcluster){ delete[] fcluster; fcluster=0; } if(env){ for(int i=0;i** env; int currentInitialPos; int currentCNum; set* currentInitialCluster; map > currentFinalCluster; map >::iterator currentFinalPos; int* origPos; /* ~computeEnv~ This function computes the epsilon environment for each point contained in pts; */ void computeEnv(){ env = new set*[size]; mmrtree::Rtree<2> tree(10,30); Point p; double min1[2]; double max1[2]; /* insert all contained points into an R- tree */ for(int i=0;iSize();i++){ pts->Get(i,p); double x = p.GetX(); double y = p.GetY(); min1[0] = x - FACTOR; min1[1] = y - FACTOR; max1[0] = x + FACTOR; max1[1] = y - FACTOR; Rectangle<2> box(true,min1,max1); tree.insert(box, i); } /* compute environments using filter /refine */ for(int i=0;iSize();i++){ env[i] = new set(); pts->Get(i,p); set cands; double x = p.GetX(); double y = p.GetY(); min1[0] = x-eps; min1[1] = y-eps; max1[0] = x+eps; max1[1] = y+eps; Rectangle<2> searchbox(true, min1,max1); tree.findAll(searchbox,cands); set::iterator it; for(it = cands.begin(); it!=cands.end(); it++){ Point p2; int cand = static_cast(*it); pts->Get(cand,p2); if(qdist(p,p2)insert(cand); } } } } /* ~qdist~ ~qdist~ computes the square of the distance between two points */ double qdist(const Point& p1, const Point& p2){ double x1 = p1.GetX(); double x2 = p2.GetX(); double y1 = p1.GetY(); double y2 = p2.GetY(); double dx = x1-x2; double dy = y1-y2; return dx*dx + dy*dy; } double qdist(double x1, double y1, double x2, double y2){ double dx = x1-x2; double dy = y1-y2; return dx*dx + dy*dy; } /* ~getNextInitialCluster~ This function extracts the next initial clsuter from the original point. If the poinst value is exhausted, null is returned. The caller of this function has to delete the returned value. 
*/ set* getNextInitialCluster(){ while(currentInitialPos=0){ currentInitialPos++; } if(currentInitialPos>=size){ // set exhausted return 0; } set* res = new set(); set seed(*env[currentInitialPos]); while(!seed.empty()){ int p = *(seed.begin()); if(icluster[p]<0){ // point is "free" icluster[p] = 1; // mark as assigned res->insert(p); for(set::iterator it=env[p]->begin(); it!= env[p]->end(); it++){ if(icluster[*it]<0){ seed.insert(*it); } } } seed.erase(p); } return res; } /* ~computeFinalCluster0~ Divides the points value ps into a set of clusters. The first number is set to cnum. Cnum is increased automatically. */ void computeFinalCluster0(){ currentFinalCluster.clear(); const int size = currentInitialCluster->size(); // store all edges into an vector and // build a single cluster for each point Point p_i; Point p_j; vector edges; int tmpfcluster[size]; if(origPos){ delete[] origPos; } origPos = new int[size]; // initialize origPos for(int i=0;i rev; int pos = 0; set::iterator it1; for(it1=currentInitialCluster->begin(); it1!=currentInitialCluster->end(); it1++){ rev[*it1] = pos; pos++; } pos = 0; for(it1=currentInitialCluster->begin(); it1!=currentInitialCluster->end(); it1++){ origPos[pos] = *it1; tmpfcluster[pos] = pos; //each point builds its own cluster set s; s.insert(pos); currentFinalCluster[pos] = s; pos++; set* e = env[*it1]; set::iterator it2; Point p1; pts->Get(*it1,p1); for(it2=e->begin();it2!=e->end();it2++){ Point p2; pts->Get(*it2,p2); double dist = qdist(p1,p2); int src = rev[*it1]; int dest = rev[*it2]; if(src::iterator it; // insert edges and connect clusters for(it = edges.begin(); it!=edges.end();it++){ Edge e = *it; int c1 = tmpfcluster[e.src]; int c2 = tmpfcluster[e.dest]; if(c1!=c2){ // otherwise the points are already in the same cluster // compute the maximum distance between points of c1 // and points of c2 set::iterator it1,it2; set s1 = currentFinalCluster[c1]; set s2 = currentFinalCluster[c2]; double dist = 0.0; for(it1=s1.begin(); 
it1!=s1.end() && dist <= eps2; it1++){ pts->Get(origPos[*it1],p_i); for(it2=s2.begin();it2!=s2.end() && dist <= eps2; it2++){ pts->Get(origPos[*it2],p_j); dist = max(dist,qdist(p_i,p_j)); } } if(dist <=eps2){ // build the union of the clusters for(it2=s2.begin();it2!=s2.end();it2++){ s1.insert(*it2); tmpfcluster[*it2] = c1; } currentFinalCluster.erase(c1); currentFinalCluster.erase(c2); currentFinalCluster[c1] = s1; } } } currentFinalPos = currentFinalCluster.begin(); // at this currentFinalPos = currentFinalCluster.begin(); } /* Another method for dividing a group into several subgroups. */ void insertPoint(vector& clusters, int pos){ Point p; pts->Get(pos,p); double x = p.GetX(); double y = p.GetY(); // first cluster if(clusters.empty()){ cCluster cl; cl.cx = x; cl.cy = y; cl.forbidden = false; cl.insert(pos); clusters.push_back(cl); return; } int index = -1; double bestDist=eps2+10; for(unsigned int i=0;i clusters[i].size())))){ index = i; bestDist = dist; } } } if((index < 0) || (bestDist > eps2)){ // no cluster found, produce a new one cCluster cl; cl.cx = x; cl.cy = y; cl.forbidden = false; cl.insert(pos); clusters.push_back(cl); return; } // insert the point into the best cluster clusters[index].cx = (clusters[index].cx * clusters[index].size() + x ) / (clusters[index].size() +1); clusters[index].cy = (clusters[index].cy * clusters[index].size() + y ) / (clusters[index].size() +1 ); clusters[index].insert(pos); // check whether some points are outside the cluster Point pc(true,clusters[index].cx, clusters[index].cy); set removed; set::iterator it; Point p2; for(it = clusters[index].begin(); it != clusters[index].end(); it++){ pts->Get(*it,p2); if(qdist(pc,p2) > eps2){ removed.insert(*it); } } // remove 'bad' points from the cluster double sx = 0.0; double sy = 0.0; for(it = removed.begin(); it!=removed.end();it++){ Point p3; pts->Get(*it,p3); sx += p3.GetX(); sy += p3.GetY(); clusters[index].erase(*it); } /* // we avoid to correct the center again because of 
this correction // further points may go out from the cluster // thiy may lead to long running times // correct the center clusters[index].cx = ((clusters[index].cx * (clusters[index].member.size() + removed.size())) - sx) / clusters[index].member.size(); clusters[index].cy = ((clusters[index].cy * (clusters[index].member.size() + removed.size())) - sy) / clusters[index].member.size(); */ // inserts the points again clusters[index].forbidden = true; for(it = removed.begin(); it!=removed.end();it++){ insertPoint(clusters,*it); } clusters[index].forbidden = false; } void insertPointSimple(vector& clusters, int pos){ Point p; pts->Get(pos,p); double x = p.GetX(); double y = p.GetY(); // clusters has to be non-empty int index = 0; int size = clusters.size(); double bestDist = qdist(x,y,clusters[index].cx,clusters[index].cy); for(int i=1;i clusters[i].size()))){ index = i; bestDist = dist; } } if(!(bestDist <= eps2)){ cout << "Error a point was not assigned to a cluster " << endl; cout << "The position of the point was " << pos << endl; cout << "The dist is " << bestDist << endl; cout << "Allowed dist " << eps2 << endl; cout << "Best cluster " << index << endl; cout << "#cluster " << size << endl; assert(false); } clusters[index].insert(pos); } void computeFinalCluster1(){ currentFinalCluster.clear(); vector currentCluster; // insert the points set::iterator it1; for( it1 = currentInitialCluster->begin(); it1 != currentInitialCluster->end(); it1++){ insertPoint(currentCluster,*it1); } // by the movement of the center, some // clusters may have unhandsome overlappings // we will redistribute the points located in such // overlappings // redistribute the points vector::iterator it3; for(it3 = currentCluster.begin(); it3 != currentCluster.end(); it3++){ ((*it3)).clear(); } for( it1 = currentInitialCluster->begin(); it1 != currentInitialCluster->end(); it1++){ insertPointSimple(currentCluster,*it1); } // copy the result into currentFinalCluster int i = 0; vector::iterator 
it2; for(it2 = currentCluster.begin(); it2 != currentCluster.end(); it2++){ currentFinalCluster[i++] = (*it2).member->member; } currentFinalPos = currentFinalCluster.begin(); } /* Returns the next cluster */ Points* getNextFinalCluster(int method=0){ if(!defined) return 0; if(!currentInitialCluster){ currentInitialCluster = getNextInitialCluster(); if(currentInitialCluster){ switch(method){ case 0: computeFinalCluster0(); break; case 1: computeFinalCluster1(); break; default: assert(false); } } else { return 0; } } else if(currentFinalPos == currentFinalCluster.end()){ delete currentInitialCluster; currentInitialCluster = getNextInitialCluster(); if(currentInitialCluster){ switch(method){ case 0: computeFinalCluster0(); break; case 1: computeFinalCluster1(); break; default: assert(false); } } else{ return 0; } } set cs = (*currentFinalPos).second; Points* res = new Points(cs.size()); res->StartBulkLoad(); Point p; set::iterator it; for(it = cs.begin();it!=cs.end();it++){ switch(method){ case 0 : pts->Get(origPos[*it],p); break; case 1 : pts->Get(*it,p); break; default : assert(false); } (*res) += p; } res->EndBulkLoad(); currentFinalPos++; return res; } }; template int cluster_dFun (Word* args, Word& result, int message, Word& local, Supplier s) { switch(message){ case OPEN : { Points* pts = static_cast(args[0].addr); CcReal* eps = static_cast(args[1].addr); local.setAddr(new ClusterD_LocalInfo(pts,eps)); return 0; } case REQUEST : { if(local.addr==0){ return CANCEL; } ClusterD_LocalInfo* linfo = static_cast(local.addr); Points* hasNext = linfo->getNext(i); result.setAddr(hasNext); if(hasNext){ return YIELD; } else { return CANCEL; } } case CLOSE : { if(local.addr!=0){ delete static_cast(local.addr); local.addr = 0; } return 0; } } return -1; // should never be reached } /* 1.3 Value Mapping for cluster[_]f Cluster[_]f implements the same slgoithms as cluster[_]e. 
The difference is that this algoritms avoids the preprocessing step to avoid large allocations of memory. Instead of that, an R-tree is used to manage the centers. 1.3.1 LocalInfo */ class ClusterF_LocalInfo{ public: /* ~Constructor~ Here, the complete work is done. */ ClusterF_LocalInfo(Points* pts, CcReal* eps){ this->pts = static_cast(pts->Copy()); if(!pts->IsDefined() || !eps->IsDefined()){ defined = false; cluster = 0; size = 0; } else { defined = true; size = pts->Size(); this->eps = max(FACTOR,eps->GetRealval()); this->eps2 = this->eps * this->eps; cluster = new vector(); computeCluster(); pos=0; no_cluster = cluster->size(); } } /* ~Destructor~ Destroys this structure. */ ~ClusterF_LocalInfo(){ pts->DeleteIfAllowed(); if(cluster){ delete cluster; } } /* ~getNext~ This function returns the next cluster as a points value. */ Points* getNext(){ if(!defined){ return 0; } else if (pos>= no_cluster){ return 0; } else { Points* res = new Points((*cluster)[pos].size()); res->StartBulkLoad(); set::iterator it; Point p; for(it = (*cluster)[pos].begin(); it != (*cluster)[pos].end(); it++){ pts->Get(*it,p); (*res) += p; } res->EndBulkLoad(); pos++; return res; } } /* 2 Private Part */ private: /* ~data members~ */ Points* pts; // source points value double eps; // maximum deviation double eps2; // = eps * eps bool defined; // true if the inputs are correct int size; // = pts->Size() int pos; // the current cluster int no_cluster; // number of clusters vector* cluster; // the clusters /* ~qdist~ Returns the square of the Euclidean distance between the points defined by (x1, y1) and (x2,y2). */ double qdist(const double x1,const double y1, const double x2, const double y2) const{ double dx = x2-x1; double dy = y2-y1; return dx*dx + dy * dy; } /* ~indexOfNearestCluster~ Computes the index of the cluster whose center is closest to p within the cluster vector. If all clusters have a minimum distance larger than eps, -1 will be returned. 
The r-tree is used as index and has to contain all cluster centers. */ int indexOfNearestCluster(const mmrtree::Rtree<2>& tree, const Point& p) const{ int res = -1; double bestDist = eps2 + 10.0; double min[2]; double max[2]; double x = p.GetX(); double y = p.GetY(); min[0] = x - eps - FACTOR; min[1] = y - eps - FACTOR; max[0] = x + eps + FACTOR; max[1] = y + eps + FACTOR; Rectangle<2> searchbox(true,min,max); set cands; tree.findAll(searchbox,cands); set::iterator it; for(it = cands.begin(); it != cands.end(); it++){ cCluster c = cluster->at(*it); double d = qdist(c.cx,c.cy,x,y); if(d <= eps2 && d < bestDist && !c.forbidden){ bestDist = d; res = *it; } } return res; } /* ~insertPointSimple~ This function assigns the point at position ~pos~ in the ~pts~ member variable to the nearest cluster w.r.t. its center. The cluster itself remains unchanged, i.e. the center is not moved. */ void insertPointSimple(const mmrtree::Rtree<2>& tree, const int pos){ Point p; pts->Get(pos,p); int index = indexOfNearestCluster(tree,p); assert(index >= 0); (*cluster)[index].insert(pos); } /* ~insertPoint~ Inserts a point to the nearest cluster. If no appropriate cluster is found, a new one is created. The center of the cluster is changed to be the center of all points within the cluster including that one at positon ~pos~. Thereby, some points of the cluster may exceed the maximum distance to the cluster's center. Such points are reinserted recursively but the source cluster is locked. 
*/
  void insertPoint(mmrtree::Rtree<2>& tree, const int pos){
    Point p;
    pts->Get(pos,p);
    int index = indexOfNearestCluster(tree,p);
    double min[2];
    double max[2];
    double x = p.GetX();
    double y = p.GetY();
    if(index <0){ // no appropriate cluster found, build a new one
      cCluster c;
      c.cx = x;                       // the point itself is the first center
      c.cy = y;
      c.insert(pos);
      c.forbidden = false;
      cluster->push_back(c);
      // a center is stored in the tree as a box reaching FACTOR in each
      // direction around it; the id is the cluster's index in the vector
      min[0] = x - FACTOR;
      min[1] = y - FACTOR;
      max[0] = x + FACTOR;
      max[1] = y + FACTOR;
      Rectangle<2> box(true,min,max);
      tree.insert(box,cluster->size()-1);
      return;
    }

    (*cluster)[index].insert(pos);
    // remember the old center, it is needed to remove the old tree entry
    double cx = (*cluster)[index].cx;
    double cy = (*cluster)[index].cy;
    int s = (*cluster)[index].size(); // size including the new point
    // incremental mean: the old center is weighted with the s-1 former
    // points
    (*cluster)[index].cx = ( (cx * (s - 1.0) + x) / s);
    (*cluster)[index].cy = ( (cy * (s - 1.0) + y) / s);

    // move the tree entry: erase the box of the old center ...
    min[0] = cx - FACTOR;
    min[1] = cy - FACTOR;
    max[0] = cx + FACTOR;
    max[1] = cy + FACTOR;
    Rectangle<2> erasebox(true,min,max);
    tree.erase(erasebox, index);

    // ... and insert the box of the new center under the same index
    min[0] = (*cluster)[index].cx - FACTOR;
    min[1] = (*cluster)[index].cy - FACTOR;
    max[0] = (*cluster)[index].cx + FACTOR;
    max[1] = (*cluster)[index].cy + FACTOR;
    Rectangle<2> newCenter(true,min,max);
    tree.insert(newCenter,index);

    // the moved center may have left some members too far away
    repairClusterAt(index,tree);
  }

/*
~repairClusterAt~

Removes all points exceeding the maximum allowed distance to the cluster's
center from the cluster at ~index~. Such points are reinserted via
~insertPoint~. While this happens, the cluster's ~forbidden~ flag is set;
it is cleared again at the end.

*/
  void repairClusterAt(const int index, mmrtree::Rtree<2>& tree){
    (*cluster)[index].forbidden = true;
    double cx = (*cluster)[index].cx;
    double cy = (*cluster)[index].cy;
    // NOTE(review): the element type of these sets does not show up in this
    // view of the file - confirm against the complete source.
    set wrong;
    set::iterator it;
    Point p;
    // 1st pass: only collect the members that are too far away; the
    // cluster is not modified while it is being iterated
    for(it = (*cluster)[index].begin(); it!= (*cluster)[index].end(); it++){
      pts->Get(*it,p);
      double d = qdist(cx,cy, p.GetX(),p.GetY());
      if(d>eps2){
        wrong.insert(*it);
      }
    }
    // 2nd pass: remove them from this cluster
    for( it=wrong.begin(); it!=wrong.end(); it++){
      (*cluster)[index].erase(*it);
    }
    // 3rd pass: reinsert them (recursion via insertPoint)
    for( it=wrong.begin(); it!=wrong.end(); it++){
      insertPoint(tree,*it);
    }
    (*cluster)[index].forbidden = false;
  }

/*
~computeCluster~

This function divides a points value into a set of clusters.
*/ void computeCluster(){ mmrtree::Rtree<2> tree(3,6); for(int i=0;isize();i++){ (*cluster)[i].clear(); } for(int i=0;i(args[0].addr); CcReal* eps = static_cast(args[1].addr); local.setAddr(new ClusterF_LocalInfo(pts,eps)); return 0; } case REQUEST : { if(local.addr==0){ return CANCEL; } ClusterF_LocalInfo* linfo = static_cast(local.addr); Points* hasNext = linfo->getNext(); result.setAddr(hasNext); if(hasNext){ return YIELD; } else { return CANCEL; } } case CLOSE : { if(local.addr!=0){ delete static_cast(local.addr); local.addr = 0; } return 0; } } return -1; // should never be reached } /* 6.1 Specification Strings for Operator cluster\_a */ const string cluster_aSpec = "( ( \"Signature\" \"Syntax\" \"Meaning\" " "\"Example\" ) " "( points -> points" "cluster_a ( _ )" "Find cluster for" " points with standard cluster parameters. " "[ADVICE: Do not use.]" "query cluster_a (Kneipen)" ") )"; /* 6.2 Specification Strings for Operator cluster\_b */ const string cluster_bSpec = "( ( \"Signature\" \"Syntax\" \"Meaning\" " "\"Example\" ) " "( points -> points" "_ cluster_b [_, _] " "Find cluster for" " points with parameters MinPts (1) and Eps (2). " "[ADVICE: Do not use.]" "query Kneipen cluster_b[5,200]" ") )"; /* 6.3 Specification string for Operator cluster\_c */ const string cluster_cSpec = "( ( \"Signature\" \"Syntax\" \"Meaning\" " "\"Example\" ) " "( points x int x real -> stream(points)" " _ cluster_c [ minpts, epsilon ] " "For a point set given as a points value, compute the clusters using " "the DBSCAN algorithm with parameters minPts (minimum number of points " "forming a cluster core) and epsilon (maximum distance between points in " "a cluster core). " "Returns a stream of points values (point sets) representing the clusters. 
" "" "query Kneipen cluster_c[5,200.0] count" ") )"; const string cluster_gSpec = "( ( \"Signature\" \"Syntax\" \"Meaning\" " "\"Example\" ) " "( points x int x real -> stream(points)" " _ cluster_g [ minpts, epsilon] " "For a point set given as a points value, compute the clusters using " "the DBSCAN algorithm with parameters minPts (minimum number of points " "forming a cluster core) and epsilon (maximum distance between points in " "a cluster core). [Alternative implementation?]." "Returns a stream of points values (point sets) representing the clusters. " "query Kneipen cluster_g[5,200.0] count" ") )"; /* 6.4 Specification string for Operator cluster\_d */ const string cluster_dSpec = "( ( \"Signature\" \"Syntax\" \"Meaning\" " "\"Example\" ) " "( points x real -> stream(points)" " _ cluster_d [ maxdist ] " "For a point set given as a points value, compute the clusters " "using the a distance-based clustering algorithm with parameter maxdist " "(maximum distance between points within a cluster. Returns a stream of " "points values (point sets) representing the clusters. 
" "query Kneipen cluster_b[200.0] count" ") )"; /* 7.1 Operator cluster\_a */ Operator cluster_a ( "cluster_a", //name cluster_aSpec, //specification cluster_aFun, //value mapping Operator::SimpleSelect, //trivial selection function PointsTypeMapA //type mapping ); /* 7.2 Operator cluster\_b */ Operator cluster_b ( "cluster_b", //name cluster_bSpec, //specification cluster_bFun, //value mapping Operator::SimpleSelect, //trivial selection function PointsTypeMapB //type mapping ); /* 7.3 Operator cluster[_]c */ Operator cluster_c ( "cluster_c", //name cluster_cSpec, //specification cluster_cFun, //value mapping Operator::SimpleSelect, //trivial selection function cluster_c_TM //type mapping ); Operator cluster_g ( "cluster_g", //name cluster_gSpec, //specification cluster_gFun, //value mapping Operator::SimpleSelect, //trivial selection function cluster_c_TM //type mapping // equals to c ); /* 7.4 Operator cluster[_]d */ Operator cluster_d ( "cluster_d", //name cluster_dSpec, //specification cluster_dFun<0>, //value mapping Operator::SimpleSelect, //trivial selection function cluster_d_TM //type mapping ); Operator cluster_e ( "cluster_e", //name cluster_dSpec, //specification cluster_dFun<1>, //value mapping Operator::SimpleSelect, //trivial selection function cluster_d_TM //type mapping ); Operator cluster_f ( "cluster_f", //name cluster_dSpec, //specification cluster_fFun, //value mapping Operator::SimpleSelect, //trivial selection function cluster_d_TM //type mapping ); /* 10.1 class DBscan (cluster algorithm) */ DBscan::DBscan() // Constructor { //Default Constructor - does nothing return; } /* 10.2 Function FindClusters This function is being called through the 'Parameter-' functions, which set up Eps and MinPts. It loops through each point and passes it on to the 'ExpandCluster' function if the point has not been classified as a cluster member yet. 
*/ int DBscan::FindClusters(double** cpoints, int cpoints_size){ int point; // counter float percentage = 0.0; int anzahl = 0; // iterate all cpoints for(point=0; point < cpoints_size; point++) if (cpoints[point][2] == 0.0) // not yet classified as cluster member if(!ExpandCluster(cpoints, cpoints_size, point)) if ( RTFlag::isActive("ClusterText:Trace") ) { cmsg.file() << "Cluster: Problem with ExpandCluster " << endl; cmsg.send();} // calculate percentage of cluster-cpoints for(point=0; point < cpoints_size; point++) if(cpoints[point][2] > 0) { percentage++; anzahl++; } percentage = percentage/(float)point; if ( RTFlag::isActive("ClusterText:Trace") ) { cmsg.file() << "Cluster: Percentage: " << (percentage*100) << endl << " EPS: " << Eps << endl << " MinPts: " << MinPts << endl; cmsg.send();} return (int)(percentage*100); // return percentage of points in cluster } /* 10.3 Function Parameter\_Standard This function only sets MinPts and Eps to the \#DEFINE values and calls the function 'FindClusters'. */ int DBscan::Parameter_Standard(double** cpoints,int cpoints_size) { MinPts = MINIMUMPTS_DEF; Eps = EPS_DEF; int res; // call FindClusters res = FindClusters(cpoints,cpoints_size); return res; } /* 10.4 Function Parameter\_UserDefined Similar to the function 'Parameter\_Standard', but sets MinPts and Eps to the parameter values. */ int DBscan::Parameter_UserDefined(double** cpoints,int cpoints_size, int MinPts_user, int Eps_user){ MinPts = MinPts_user; Eps = Eps_user; int res; res = FindClusters(cpoints,cpoints_size); return res; } /* 10.5 Function ExpandCluster This function checks, if the passed point is member of a cluster and - if so - checks for further members. For this, the function 'Search' is being used. 
*/ bool DBscan::ExpandCluster(double** cpoints,int cpoints_size,int point) { int* seeds; int a = 0; seeds = (int*) malloc((cpoints_size) * sizeof(int)); seeds[0]=0; // none yet Search(cpoints, cpoints_size, point, seeds); // seeds: seeds[0] = number of seeds, // seeds[1...] = ('cpoints'-) numbers of Eps-Points if(seeds[0] < MinPts) // no core point - seeds[0] // contains number of points in Eps { cpoints[point][3] = -1.0; // no core point free(seeds); return true;} else // core point { while(a < seeds[0]) { a++; point = seeds[a]; if (cpoints[point][3] < 1.0) // no core point Search(cpoints, cpoints_size, point, seeds); } // end while for(a=1; aResultStorage( s ); // Query Processor provided Points //instance for the result // copy x/y from cluster array back into result (only cluster members) ((Points*)result.addr)->Clear(); ((Points*)result.addr)->StartBulkLoad(); for(int a=0; a < ps->Size(); a++) if(cpoints[a][2] > 0) // cluster member { Point p(true, cpoints[a][0], cpoints[a][1]); //((Points*)result.addr)->InsertPt(p); (*((Points*)result.addr)) += p; } // end if / end for ((Points*)result.addr)->EndBulkLoad(); // clean up, go home free(cpoints); return; } /* 10.7 Function Search This function searches for all points in the 'Eps'-area of each given point and returns these. This function has so far been implemented only as a SLOW each-by-each search. Alternative methods (R[*]-Tree, ...) should be implemented. */ void DBscan::Search(double** cpoints,int cpoints_size, int point, int* seeds){ // return EPS-environment of point in seeds // ... 
could be implemented as an EFFICIENT r*-tree int a; //int b = seeds[0]+1; int c; int seedcounter = 0; bool check = true; double min1, min2, dist; for(a=0; a < cpoints_size; a++){ min1 = (double)cpoints[point][0]-(double)cpoints[a][0]; min2 = (double)cpoints[point][1]-(double)cpoints[a][1]; dist = sqrt(pow(min1, 2.0) + pow(min2, 2.0)); if(dist <= (double)Eps && point != a) { check = true; for(c=1; c < seeds[0]+1 && check == true; c++) { if(seeds[c] == a) check = false; // don't put the same point into seeds more than once else check = true; } // end for if(check) { // add a (in Eps) ... seed not yet included seeds[0]++; seeds[seeds[0]] = a; } // end if(check... seedcounter++; // used for core-point classification } // end if(dist ... } // end for if (seedcounter > Eps) cpoints[point][3] = 1.0; // core-point classification return; } /* 8.1 Creating the cluster algebra */ class ClusterAlgebra : public Algebra { public: ClusterAlgebra() : Algebra() { AddOperator ( &cluster_a ); AddOperator ( &cluster_b ); AddOperator ( &cluster_c ); AddOperator ( &cluster_g ); AddOperator ( &cluster_d ); AddOperator ( &cluster_e ); AddOperator ( &cluster_f ); ///// tracefile ///// // if ( RTFlag::isActive("ClusterText:Trace") ) { // cmsg.file() << "Cluster: Constructor " << endl; // cmsg.send(); // } ///// tracefile end ///// } ~ClusterAlgebra() {}; }; } // end of namespace clusteralg /* 9.1 Initialization (Standard) Each algebra module needs an initialization function. The algebra manager has a reference to this function if this algebra is included in the list of required algebras, thus forcing the linker to include this module. The algebra manager invokes this function to get a reference to the instance of the algebra class and to provide references to the global nested list container (used to store constructor, type, operator and object information) and to the query processor. The function has a C interface to make it possible to load the algebra dynamically at runtime. 
*/
extern "C"
Algebra*
InitializeClusterAlgebra( NestedList* nlRef, QueryProcessor* qpRef )
{
  // make the given nested list container and query processor available
  // through the global pointers of this module
  qp = qpRef;
  nl = nlRef;

  // a new algebra instance is handed over to the caller
  Algebra* algebra = new clusteralg::ClusterAlgebra();
  return algebra;
}