/* ---- This file is part of SECONDO. Copyright (C) 2004, University in Hagen, Department of Computer Science, Database Systems for New Applications. SECONDO is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. SECONDO is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with SECONDO; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA ---- //paragraph [1] Title: [{\Large \bf \begin {center}] [\end {center}}] //[TOC] [\tableofcontents] Started November 2019, Fabio Vald\'{e}s */ #include "Algorithms.h" namespace stj { struct AggEntry { AggEntry(); AggEntry(const TupleId id, const temporalalgebra::SecInterval& iv, Rect& rect, const unsigned int noTuples); ~AggEntry() {} void clear(); void deletePeriods(); ListExpr toListExpr(); unsigned int getNoOccs(const TupleId& id) const; datetime::DateTime getDuration() const {return duration;} void computeCommonTimeInterval(const std::set& commonTupleIds, temporalalgebra::SecInterval& iv); void computeCommonRect(const temporalalgebra::SecInterval& iv, const std::set& commonTupleIds, Geoid *geoid, Rect &rect); void computeSemanticTimeSpec(const std::set& commonTupleIds, std::string& semanticTimeSpec) const; void sequentialJoin(AggEntry& entry1, AggEntry& entry2); std::string print(const TupleId& id = 0) const; std::string print(const Rect& rect) const; std::vector > occs; std::vector occsPos; // maps tuple id to pos in occs vector unsigned int noOccs; // over all tuples datetime::DateTime duration; // over all tuples }; /* Comparison function; sort by * number of tuples with at least one occurrence (support * noTuples) * total duration * total number of occurrences (>= support * noTuples) * lexicographical order of labels */ // struct compareEntries { // bool operator()(NewPair const& l, // NewPair const& r) const { // if (l.second.occurrences.size() != r.second.occurrences.size()) { // return l.second.occurrences.size() > r.second.occurrences.size(); // } // if (l.second.duration != r.second.duration) { // return l.second.duration > r.second.duration; // } // if (left.second.noOccurrences != r.second.noOccurrences) { // return l.second.noOccurrences > r.second.noOccurrences; // } // return l.first < r.first; // } // }; struct comparePMResults { bool operator()(NewPair res1, NewPair res2) { if (res1.second == res2.second) { if (res1.first.length() == res2.first.length()) { return res1.first < res2.first; } return (res1.first.length() < res2.first.length()); } return (res1.second < res2.second); // ascending order } }; struct compareLabelsWithSupp { bool operator()(NewPair lws1, NewPair lws2) const { if (lws1.second == lws2.second) { return (lws1.first < lws2.first); } return (lws1.second > lws2.second); // descending order } }; struct compareLabelSeqs { bool operator()(std::vector seq1, std::vector seq2) const { if (seq1.size() == seq2.size()) { for (unsigned int i = 0; i < seq1.size(); i++) { if (seq1[i] != seq2[i]) { return seq1[i] < seq2[i]; } } return false; } return seq1.size() < seq2.size(); } }; extern TypeConstructor fptreeTC; extern TypeConstructor projecteddbTC; extern TypeConstructor verticaldbTC; extern TypeConstructor splSemTrajTC; /* The original mlabel objects are transformed into a map from a label onto a set of tuple ids (one for every tuple containing the label) together with a time period (the time period in which the label occurs for this tuple), the number of occurrences inside the tuple, and the total duration of its occurrences. */ struct RelAgg { RelAgg(); RelAgg(RelAgg *ra); ~RelAgg() {clearEntries();} void clear(); void clearEntries(); ListExpr entriesToListExpr(); static bool saveToRecord(RelAgg *agg, SmiRecord& valueRecord, size_t& offset); static bool readFromRecord(RelAgg *agg,SmiRecord& valueRecord,size_t& offset); void getLabelSeqFromMLabel(MLabel *ml, std::vector& result); void insertLabelAndBbox(const std::string& label, const TupleId& id, const temporalalgebra::SecInterval& iv, Rect& rect); void scanRelation(Relation *rel, const NewPair attrPos, Geoid *g); void filter(const double ms, const size_t memSize); bool buildAtom(unsigned int label, AggEntry entry, const std::set& commonTupleIds, std::string& atom); void subsetperm(std::vector source, int left, int index, std::vector& labelVec, std::set >& result); void subset(std::vector source, int left, int index, std::vector& labelVec, std::set >& result); void retrieveLabelCombs(const unsigned int size, std::vector& source, std::set >& result); void retrieveLabelSubsets(const unsigned int size, std::vector& source, std::set >& result); double getSupp(unsigned int label); bool canLabelsBeFrequent(std::vector& labelSeq, std::set& intersection); double sequenceSupp(std::vector labelSeq, std::set intersection); void combineApriori(std::set >& frequentLabelCombs, std::set >& labelCombs); void retrievePermutations(std::vector& labelComb, std::set >& labelPerms); void derivePatterns(const int mina, const int maxa); unsigned long long int computeEntriesSize() const; unsigned long long int computeFreqLabelsSize() const; void combineEntries(unsigned int endOfPrefix, RelAgg *ra, unsigned int label, unsigned int minSuppCnt); std::string print(const std::map& contents) const; std::string print(const std::map >& frequentLabels) const; std::string print(const std::set >& labelCombs) const; std::string print(const unsigned int label = UINT_MAX); template std::string print(const std::vector& anyVec) const { std::stringstream result; bool first = true; result << "<"; for (auto it : anyVec) { if (!first) { result << ", "; } first = false; result << it; } result << ">"; return result.str(); } template std::string print(const std::set& anySet) const { std::stringstream result; result << "{"; bool first = true; for (auto it : anySet) { if (!first) { result << ", "; } first = false; result << it; } result << "}"; return result.str(); } unsigned int noTuples, minNoAtoms, maxNoAtoms; std::map entriesMap; // only for initial insertions std::vector entries; std::vector freqLabels; // id for each frequent label std::map labelPos; // label --> pos in freqLabels std::vector > results; double minSupp; Geoid *geoid; Relation *rel; NewPair attrPos; // textual, spatial std::vector, compareLabelSeqs> > checkedSeqs, freqSets, nonfreqSets; // only for fp,pos represents k, avoids repeated supp computations and results }; struct FPNode { FPNode() {} FPNode(const unsigned int l, const unsigned int f, const unsigned int a) : label(l), frequency(f), nodeLink(0), ancestor(a) {} FPNode(const unsigned int l, const unsigned f, const std::vector& c, const unsigned int nl, const unsigned int a) : label(l), frequency(f), children(c), nodeLink(nl), ancestor(a) {} ListExpr toListExpr(std::vector& freqLabels) const; unsigned int label, frequency; std::vector children; // positions of all children unsigned int nodeLink; // position of successor in node link; 0 means no link unsigned int ancestor; // pos of ancestor node }; class FPTree { public: FPTree() {} FPTree(FPTree *tree) : minSupp(tree->minSupp), nodes(tree->nodes), nodeLinks(tree->nodeLinks) {} ~FPTree() {} void clear() {nodes.clear(); nodeLinks.clear();} bool hasNodes() {return !nodes.empty();} bool hasNodeLinks() {return !nodeLinks.empty();} bool hasAggEntries() {return !agg->entries.empty();} unsigned int getNoNodes() {return nodes.size();} unsigned int getNoNodeLinks() {return nodeLinks.size();} bool isChildOf(unsigned int label, unsigned int pos, unsigned int& nextPos); void updateNodeLink(unsigned int label, unsigned int targetPos); void insertLabelVector(const std::vector& labelsOrdered, const unsigned int freq); void construct(); void initialize(const double ms, RelAgg *ra); bool isOnePathTree(); void sortNodeLinks(std::vector& result); void collectPatternsFromSeq(std::vector& labelSeq, const unsigned int minNoAtoms, const unsigned int maxNoAtoms); void computeCondPatternBase(std::vector& labelSeq, std::vector, unsigned int> >& result); FPTree* constructCondTree( std::vector, unsigned int> >& condPB); void mineTree(std::vector& initLabels, const unsigned int minNoAtoms, const unsigned int maxNoAtoms); void retrievePatterns(const unsigned int minNoAtoms, const unsigned int maxNoAtoms); unsigned long long int computeNodesSize() const; unsigned long long int computeNodeLinksSize() const; static const std::string BasicType() {return "fptree";} static ListExpr Property(); static Word In(const ListExpr typeInfo, const ListExpr instance, const int errorPos, ListExpr& errorInfo, bool& correct); ListExpr getNodeLinksList(unsigned int label); static ListExpr Out(ListExpr typeInfo, Word value); static Word Create(const ListExpr typeInfo); static void Delete(const ListExpr typeInfo, Word& w); static bool Save(SmiRecord& valueRecord, size_t& offset, const ListExpr typeInfo, Word& value); static bool Open(SmiRecord& valueRecord, size_t& offset, const ListExpr typeInfo, Word& value); static void Close(const ListExpr typeInfo, Word& w); static Word Clone(const ListExpr typeInfo, const Word& w); static int SizeOfObj(); static bool TypeCheck(ListExpr type, ListExpr& errorInfo); static bool checkType(ListExpr t) {return listutils::isSymbol(t,BasicType());} double minSupp; unsigned int minSuppCnt; // \ceil{noTuples * minSupp} std::vector nodes; // nodes[0] represents root std::map nodeLinks; // pointer to 1st node of link // std::vector freqLabels; // id for each frequent label RelAgg *agg; }; struct GetPatternsLI { GetPatternsLI(Relation *r, const NewPair ap, double ms, int mina, int maxa, Geoid *g, const size_t mem); ~GetPatternsLI(); static TupleType *getTupleType(); static Tuple *getNextResult(RelAgg& agg, TupleType *tt); TupleType *tupleType; RelAgg agg; }; struct MineFPTreeLI { MineFPTreeLI(FPTree *t, int mina, int maxa); ~MineFPTreeLI(); Tuple* getNextResult(); FPTree *tree; unsigned int minNoAtoms, maxNoAtoms; TupleType *tupleType; }; struct MatrixSq { MatrixSq(unsigned int s = 0) : size(s) { init(size); } ~MatrixSq() { clear(); } void clear() { if (size > 0) { delete[] values; } } void init(unsigned int s) { size = s; if (size > 0) { values = new unsigned int[size * size]; memset(values, 0, size * size * sizeof(unsigned int)); } } unsigned int operator() (unsigned int i, unsigned int j) { if (i >= size || j >= size) { return UINT_MAX; } return values[i * size + j]; } unsigned int* operator[] (unsigned int i) { if (i >= size) { return 0; } return values + i * size; } void increment(unsigned int i, unsigned int j) { if (i >= size || j >= size) { return; } values[i * size + j]++; } void set(unsigned int i, unsigned int j, unsigned int value) { if (i >= size || j >= size) { return; } values[i * size + j] = value; } std::string print() { std::stringstream result; for (unsigned int i = 0; i < size; i++) { for (unsigned int j = 0; j < size; j++) { result << values[i * size + j] << " "; } result << endl; } return result.str(); } unsigned int size; unsigned int *values; }; class ProjectedDB { public: ProjectedDB() {} ProjectedDB(ProjectedDB *pdb) : minSupp(pdb->minSupp) {} ProjectedDB(double ms, unsigned int msc, RelAgg *ra, unsigned int projSize) : minSupp(ms), minSuppCnt(msc), agg(ra) { projections.resize(projSize); } ~ProjectedDB() {} void clear(); void initialize(const double ms, RelAgg *ra); void addProjections(std::vector& labelSeq, unsigned int label, const std::vector& prefix = std::vector()); void computeSMatrix(std::vector& freqLabels, std::vector >& fPos); void construct(); void minePDB(std::vector& prefix, std::string& patPrefix, unsigned int pos, const unsigned int minNoAtoms, const unsigned int maxNoAtoms); void minePDBSMatrix(std::vector& prefix, std::string& patPrefix, unsigned int minNoAtoms, unsigned int maxNoAtoms); void retrievePatterns(const unsigned int minNoAtoms, const unsigned int maxNoAtoms); unsigned long long int computeProjSize() const; unsigned long long int computeMatrixSize() const; unsigned long long int computeFreqLabelPosSize() const; unsigned long long int computeFreqMatrixPosSize() const; static ListExpr seqToListExpr(std::vector& seq); static ListExpr projToListExpr(std::vector >& proj); static const std::string BasicType() {return "projecteddb";} static ListExpr Property(); static Word In(const ListExpr typeInfo, const ListExpr instance, const int errorPos, ListExpr& errorInfo, bool& correct); static ListExpr Out(ListExpr typeInfo, Word value); static Word Create(const ListExpr typeInfo); static void Delete(const ListExpr typeInfo, Word& w); static bool Save(SmiRecord& valueRecord, size_t& offset, const ListExpr typeInfo, Word& value); static bool Open(SmiRecord& valueRecord, size_t& offset, const ListExpr typeInfo, Word& value); static void Close(const ListExpr typeInfo, Word& w); static Word Clone(const ListExpr typeInfo, const Word& w); static int SizeOfObj(); static bool TypeCheck(ListExpr type, ListExpr& errorInfo); static bool checkType(ListExpr t) {return listutils::isSymbol(t,BasicType());} double minSupp; unsigned int minSuppCnt; // \ceil{noTuples * minSupp} RelAgg *agg; std::vector > > projections; // std::vector projPos; // non-empty positions inside projs MatrixSq smatrix; std::vector freqLabelPos; // maps matrix pos to freqLabel std::vector > freqMatrixPos; }; struct PrefixSpanLI { PrefixSpanLI(ProjectedDB *db, int mina, int maxa); ~PrefixSpanLI(); Tuple* getNextResult(); ProjectedDB *pdb; unsigned int minNoAtoms, maxNoAtoms; TupleType *tupleType; }; class VerticalDB { public: VerticalDB() {} VerticalDB(VerticalDB *vdb) : minSupp(vdb->minSupp) {} VerticalDB(double ms, unsigned int msc, RelAgg *ra) : minSupp(ms), minSuppCnt(msc), agg(ra) {} ~VerticalDB() {} void clear() {} void initialize(const double ms, RelAgg *ra); void construct(); void mineVerticalDB(std::vector& prefix, std::string& patPrefix, RelAgg *ra, const unsigned int minNoAtoms, const unsigned int maxNoAtoms); void retrievePatterns(const unsigned int minNoAtoms, const unsigned int maxNoAtoms); static const std::string BasicType() {return "verticaldb";} static ListExpr Property(); static Word In(const ListExpr typeInfo, const ListExpr instance, const int errorPos, ListExpr& errorInfo, bool& correct); static ListExpr Out(ListExpr typeInfo, Word value); static Word Create(const ListExpr typeInfo); static void Delete(const ListExpr typeInfo, Word& w); static bool Save(SmiRecord& valueRecord, size_t& offset, const ListExpr typeInfo, Word& value); static bool Open(SmiRecord& valueRecord, size_t& offset, const ListExpr typeInfo, Word& value); static void Close(const ListExpr typeInfo, Word& w); static Word Clone(const ListExpr typeInfo, const Word& w); static int SizeOfObj(); static bool TypeCheck(ListExpr type, ListExpr& errorInfo); static bool checkType(ListExpr t) {return listutils::isSymbol(t,BasicType());} double minSupp; unsigned int minSuppCnt; // \ceil{noTuples * minSupp} RelAgg *agg; }; struct SpadeLI { SpadeLI(VerticalDB *db, int mina, int maxa); ~SpadeLI(); Tuple* getNextResult(); VerticalDB *vdb; unsigned int minNoAtoms, maxNoAtoms; TupleType *tupleType; }; /* Used for operator ~splitter~ */ struct SplPlace { SplPlace() {} SplPlace(const Point& l, const std::string& c) : x(l.GetX()), y(l.GetY()) { assert(l.IsDefined()); strcpy(cat, c.c_str()); } ~SplPlace() {} bool operator==(const SplPlace& p) const { return AlmostEqual(x, p.x) && AlmostEqual(y, p.y) && cat == p.cat; } bool almostEqual(SplPlace& p, const double eps, const Geoid* geoid = 0) const{ Point p1(true, x, y), p2(true, p.x, p.y); std::string c1(cat), c2(p.cat); return (c1 == c2 && p1.Distance(p2, geoid) <= eps); } std::string toString() const { std::stringstream str; Point loc(true, x, y); str << "(" << loc << ", " << cat << ")"; return str.str(); } double x, y; char cat[48]; }; struct SplTSPlace : SplPlace { SplTSPlace() : SplPlace() {} SplTSPlace(const Instant& t, const Point& l, const std::string& c) : SplPlace(l, c) { assert(t.IsDefined()); instDbl = t.ToDouble(); } SplTSPlace(const SplTSPlace& src) { instDbl = src.instDbl; x = src.x; y = src.y; strcpy(cat, src.cat); } ~SplTSPlace() {} SplTSPlace operator=(const SplTSPlace& p) { SplTSPlace res(p); return res; } bool operator<(SplTSPlace& p) { return instDbl < p.instDbl; } std::string toString() const { std::stringstream str; Instant inst(instDbl); Point loc(true, x, y); str << "(" << inst << ", " << loc << ", " << cat << ")"; return str.str(); } bool fromList(ListExpr src) { if (!nl->HasLength(src, 3)) { return false; } Instant inst; if (!inst.ReadFrom(nl->First(src), false)) { return false; } instDbl = inst.ToDouble(); if (!nl->HasLength(nl->Second(src), 2)) { return false; } if (!listutils::isNumeric(nl->First(nl->Second(src))) || !listutils::isNumeric(nl->Second(nl->Second(src)))) { return false; } x = listutils::getNumValue(nl->First(nl->Second(src))); y = listutils::getNumValue(nl->Second(nl->Second(src))); if (!nl->IsAtom(nl->Third(src))) { return false; } if (nl->AtomType(nl->Third(src)) != StringType) { return false; } if (listutils::isSymbolUndefined(nl->Third(src))) { std::string undef("undefined"); strcpy(cat, undef.c_str()); } else { std::string strvalue = nl->StringValue(nl->Third(src)); strcpy(cat, strvalue.c_str()); } return true; } ListExpr toListExpr() const { std::string catstr(cat); catstr = catstr.substr(0, 48); Instant inst(instDbl); Point loc(true, x, y); return nl->ThreeElemList(inst.ToListExpr(false), nl->TwoElemList(nl->RealAtom(x), nl->RealAtom(y)), nl->StringAtom(catstr)); } double instDbl; }; struct SplGSequence { SplGSequence(const datetime::DateTime& dur) { deltaT = dur.ToDouble(); } ~SplGSequence() {} int size() const { return placeSeq.size(); } SplPlace get(const int i) const { assert (i >= 0 && (unsigned int)i < placeSeq.size()); return placeSeq[i]; } void append(const SplPlace& sp) { placeSeq.push_back(sp); } bool overlapsWith(const SplGSequence& s, const double tolerance, const Geoid* geoid = 0) const { if (size() != s.size()) { return false; } Point p1(true), p2(true); for (int i = 0; i < size(); i++) { p1.Set(get(i).x, get(i).y); p2.Set(s.get(i).x, s.get(i).y); if (p1.Distance(p2, geoid) > tolerance) { return false; } } return true; } double deltaT; // maximum transition time std::vector placeSeq; // consider only groups of size 1 }; class SplSemTraj : public Attribute { public: SplSemTraj() {} SplSemTraj(const int dummy) : Attribute(true), tsPlaces(1) {} SplSemTraj(const datetime::DateTime& t, const Point& l, const std::string& c) : Attribute(true), tsPlaces(1) { SplTSPlace p(t, l, c); this->clear(); this->append(p); } SplSemTraj(const SplSemTraj& sst) : Attribute(sst.IsDefined()), tsPlaces(sst.size()) { tsPlaces.copyFrom(sst.getTSPlaces()); } SplSemTraj& operator=(const SplSemTraj& src) { SetDefined(src.IsDefined()); tsPlaces.copyFrom(src.getTSPlaces()); return *this; } ~SplSemTraj() {} static const std::string BasicType() {return "splsemtraj";} static ListExpr Property(); bool ReadFrom(ListExpr LE, const ListExpr typeInfo); ListExpr ToListExpr(ListExpr typeInfo) const; int Compare(const Attribute* arg) const; bool Adjacent(const Attribute* arg) const; Attribute* Clone() const; size_t HashValue() const; void CopyFrom(const Attribute* arg); int NumOfFLOBs() const {return 1;} Flob* GetFLOB(const int i) {assert(i == 0); return &tsPlaces;} size_t Sizeof() const; static bool checkType(ListExpr t) {return listutils::isSymbol(t,BasicType());} static bool CheckKind(ListExpr type, ListExpr& errorInfo); void clear() { tsPlaces.clean(); } void append(const SplTSPlace& p) { tsPlaces.Append(p); } int size() const { return tsPlaces.Size(); } DbArray getTSPlaces() const { return tsPlaces; } bool isEmpty() const { return size() == 0; } SplTSPlace get(const int i) const { assert(i >= 0 && i < size()); SplTSPlace tsp; tsPlaces.Get(i, tsp); return tsp; } Instant firstInst() const { datetime::DateTime result(datetime::instanttype); result.SetDefined(false); if (!IsDefined()) { return result; } if (isEmpty()) { return result; } result.ReadFrom(get(0).instDbl); return result; } std::string toString() const { std::stringstream str; str << "( "; SplTSPlace tsp; for (int i = 0; i < size(); i++) { tsPlaces.Get(i, tsp); str << " " << tsp.toString(); if (i < size() - 1) { str << endl; } } str << ")" << endl; return str.str(); } void sort(); SplTSPlace last() const { assert(!isEmpty()); return get(size() - 1); } SplTSPlace first() const { assert(!isEmpty()); return get(0); } int find(const SplPlace& sp, const double tolerance,const Geoid* geoid) const; std::set getPositions(std::string label) const; void convertFromMPointMLabel(const temporalalgebra::MPoint& mp, const MLabel& ml, const double tolerance, const Geoid* geoid = 0); bool contains(const SplPlace& sp, const double deltaT, const double tolerance, const Geoid* geoid = 0) const; SplSemTraj postfix(const int pos) const; void addPostfixes(SplPlace sp, const double eps, const Geoid* geoid, std::vector& result) const; private: DbArray tsPlaces; }; struct SplPlaceSorter { SplPlaceSorter(double e, Geoid* g = 0) : eps(e), geoid(g) {} bool operator()(const SplPlace sp1, const SplPlace sp2) const { Point p1(true, sp1.x, sp1.y), p2(true, sp2.x, sp2.y); std::string c1(sp1.cat), c2(sp2.cat); // cout << " " << p1 << p2 << ", dist = " << p1.Distance(p2, geoid) << c1 // << " < " << c2 << " ? " << ((c1 < c2) || (c1 == c2 && p1 < p2 && // p1.Distance(p2, geoid) >= eps) ? "TRUE" : "FALSE") << endl; return (c1 < c2) || (c1 == c2 && p1 < p2 && p1.Distance(p2, geoid) >= eps); } double eps; Geoid* geoid; }; /* Class ~Splitter~, used for operator ~splitter~ */ class Splitter { public: Splitter(Word& s, const double sm, datetime::DateTime& mtt, const int mna, const double e, Geoid* g, const int attrNo); ~Splitter(); static TupleType* getTupleType(); void initialProjection(Word& s, const double sm, const int attrNo); void prefixSpan(SplSemTraj& prefix, std::vector pf); std::string postfixesToString(SplSemTraj pref, std::vector& pf); std::string freqItemsToString(std::map, SplPlaceSorter>& freqItems); void computeFrequentItems(Word& s, const int attrNo, const double sm, std::map, SplPlaceSorter >& freqItems); void computeLocalFreqItems(SplTSPlace tsp, std::vector pf, std::map, SplPlaceSorter >& freqItems); void addSnippets(SplSemTraj sst, const int freq); Tuple* next(); private: std::vector source; std::vector > result; unsigned int freqmin, pos; datetime::DateTime deltaT; int maxNoAtoms; double eps; Geoid *geoid; TupleType* tupleType; }; }