/*
----
This file is part of SECONDO.

Copyright (C) 2004, University in Hagen, Department of Computer Science,
Database Systems for New Applications.

SECONDO is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

SECONDO is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with SECONDO; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
----

//paragraph [1] Title: [{\Large \bf \begin {center}] [\end {center}}]
//[TOC] [\tableofcontents]

Started November 2019, Fabio Vald\'{e}s

*/

#include "PatternMining.h"
// NOTE(review): the target of this include was lost; <chrono> is implied by
// the "using namespace std::chrono" directive below -- confirm.
#include <chrono>

using namespace std;
using namespace datetime;
using namespace temporalalgebra;
using namespace std::chrono;

namespace stj {

/*
Class ~AggEntry~, default constructor

Creates an entry without occurrences, with an occurrence counter of 0 and a
duration of 0.

*/
AggEntry::AggEntry() {
  occs.clear();
  occsPos.clear();
  noOccs = 0;
  duration.SetType(datetime::durationtype);
  duration.ReadFrom((int64_t)0);
}

/*
Class ~AggEntry~, constructor for a first occurrence

Creates an entry holding exactly one occurrence: tuple ~id~ with the time
interval ~iv~ and the bounding box ~rect~. ~occsPos~ gets ~noTuples~ + 1 slots,
all marked as unused (UINT\_MAX) except the one for ~id~. The caller has to
make sure that ~id~ does not exceed ~noTuples~.

*/
AggEntry::AggEntry(const TupleId id,
                   const temporalalgebra::SecInterval& iv, Rect& rect,
                   const unsigned int noTuples) {
  occs.clear();
  occsPos.clear();
  occsPos.resize(noTuples + 1, UINT_MAX);
  Periods *per = new Periods(1);
  per->Add(iv);
  occsPos[id] = occs.size();
  occs.push_back(make_tuple(id, per, rect));
  noOccs = 0;
  duration.SetType(datetime::durationtype);
  duration.ReadFrom((int64_t)0); // durations are computed at the end
}

/*
Class ~AggEntry~, Function ~clear~

Releases the periods of all occurrences and resets the entry to the state
produced by the default constructor.

*/
void AggEntry::clear() {
  for (auto& occ : occs) { // by reference: no copy of the tuple is needed
    get<1>(occ)->DeleteIfAllowed();
  }
  occs.clear();
  occsPos.clear();
  noOccs = 0;
  duration.SetType(datetime::durationtype);
  duration.ReadFrom((int64_t)0);
}

/*
Class ~AggEntry~, Function ~deletePeriods~

Releases the periods of all occurrences but leaves ~occs~ itself untouched,
i.e., the stored pointers must not be used afterwards.

*/
void AggEntry::deletePeriods() {
  for (auto& occ : occs) {
    get<1>(occ)->DeleteIfAllowed();
  }
}

/*
Class ~AggEntry~, Function ~toListExpr~

Returns the nested list (noOccs duration ((id periods rect) ...)).

*/
ListExpr
AggEntry::toListExpr() { ListExpr occsList(nl->Empty()), occList(nl->Empty()); Word perWord, rectWord; TupleId id; if (occs.size() >= 1) { id = get<0>(occs[0]); perWord.addr = get<1>(occs[0]); rectWord.addr = &(get<2>(occs[0])); occsList = nl->OneElemList(nl->ThreeElemList(nl->IntAtom(id), OutRange(nl->Empty(), perWord), OutRectangle<2>(nl->Empty(), rectWord))); occList = occsList; } for (unsigned int i = 1; i < occs.size(); i++) { id = get<0>(occs[i]); perWord.addr = get<1>(occs[i]); rectWord.addr = &(get<2>(occs[i])); occList = nl->Append(occList, nl->ThreeElemList(nl->IntAtom(id), OutRange(nl->Empty(), perWord), OutRectangle<2>(nl->Empty(), rectWord))); } return nl->ThreeElemList(nl->IntAtom(noOccs), duration.ToListExpr(false), occsList); } unsigned int AggEntry::getNoOccs(const TupleId& id) const { return get<1>(occs[occsPos[id]])->GetNoComponents(); } void AggEntry::computeCommonTimeInterval(const set& commonTupleIds, SecInterval& iv) { iv.start.SetType(instanttype); iv.end.SetType(instanttype); iv.start.ToMaximum(); iv.end.ToMinimum(); Instant first(1.0), last(1.0); if (commonTupleIds.empty()) { // use all occurrences for (auto it : occs) { if (get<1>(it)->IsDefined()) { if (!get<1>(it)->IsEmpty()) { get<1>(it)->Minimum(first); get<1>(it)->Maximum(last); if (first < iv.start) { iv.start = first; } if (last > iv.end) { iv.end = last; } } } } } else { // use only entries occurring in the set for (auto it : commonTupleIds) { if (occsPos[it] != UINT_MAX) { Periods* per = get<1>(occs[occsPos[it]]); if (per->IsDefined()) { if (!per->IsEmpty()) { per->Minimum(first); per->Maximum(last); if (first < iv.start) { iv.start = first; } if (last > iv.end) { iv.end = last; } } } } } } if (iv.start.IsMinimum() || iv.end.IsMaximum()) { iv.SetDefined(false); } } void AggEntry::computeCommonRect(const SecInterval& iv, const set& commonTupleIds, Geoid *geoid, Rect &rect) { if (commonTupleIds.empty()) { // use all occurrences (for 1-patterns) if (occs.empty()) { return; } if 
(!get<2>(occs[0]).IsDefined()) { rect.SetDefined(false); return; } Rect tempRect = get<2>(occs[0]); for (unsigned int i = 1; i < occs.size(); i++) { if (!get<2>(occs[i]).IsDefined()) { rect.SetDefined(false); return; } tempRect = get<2>(occs[i]).Union(tempRect); } rect = tempRect; } else { set::iterator it = commonTupleIds.begin(); if (!get<2>(occs[occsPos[*it]]).IsDefined()) { rect.SetDefined(false); return; } Rect tempRect = get<2>(occs[occsPos[*it]]); while (it != commonTupleIds.end()) { if (!get<2>(occs[occsPos[*it]]).IsDefined()) { rect.SetDefined(false); return; } tempRect = get<2>(occs[occsPos[*it]]).Union(tempRect); it++; } rect = tempRect; } } void AggEntry::computeSemanticTimeSpec(const set& commonTupleIds, string& semanticTimeSpec) const { semanticTimeSpec.clear(); int month = 1; // {1, ..., 12} int weekday = 0; // {0, ..., 6} int daytime = 0; // {0, ..., 3} Instant first(1.0), last(1.0); Periods *per = 0; for (set::const_iterator it = commonTupleIds.begin(); it != commonTupleIds.end(); it++) { per = get<1>(occs[occsPos[*it]]); if (!per->IsDefined()) { semanticTimeSpec = ""; return; } if (per->IsEmpty()) { semanticTimeSpec = ""; return; } per->Minimum(first); per->Maximum(last); if (it == commonTupleIds.begin() && first.GetMonth() == last.GetMonth()) { month = first.GetMonth(); } else if (month != first.GetMonth() || first.GetMonth() != last.GetMonth()) { month = -1; } if (it == commonTupleIds.begin() && first.GetWeekday() == last.GetWeekday()) { weekday = first.GetWeekday(); } else if (weekday != first.GetWeekday() || first.GetWeekday() != last.GetWeekday()) { weekday = -1; } if (it == commonTupleIds.begin() && Tools::getDaytime(first.GetHour()) == Tools::getDaytime(last.GetHour())) { daytime = Tools::getDaytime(first.GetHour()); } else if (daytime != Tools::getDaytime(first.GetHour()) || Tools::getDaytime(first.GetMonth()) != Tools::getDaytime(last.GetMonth())) { daytime = -1; } if (month == -1 && weekday == -1 && daytime == -1) { semanticTimeSpec = 
""; return; } } if (month > -1) { semanticTimeSpec = Tools::getMonthStr(month - 1); } if (weekday > -1) { semanticTimeSpec += (semanticTimeSpec.empty() ? "" : ", ") + Tools::getWeekdayStr(weekday); } if (daytime > -1) { semanticTimeSpec += (semanticTimeSpec.empty() ? "" : ", ") + Tools::getDaytimeStr(daytime); } } std::string AggEntry::print(const TupleId& id /* = 0 */) const { std::stringstream result; if (id == 0) { // print everything for (auto it : occs) { result << " TID " << get<0>(it) << ": " << get<1>(it)->GetNoComponents() << " occs: " << *(get<1>(it)) << get<2>(it) << endl; } } else { if (occsPos[id] == UINT_MAX) { // id not found result << " TID " << id << " not found" << endl; } else { result << " TID " << id << ": " << get<1>(occs[occsPos[id]])->GetNoComponents() << " occs: " << *(get<1>(occs[occsPos[id]])) << get<2>(occs[occsPos[id]]) << endl; } } return result.str(); } std::string AggEntry::print(const Rect& rect) const { std::stringstream result; if (!rect.IsDefined()) { return "_"; } result << "[" << rect.MinD(0) << " " << rect.MaxD(0) << " " << rect.MinD(1) << " " << rect.MaxD(1) << "]"; return result.str(); } /* Class ~RelAgg~, Constructors */ RelAgg::RelAgg() : noTuples(0), minNoAtoms(0), maxNoAtoms(0), minSupp(0.0), geoid(0), rel(0) {} RelAgg::RelAgg(RelAgg *ra) : noTuples(ra->noTuples), minNoAtoms(ra->minNoAtoms), maxNoAtoms(ra->maxNoAtoms), minSupp(ra->minSupp), geoid(ra->geoid), rel(ra->rel) { AggEntry dummy; entries.resize(ra->entries.size(), dummy); freqLabels.resize(ra->freqLabels.size()); labelPos.clear(); checkedSeqs.clear(); freqSets.clear(); nonfreqSets.clear(); } /* Class ~RelAgg~, Function ~clear~ Deletes the periods values */ void RelAgg::clear() { for (auto it : entriesMap) { it.second.clear(); } } void RelAgg::clearEntries() { for (auto it : entries) { it.deletePeriods(); } for (auto it : checkedSeqs) { for (auto it2 : it) { it2.clear(); } it.clear(); } checkedSeqs.clear(); for (auto it : freqSets) { for (auto it2 : it) { 
it2.clear(); } it.clear(); } freqSets.clear(); for (auto it : nonfreqSets) { for (auto it2 : it) { it2.clear(); } it.clear(); } nonfreqSets.clear(); } ListExpr RelAgg::entriesToListExpr() { ListExpr resultList, tempList; if (entries.empty() || labelPos.empty()) { return nl->SymbolAtom("Empty Container"); } auto it = labelPos.begin(); resultList = nl->OneElemList(nl->TwoElemList(nl->SymbolAtom(it->first), entries[it->second].toListExpr())); tempList = resultList; it++; while (it != labelPos.end()) { tempList = nl->Append(tempList, nl->TwoElemList(nl->SymbolAtom(it->first), entries[it->second].toListExpr())); it++; } return resultList; } bool RelAgg::saveToRecord(RelAgg *agg, SmiRecord& valueRecord, size_t& offset) { unsigned int noOccs, tid, labelLength, noComponents; double durD, start, end; string label; SecInterval iv(true); if (!valueRecord.Write(&agg->noTuples, sizeof(unsigned int), offset)) { return false; } offset += sizeof(unsigned int); unsigned long long int entriesSize = agg->computeEntriesSize(); cout << "size of entries is " << entriesSize << endl; if (!valueRecord.Write(&entriesSize, sizeof(unsigned long long int), offset)){ return false; } offset += sizeof(unsigned long long int); unsigned int noAggEntries = agg->entries.size(); if (!valueRecord.Write(&noAggEntries, sizeof(unsigned int), offset)) { return false; } offset += sizeof(unsigned int); char* entriesChars = new char[entriesSize]; size_t offsetEntries = 0; for (unsigned int i = 0; i < noAggEntries; i++) { memcpy(entriesChars + offsetEntries, &agg->entries[i].noOccs, sizeof(unsigned int)); offsetEntries += sizeof(unsigned int); durD = agg->entries[i].duration.ToDouble(); memcpy(entriesChars + offsetEntries, &durD, sizeof(double)); offsetEntries += sizeof(double); noOccs = agg->entries[i].occs.size(); memcpy(entriesChars + offsetEntries, &noOccs, sizeof(unsigned int)); offsetEntries += sizeof(unsigned int); for (auto occ : agg->entries[i].occs) { tid = get<0>(occ); memcpy(entriesChars + 
offsetEntries, &tid, sizeof(unsigned int)); offsetEntries += sizeof(unsigned int); noComponents = get<1>(occ)->GetNoComponents(); memcpy(entriesChars + offsetEntries, &noComponents, sizeof(unsigned int)); offsetEntries += sizeof(unsigned int); for (unsigned int j = 0; j < noComponents; j++) { get<1>(occ)->Get(j, iv); start = iv.start.ToDouble(); end = iv.end.ToDouble(); memcpy(entriesChars + offsetEntries, &start, sizeof(double)); offsetEntries += sizeof(double); memcpy(entriesChars + offsetEntries, &iv.lc, sizeof(bool)); offsetEntries += sizeof(bool); memcpy(entriesChars + offsetEntries, &end, sizeof(double)); offsetEntries += sizeof(double); memcpy(entriesChars + offsetEntries, &iv.rc, sizeof(bool)); offsetEntries += sizeof(bool); } double coords[] = {get<2>(occ).MinD(0), get<2>(occ).MinD(1), get<2>(occ).MaxD(0), get<2>(occ).MaxD(1)}; for (int c = 0; c < 4; c++) { memcpy(entriesChars + offsetEntries, &coords[c], sizeof(double)); offsetEntries += sizeof(double); } bool isdefined = get<2>(occ).IsDefined(); memcpy(entriesChars + offsetEntries, &isdefined, sizeof(bool)); offsetEntries += sizeof(bool); } for (auto occPos : agg->entries[i].occsPos) { tid = occPos; memcpy(entriesChars + offsetEntries, &tid, sizeof(unsigned int)); offsetEntries += sizeof(unsigned int); } } if (!valueRecord.Write(entriesChars, entriesSize, offset)) { return false; } offset += entriesSize; delete[] entriesChars; unsigned long long int freqLabelsSize = agg->computeFreqLabelsSize(); cout << "size of freqLabels is " << freqLabelsSize << endl; if (!valueRecord.Write(&freqLabelsSize, sizeof(unsigned long long int), offset)) { return false; } offset += sizeof(unsigned long long int); char* freqLabelsChars = new char[freqLabelsSize]; size_t offsetFreqLabels = 0; for (unsigned int i = 0; i < noAggEntries; i++) { label = agg->freqLabels[i]; labelLength = label.length(); memcpy(freqLabelsChars + offsetFreqLabels, &labelLength, sizeof(unsigned int)); offsetFreqLabels += sizeof(unsigned int); char 
labelArray[labelLength + 1]; strcpy(labelArray, label.c_str()); memcpy(freqLabelsChars + offsetFreqLabels, &labelArray, labelLength + 1); offsetFreqLabels += labelLength + 1; } if (!valueRecord.Write(freqLabelsChars, freqLabelsSize, offset)) { return false; } offset += freqLabelsSize; delete[] freqLabelsChars; return true; } bool RelAgg::readFromRecord(RelAgg *agg, SmiRecord& valueRecord, size_t& offset) { // auto measureStart = high_resolution_clock::now(); unsigned int noAggEntries, noOccs, noComponents, occPos, labelLength; double durD, start, end; Periods *per; SecInterval iv(true); iv.start.SetType(instanttype); iv.end.SetType(instanttype); // read ~noTuples~ if (!valueRecord.Read(&agg->noTuples, sizeof(unsigned int), offset)) { return false; } offset += sizeof(unsigned int); unsigned long long int entriesSize; if (!valueRecord.Read(&entriesSize, sizeof(unsigned long long int), offset)) { return false; } offset += sizeof(unsigned long long int); // cout << "size of entries is " << entriesSize << endl; char* entriesChars = new char[entriesSize]; // read ~entries~ if (!valueRecord.Read(&noAggEntries, sizeof(unsigned int), offset)) { return false; } offset += sizeof(unsigned int); if (!valueRecord.Read(entriesChars, entriesSize, offset)) { return false; } offset += entriesSize; size_t offsetEntries = 0; for (unsigned int i = 0; i < noAggEntries; i++) { AggEntry entry; memcpy(&entry.noOccs, entriesChars + offsetEntries, sizeof(unsigned int)); offsetEntries += sizeof(unsigned int); // cout << "noOccs = " << entry.noOccs << endl; memcpy(&durD, entriesChars + offsetEntries, sizeof(double)); offsetEntries += sizeof(double); // offsetEntries += sizeof(double); entry.duration.ReadFrom(durD); // cout << "duration is " << entry.duration << endl; memcpy(&noOccs, entriesChars + offsetEntries, sizeof(unsigned int)); offsetEntries += sizeof(unsigned int); TupleId tid; for (unsigned int j = 0; j < noOccs; j++) { memcpy(&tid, entriesChars + offsetEntries, sizeof(unsigned int)); 
offsetEntries += sizeof(unsigned int); memcpy(&noComponents, entriesChars + offsetEntries, sizeof(unsigned int)); offsetEntries += sizeof(unsigned int); per = new Periods(true); for (unsigned int k = 0; k < noComponents; k++) { memcpy(&start, entriesChars + offsetEntries, sizeof(double)); offsetEntries += sizeof(double); iv.start.ReadFrom(start); memcpy(&iv.lc, entriesChars + offsetEntries, sizeof(bool)); offsetEntries += sizeof(bool); memcpy(&end, entriesChars + offsetEntries, sizeof(double)); offsetEntries += sizeof(double); iv.end.ReadFrom(end); memcpy(&iv.rc, entriesChars + offsetEntries, sizeof(bool)); offsetEntries += sizeof(bool); per->MergeAdd(iv); } double *min = new double[2]; for (int c = 0; c < 2; c++) { memcpy(&min[c], entriesChars + offsetEntries, sizeof(double)); offsetEntries += sizeof(double); } double *max = new double[2]; for (int c = 0; c < 2; c++) { memcpy(&max[c], entriesChars + offsetEntries, sizeof(double)); offsetEntries += sizeof(double); } bool isdefined; memcpy(&isdefined, entriesChars + offsetEntries, sizeof(bool)); offsetEntries += sizeof(bool); Rect rect(isdefined, min, max); delete[] min; delete[] max; entry.occs.push_back(make_tuple(tid, per, rect)); } for (unsigned int j = 0; j <= agg->noTuples; j++) { memcpy(&occPos, entriesChars + offsetEntries, sizeof(unsigned int)); offsetEntries += sizeof(unsigned int); entry.occsPos.push_back(occPos); // cout << "... 
pushed back occPos " << occPos << endl; } agg->entries.push_back(entry); } delete[] entriesChars; // read ~freqLabels~ unsigned long long int freqLabelsSize = 0; if (!valueRecord.Read(&freqLabelsSize, sizeof(unsigned long long int), offset)) { return false; } offset += sizeof(unsigned long long int); char* freqLabelsChars = new char[freqLabelsSize]; size_t offsetFreqLabels = 0; if (!valueRecord.Read(freqLabelsChars, freqLabelsSize, offset)) { return false; } offset += freqLabelsSize; for (unsigned int i = 0; i < noAggEntries; i++) { memcpy(&labelLength, freqLabelsChars + offsetFreqLabels, sizeof(unsigned int)); offsetFreqLabels += sizeof(unsigned int); char labelArray[labelLength + 1]; memcpy(&labelArray, freqLabelsChars + offsetFreqLabels, labelLength + 1); offsetFreqLabels += labelLength + 1; string label(labelArray); agg->labelPos.insert(make_pair(label, agg->freqLabels.size())); agg->freqLabels.push_back(label); } delete[] freqLabelsChars; // auto measureStop = high_resolution_clock::now(); // double ms = // (double)(duration_cast(measureStop - measureStart).count()); // cout << "OPEN finished after " << ms << " ms" << endl; return true; } /* Class ~RelAgg~, function ~getLabelSeqFromMLabel~ */ void RelAgg::getLabelSeqFromMLabel(MLabel *ml, vector& result) { string label, lastLabel("undefined"); result.clear(); auto it = labelPos.begin(); for (int j = 0; j < ml->GetNoComponents(); j++) { ml->GetValue(j, label); it = labelPos.find(label); if (it != labelPos.end() && label != lastLabel) { // consider only freq lbs result.push_back(it->second); lastLabel = label; } } } /* Class ~RelAgg~, Function ~insertLabel~ Insert new labels into map structure, update structure for existing labels */ void RelAgg::insertLabelAndBbox(const std::string& label, const TupleId& id, const temporalalgebra::SecInterval& iv, Rect& rect) { // cout << "insert (" << label << ", " << id << ", " << iv << ")" << endl; if (label == "undefined") { return; } auto aggIt = entriesMap.find(label); if 
(aggIt == entriesMap.end()) { // new label AggEntry entry(id, iv, rect, rel->GetNoTuples()); entriesMap.insert(make_pair(label, entry)); } else { // label already present if (aggIt->second.occsPos[id] == UINT_MAX) { // new id for label Periods *per = new Periods(1); per->Add(iv); entriesMap[label].occsPos[id] = entriesMap[label].occs.size(); entriesMap[label].occs.push_back(make_tuple(id, per, rect)); } else { // id already present for label get<1>(entriesMap[label].occs[entriesMap[label].occsPos[id]])-> MergeAdd(iv); if (rect.IsDefined()) { get<2>(entriesMap[label].occs[entriesMap[label].occsPos[id]]) = get<2>(entriesMap[label].occs[entriesMap[label].occsPos[id]]).Union(rect); } } } entriesMap[label].noOccs++; entriesMap[label].duration += iv.end - iv.start; } /* Class ~RelAgg~, Function ~scanRelation~ Scan relation, call ~insert~ for all labels of mlabel attribute */ void RelAgg::scanRelation(Relation *r, const NewPair ap, Geoid *g) { rel = r; attrPos = ap; geoid = g; string label; SecInterval iv(true); noTuples = rel->GetNoTuples(); GenericRelationIterator* it = rel->MakeScan(); MLabel *ml = 0; MPoint *mp = 0; MPoint mpPart(true); Tuple *tuple = 0; Periods per(true); Rect rect(true); while ((tuple = it->GetNextTuple())) { ml = (MLabel*)(tuple->GetAttribute(attrPos.first)); mp = (MPoint*)(tuple->GetAttribute(attrPos.second)); for (int j = 0; j < ml->GetNoComponents(); j++) { ml->GetValue(j, label); ml->GetInterval(j, iv); per.Add(iv); mp->AtPeriods(per, mpPart); rect = mpPart.BoundingBoxSpatial(); per.Clear(); insertLabelAndBbox(label, tuple->GetTupleId(), iv, rect); } tuple->DeleteIfAllowed(); } delete it; } /* Class ~RelAgg~, Function ~filter~ Filter contents; keep only labels with supp >= minSupp */ void RelAgg::filter(const double ms, const size_t memSize) { minSupp = ms; double supp = 1.0; // scan ~entriesMap~; push entries for frequent labels into ~entries~ and // store every label and its entry's position inside ~entries~ in ~inv~ for (auto it : 
entriesMap) { supp = double(it.second.occs.size()) / noTuples; if (supp >= minSupp) { // cout << "INSERTED: \"" << it.first << "\", POS " << entries.size() // << " " << it.second.print() << endl; entries.push_back(it.second); labelPos.insert(make_pair(it.first, freqLabels.size())); freqLabels.push_back(it.first); } else { it.second.deletePeriods(); } } // for (unsigned int i = 0; i < freqLabels.size(); i++) { // cout << "<" << i << " : " << freqLabels[i] << "> "; // } // cout << endl << endl; // for (auto it : labelPos) { // cout << it.first << " |---> " << it.second << " "; // } // cout << endl; } /* Class ~RelAgg~, Function ~buildAtom~ Build a string representing a pattern atom from an entry of ~contents~ and compute its support */ bool RelAgg::buildAtom(unsigned int label, AggEntry entry, const set& commonTupleIds, string& atom) { SecInterval iv(true); string timeSpec, semanticTimeSpec; entry.computeCommonTimeInterval(commonTupleIds, iv); entry.computeSemanticTimeSpec(commonTupleIds, semanticTimeSpec); if (!semanticTimeSpec.empty()) { if (iv.start.IsDefined() && iv.end.IsDefined()) { timeSpec = "{" + iv.start.ToString() + "~" + iv.end.ToString() + ", " + semanticTimeSpec + "}"; } else { timeSpec = "{" + semanticTimeSpec + "}"; } } else { if (iv.start.IsDefined() && iv.end.IsDefined()) { timeSpec = iv.start.ToString() + "~" + iv.end.ToString(); } else { atom.clear(); return false; } } Rect rect(true); entry.computeCommonRect(iv, commonTupleIds, geoid, rect); atom = "(" + timeSpec + " \"" + freqLabels[label] + "\" " + entry.print(rect) + ")"; return true; } void RelAgg::subsetperm(vector source, int left, int index, vector& labelVec, set >& result) { if (left == 0) { do { result.insert(labelVec); } while (std::next_permutation(labelVec.begin(), labelVec.end())); return; } for (unsigned int i = index; i < source.size(); i++) { labelVec.push_back(source[i]); subsetperm(source, left - 1, i + 1, labelVec, result); labelVec.pop_back(); } } void RelAgg::subset(vector 
source, int left, int index, vector& labelVec, set >& result) { if (left == 0) { // if (nonfreqSets[labelVec.size()].find(labelVec) == // nonfreqSets[labelVec.size()].end()) { result.insert(labelVec); // } } for (unsigned int i = index; i < source.size(); i++) { labelVec.push_back(source[i]); subset(source, left - 1, i + 1, labelVec, result); labelVec.pop_back(); } } /* Class ~RelAgg~, Function ~retrieveLabelSets~ Computes all label combinations of size ~size~ for a tuple */ void RelAgg::retrieveLabelCombs(const unsigned int size, vector& source, set >& result) { result.clear(); vector labelVec; subsetperm(source, size, 0, labelVec, result); } /* Class ~RelAgg~, Function ~retrieveLabelSubsets~ */ void RelAgg::retrieveLabelSubsets(const unsigned int size, vector& source, set >& result) { result.clear(); vector labelVec; subset(source, size, 0, labelVec, result); } double RelAgg::getSupp(unsigned int label) { return double(entries[label].occs.size()) / noTuples; } /* Class ~RelAgg~, Function ~canIntersectionBeFrequent~ Computes the fraction of tuples in which all strings of ~labelSeq~ occur */ bool RelAgg::canLabelsBeFrequent(vector& labelSeq, set& intersection) { intersection.clear(); if (labelSeq.size() < 2) { cout << "sequence has only " << labelSeq.size() << " component(s)" << endl; return false; } set intersection_temp; vector > allOccs; allOccs.resize(labelSeq.size()); // retrieve occurrences for every label // cout << "check sequence " << print(labelSeq) << endl; for (unsigned int pos = 0; pos < labelSeq.size(); pos++) { for (auto occ : entries[labelSeq[pos]].occs) { allOccs[pos].insert(get<0>(occ)); } } // compute intersection of all id sets set_intersection(allOccs[0].begin(), allOccs[0].end(), allOccs[1].begin(), allOccs[1].end(), inserter(intersection, intersection.begin())); for (unsigned int pos = 2; pos < labelSeq.size(); pos++) { set_intersection(intersection.begin(), intersection.end(), allOccs[pos].begin(), allOccs[pos].end(), 
inserter(intersection_temp, intersection_temp.begin())); intersection = intersection_temp; intersection_temp.clear(); } // check support of intersection; if it is below ~minSupp~, there is no chance // for a frequent (k+1)-pattern // cout << "support of " << print(intersection) << " equals " // << double(intersection.size()) / noTuples << " ==> " // << (double(intersection.size()) / noTuples >= minSupp) << endl; return (double(intersection.size()) / noTuples >= minSupp); } /* Class ~RelAgg~, Function ~sequenceSupp~ Computes the support for a given sequence of labels */ double RelAgg::sequenceSupp(vector labelSeq, set intersection) { if (labelSeq.empty()) { return 0.0; } Instant start(instanttype), end(instanttype); int noOccurrences = 0; AggEntry *entry; for (auto id : intersection) { // try to find all labels start.ToMinimum(); end.ToMaximum(); bool sequenceFound = true; unsigned int pos = 0; while (sequenceFound && (pos < labelSeq.size())) { entry = &(entries[labelSeq[pos]]); if (entry->occsPos[id] < UINT_MAX) { get<1>(entry->occs[entry->occsPos[id]])->Maximum(end); if (start < end) { // label found, correct order // set start instant to begin of periods for current label get<1>(entry->occs[entry->occsPos[id]])->Minimum(start); } else { // label found, but not in expected order // cout << "WRONG ORDER: id " << id << ", \"" << labelSeq[pos] // << "\" NOT after \"" << labelSeq[pos-1] << "\"" << endl; sequenceFound = false; } } else { // label not found in corresponding tuple // cout << "NOT FOUND: id " << id << ", \"" << labelSeq[pos] << endl; sequenceFound = false; } pos++; } if (sequenceFound) { // cout << "SEQUENCE FOUND: id " << id << ", " << print(labelSeq) << endl; noOccurrences++; } } return double(noOccurrences) / noTuples; } /* Class ~RelAgg~, Function ~combineApriori~ Combine sets of $k$ frequent labels to sets of $k+1$ labels, similarly to Apriori algorithm, e.g. 
{a,b,c} combined with {a,b,d} yields {a,b,c,d}, or {a,b,c} combined with {e,c,b} yields { */ void RelAgg::combineApriori(set >& frequentLabelCombs, set >& labelCombs) { if (frequentLabelCombs.empty()) { return; } if ((frequentLabelCombs.begin())->empty()) { return; } // cout << "Frequent label combs for k = 2:" << endl; // for (auto it : frequentLabelCombs) { // cout << print(it) << endl; // } // cout << endl; unsigned int k = (frequentLabelCombs.begin())->size(); set >::iterator it2; set union_k_inc; for (set >::iterator it1 = frequentLabelCombs.begin(); it1 != frequentLabelCombs.end(); it1++) { it2 = it1; it2++; while (it2 != frequentLabelCombs.end()) { set_union(it1->begin(), it1->end(), it2->begin(), it2->end(), inserter(union_k_inc, union_k_inc.begin())); if (union_k_inc.size() == k+1) { // cout << print(*it1) << " and " << print(*it2) << " united to " // << print(union_k_inc) << endl; vector unionvec(union_k_inc.begin(), union_k_inc.end()); labelCombs.insert(labelCombs.end(), unionvec); } union_k_inc.clear(); it2++; } union_k_inc.clear(); } } void RelAgg::retrievePermutations(vector& labelComb, set >& labelPerms) { labelPerms.clear(); vector labels = labelComb; std::sort(labels.begin(), labels.end()); do { labelPerms.insert(labelPerms.end(), labels); } while (std::next_permutation(labels.begin(), labels.end())); } /* Class ~RelAgg~, Function ~derivePatterns~ Scan sorted representation in order to retrieve patterns */ void RelAgg::derivePatterns(const int mina, const int maxa) { minNoAtoms = mina; maxNoAtoms = maxa; // retrieve patterns with one atom, ~entries~ guaranteed to fulfill minSupp string pattern, atom; set commonTupleIds; double supp = 1.0; for (unsigned int label = 0; label < entries.size(); label++) { buildAtom(label, entries[label], commonTupleIds, atom); if (minNoAtoms == 1) { supp = double(entries[label].occs.size()) / noTuples; results.push_back(NewPair(atom, supp)); } } cout << freqLabels.size() << " frequent 1-patterns found" << endl; // 
retrieve patterns with two atoms if (maxNoAtoms < 2) { return; } set > labelCombs, frequentLabelCombs, labelPerms; SecInterval iv(true); // scan ~contents~; only atoms whose corresponding 1-patterns fulfill // ~minSupp~ can be part of a frequent 2-pattern vector frequentLabels; for (unsigned int label = 0; label < freqLabels.size(); label++) { frequentLabels.push_back(label); } retrieveLabelCombs(2, frequentLabels, labelPerms); // check all combinations for their support bool correct = false; for (auto labelPerm : labelPerms) { if (canLabelsBeFrequent(labelPerm, commonTupleIds)) { supp = sequenceSupp(labelPerm, commonTupleIds); if (supp >= minSupp) { if (minNoAtoms <= 2) { pattern.clear(); // build complete 2-pattern correct = buildAtom(labelPerm[0], entries[labelPerm[0]], commonTupleIds, atom); pattern += atom + " "; correct = correct && buildAtom(labelPerm[1], entries[labelPerm[1]], commonTupleIds, atom); pattern += atom; if (correct) { results.push_back(NewPair(pattern, supp)); } } frequentLabelCombs.insert(frequentLabelCombs.end(), labelPerm); } } } cout << frequentLabelCombs.size() << " frequent 2-patterns found" << endl; // retrieve patterns with three or more atoms unsigned int k = 3; while (k <= maxNoAtoms && !frequentLabelCombs.empty()) { // no frequent k-pat map, set > labelPermsWithCommonIds; labelCombs.clear(); combineApriori(frequentLabelCombs, labelCombs); frequentLabelCombs.clear(); for (auto labelComb : labelCombs) { if (canLabelsBeFrequent(labelComb, commonTupleIds)) { // cout << print(labelComb) << " can be frequent; occurs in " // << print(commonTupleIds) << endl; retrievePermutations(labelComb, labelPerms); for (auto labelPerm : labelPerms) { labelPermsWithCommonIds[labelPerm] = commonTupleIds; } } } for (auto it : labelPermsWithCommonIds) { // cout << print(it.first) << " occurs in " << print(it.second); supp = sequenceSupp(it.first, it.second); // cout << ", supp is " << supp << endl; if (supp >= minSupp) { if (k >= minNoAtoms) { 
pattern.clear(); correct = true; unsigned int pos = 0; while (correct && (pos < it.first.size())) { correct = correct && buildAtom(it.first[pos], entries[it.first[pos]], it.second, atom); pattern += atom + " "; pos++; } if (correct) { results.push_back(NewPair(pattern, supp)); } } frequentLabelCombs.insert(frequentLabelCombs.end(), it.first); // cout << "k = " << k << "; sequence " << print(it.first) // << " inserted" << endl; } } cout << frequentLabelCombs.size() << " frequent " << k << "-patterns found" << endl; k++; } std::sort(results.begin(), results.end(), comparePMResults()); } /* Class ~RelAgg~, Function ~computeEntriesSize~ Compute storage space in bytes for all entries: constant: noOccs, (noTuples + 1) * tid, noOccs, duration in every occ: tid, per->noComponents * (start, lc, end, rc), min(0,1), max(0,1), isdefined */ unsigned long long int RelAgg::computeEntriesSize() const { unsigned long long int constEntrySize = sizeof(unsigned int) + (noTuples + 1) * sizeof(unsigned int) + sizeof(unsigned int) + sizeof(double); unsigned long long int result = entries.size() * constEntrySize; for (auto entry : entries) { for (auto occ : entry.occs) { result += sizeof(unsigned int) + sizeof(unsigned int) + get<1>(occ)->GetNoComponents() * (2 * (sizeof(double) + sizeof(bool))) + 4 * sizeof(double) + sizeof(bool); } } return result; } /* Class ~RelAgg~, Function ~computeFreqLabelsSize~ Compute storage space in bytes for ~freqLabels~: size + size * (wordlength + word) */ unsigned long long int RelAgg::computeFreqLabelsSize() const { unsigned long long int result = 0; for (auto label : freqLabels) { result += label.size() + 1 + sizeof(unsigned int); } return result; } string RelAgg::print(const map& contents) const { stringstream result; for (auto it : contents) { result << "\"" << freqLabels[it.first] << "\" occurs " << it.second.noOccs << " times with a total duration of " << it.second.duration << endl << " " << it.second.print() << endl << 
"-----------------------------------------------" << endl; } return result.str(); } string RelAgg::print(const map >& frequentLabels) const { stringstream result; for (auto it : frequentLabels) { result << "TID " << it.first << ": "; for (auto it2 : it.second) { result << "\"" << freqLabels[it2] << "\" "; } result << endl; } return result.str(); } string RelAgg::print(const set >& labelCombs) const { stringstream result; result << "{" << endl; for (auto it : labelCombs) { result << " " << print(it); } result << "}" << endl; return result.str(); } string RelAgg::print(const unsigned int label /* = UINT_MAX */) { stringstream result; if (label == UINT_MAX) { // print everything for (unsigned int l = 0; l < freqLabels.size(); l++) { result << "\"" << freqLabels[label] << "\" :" << entries[label].print() << endl << "---------------------------------------------" << endl; } } else { if (entries[label].occs.empty()) { // label not found result << "Label \"" << freqLabels[label] << "\" not found" << endl; } else { result << "\"" << freqLabels[label] << "\" :" << entries[label].print() << endl; } } return result.str(); } /* Class ~FPNode~, Function ~toListExpr~ */ ListExpr FPNode::toListExpr(vector& freqLabels) const { ListExpr childrenList = nl->Empty(); if (!children.empty()) { childrenList = nl->OneElemList(nl->IntAtom(children[0])); } ListExpr childList = childrenList; for (unsigned int i = 1; i < children.size(); i++) { childList = nl->Append(childList, nl->IntAtom(children[i])); } string lb = (label < UINT_MAX ? 
freqLabels[label] : ""); return nl->FiveElemList(nl->SymbolAtom(lb), nl->IntAtom(frequency), childrenList, nl->IntAtom(nodeLink), nl->IntAtom(ancestor)); } /* Class ~FPTree~, Function ~isChildOf~ */ bool FPTree::isChildOf(unsigned int label, unsigned int pos, unsigned int& nextPos) { for (auto it : nodes[pos].children) { if (nodes[it].label == label) { nextPos = it; return true; } } nextPos = UINT_MAX; return false; } /* Class ~FPTree~, Function ~updateNodeLink~ */ void FPTree::updateNodeLink(unsigned int label, unsigned int targetPos) { map::iterator it = nodeLinks.find(label); if (it == nodeLinks.end()) { // no existing node link nodeLinks.insert(make_pair(label, targetPos)); } else { // node link for label exists unsigned int link = it->second; unsigned int currentPos = 0; while (link != 0) { // find end of node link currentPos = link; link = nodes[link].nodeLink; } nodes[currentPos].nodeLink = targetPos; } } /* Class ~FPTree~, Function ~insertLabelVector~ */ void FPTree::insertLabelVector(const vector& labelsOrdered, const unsigned int freq) { // cout << "insert: | "; // for (auto it : labelsOrdered) { // cout << it << " | "; // } // cout << endl; unsigned int nodePos(0), nextPos(0); for (auto label : labelsOrdered) { if (isChildOf(label, nodePos, nextPos)) { nodes[nextPos].frequency += freq; // cout << " \"" << label << "\" is child of \"" << nodes[nodePos].label // << "\", frequency = " << nodes[nextPos].frequency << endl; nodePos = nextPos; } else { FPNode node(label, freq, nodePos); nodes.push_back(node); nodes[nodePos].children.push_back(nodes.size() - 1); updateNodeLink(label, nodes.size() - 1); // cout << " new node for \"" << label << "\" at pos " // << nodes.size() - 1 << ", now child of " << nodePos << endl; nodePos = nodes.size() - 1; } } // cout << " ... 
SUCCESSFULLY inserted" << endl; } /* Class ~FPTree~, Function ~construct~ */ void FPTree::construct() { GenericRelationIterator* it = agg->rel->MakeScan(); MLabel *ml = 0; Tuple *tuple = 0; string label; set, compareLabelsWithSupp> labelsWithSupp; vector labelsOrdered; while ((tuple = it->GetNextTuple())) { labelsWithSupp.clear(); labelsOrdered.clear(); ml = (MLabel*)(tuple->GetAttribute(agg->attrPos.first)); for (int j = 0; j < ml->GetNoComponents(); j++) { ml->GetValue(j, label); unsigned int labelPos = agg->labelPos[label]; NewPair labelWithSupp(labelPos, agg->getSupp(labelPos)); if (labelWithSupp.second >= minSupp) { labelsWithSupp.insert(labelWithSupp); } } for (auto itl : labelsWithSupp) { labelsOrdered.push_back(itl.first); } insertLabelVector(labelsOrdered, 1); tuple->DeleteIfAllowed(); } delete it; } /* Class ~FPTree~, Function ~initialize~ */ void FPTree::initialize(const double ms, RelAgg *ra) { minSupp = ms; FPNode node(UINT_MAX, 0, 0); nodes.push_back(node); // create dummy node for root agg = ra; minSuppCnt = (unsigned int)std::ceil(minSupp * ra->noTuples); agg->checkedSeqs.resize(agg->maxNoAtoms + 1); } /* Class ~FPTree~, Function ~isOnePathTree~ */ bool FPTree::isOnePathTree() { for (unsigned int i = 0; i < getNoNodes(); i++) { if (i < getNoNodes() - 1) { // inner node if (nodes[i].children.size() != 1) { return false; // inner nodes have more or less than one child } if (nodes[i].children[0] != i+1) { return false; // wrong position of child } if (nodes[i].nodeLink != 0) { return false; // node link exists } } if (i == getNoNodes() - 1 && nodes[i].children.size() > 0) { return false; // leaf node has childs (ERROR) } } return true; } /* Class ~FPTree~, Function ~sortNodeLinks~ */ void FPTree::sortNodeLinks(vector& result) { result.clear(); set, compareLabelsWithSupp> labelsWithSupp; for (auto it : nodeLinks) { labelsWithSupp.insert(NewPair(it.first, agg->getSupp(it.first))); } for (set, compareLabelsWithSupp>::reverse_iterator it = 
labelsWithSupp.rbegin(); it != labelsWithSupp.rend(); ++it) { result.push_back(it->first); } } /* Class ~FPTree~, Function ~collectPatternsFromSeq~ */ void FPTree::collectPatternsFromSeq(vector& labelSeq, const unsigned int minNoAtoms, const unsigned int maxNoAtoms) { set > labelSubsets, labelPerms; unsigned int minSetSize = max(minNoAtoms, (unsigned int)2); set commonTupleIds; string atom, pattern; double supp; // find all subsets of label sequence, having a suitable number of elements unsigned int setSize = minSetSize; unsigned oldResultSize = 0; bool freqkPatFound = true; while (setSize <= maxNoAtoms && freqkPatFound) { agg->retrieveLabelSubsets(setSize, labelSeq, labelSubsets); for (auto subset : labelSubsets) { if (agg->nonfreqSets[setSize].find(subset) == agg->nonfreqSets[setSize].end()) { bool isSetFreq = agg->freqSets[setSize].find(subset) != agg->freqSets[setSize].end(); if (!isSetFreq) { isSetFreq = agg->canLabelsBeFrequent(subset, commonTupleIds); } if (isSetFreq) { agg->freqSets[setSize].insert(subset); labelPerms.clear(); do { // process all unchecked permutations of ~subset~ if (agg->checkedSeqs[setSize].find(subset) == agg->checkedSeqs[setSize].end()) { supp = agg->sequenceSupp(subset, commonTupleIds); if (supp >= minSupp) { for (unsigned int i = 0; i < subset.size(); i++) { if (!agg->buildAtom(subset[i], agg->entries[subset[i]], commonTupleIds, atom)) { cout << "Error in buildAtom for " << subset[i] << endl; return; } pattern += atom + " "; } agg->results.push_back(NewPair(pattern, supp)); pattern.clear(); } agg->checkedSeqs[setSize].insert(subset); } } while (std::next_permutation(subset.begin(), subset.end())); } else { agg->nonfreqSets[setSize].insert(subset); } } } freqkPatFound = (agg->results.size() > oldResultSize); oldResultSize = agg->results.size(); setSize++; } } /* Class ~FPTree~, Function ~computeReducedCondBase~ */ void FPTree::computeCondPatternBase(vector& labelSeq, vector, unsigned int> >& result) { result.clear(); NewPair, unsigned 
int> labelPathWithSuppCnt; unsigned int link = nodeLinks[*(labelSeq.rbegin())]; unsigned int anc, freq; while (link != 0) { anc = nodes[link].ancestor; freq = nodes[link].frequency; while (anc != 0) { // retrieve whole branch above ~label~ node labelPathWithSuppCnt.first.push_back(nodes[anc].label); anc = nodes[anc].ancestor; } labelPathWithSuppCnt.second = freq; if (!labelPathWithSuppCnt.first.empty()) { std::reverse(labelPathWithSuppCnt.first.begin(), labelPathWithSuppCnt.first.end()); result.push_back(labelPathWithSuppCnt); labelPathWithSuppCnt.first.clear(); } link = nodes[link].nodeLink; } } /* Class ~FPTree~, Function ~constructCondTree~ */ FPTree* FPTree::constructCondTree( vector, unsigned int> >& condPB) { if (condPB.empty()) { return 0; } FPTree *condFPTree = new FPTree(); // cout << "new tree created... " << endl; condFPTree->initialize(minSupp, agg); map labelsToSuppCnt; map::iterator mapIt; // build map: label --> suppCnt for (auto labelSeqWithSuppCnt : condPB) { for (auto label : labelSeqWithSuppCnt.first) { mapIt = labelsToSuppCnt.find(label); if (mapIt != labelsToSuppCnt.end()) { // label found; increase suppCnt mapIt->second += labelSeqWithSuppCnt.second; } else { // label not found; insert labelsToSuppCnt[label] = labelSeqWithSuppCnt.second; } } } // keep only labels having suppCnt >= minSuppCnt vector, unsigned int> > freqCondPB; vector labelSeq; for (auto labelSeqWithSuppCnt : condPB) { for (auto label : labelSeqWithSuppCnt.first) { if (labelsToSuppCnt[label] >= condFPTree->minSuppCnt) { labelSeq.push_back(label); } } if (!labelSeq.empty()) { freqCondPB.push_back(NewPair, unsigned int>(labelSeq, labelSeqWithSuppCnt.second)); labelSeq.clear(); } } if (freqCondPB.empty()) { delete condFPTree; return 0; } for (auto it : freqCondPB) { condFPTree->insertLabelVector(it.first, it.second); } // cout << " ... 
filled, " << condFPTree->getNoNodes() << " nodes" << endl; return condFPTree; } /* Class ~FPTree~, Function ~mineTree~ */ void FPTree::mineTree(vector& initLabels, const unsigned int minNoAtoms, const unsigned int maxNoAtoms) { if (!hasNodes()) { return; } if (isOnePathTree()) { // cout << " tree has ONE path, " << nodes.size() - 1 << " node(s) : <"; // for (auto it : initLabels) { // cout << agg->freqLabels[it] << ", "; // } // cout << "| "; set freqLabels(initLabels.begin(), initLabels.end()); for (unsigned int i = 1; i < nodes.size(); i++) { freqLabels.insert(nodes[i].label); // cout << agg->freqLabels[nodes[i].label] << ", "; } // cout << ">" << endl; vector labels(freqLabels.begin(), freqLabels.end()); collectPatternsFromSeq(labels, minNoAtoms, maxNoAtoms); // cout << " ... all patterns collected" << endl; } else { // tree has more than one path // cout << "tree has SEVERAL paths" << endl; vector labelsSortedByFrequency, labelSeq(initLabels.begin(), initLabels.end()); sortNodeLinks(labelsSortedByFrequency); vector, unsigned int> > condPatBase; for (auto label : labelsSortedByFrequency) { labelSeq.push_back(label); if (labelSeq.size() > 1) { collectPatternsFromSeq(labelSeq, minNoAtoms, maxNoAtoms); } computeCondPatternBase(labelSeq, condPatBase); // cout << "rPB for " << agg->print(labelSeq) << " has " // << condPatBase.size() << " elems: "; // for (auto it : condPatBase) { // cout << "-" << agg->print(it.first) << ",freq=" << it.second << endl; // } FPTree *condFPTree = constructCondTree(condPatBase); if (condFPTree != 0) { // Word fptval; // fptval.addr = condFPTree; // SecondoCatalog* sc = SecondoSystem::GetCatalog(); // cout << nl->ToString(FPTree::Out(sc->NumericType(nl->SymbolAtom( // BasicType())), fptval)) << endl; condFPTree->mineTree(labelSeq, minNoAtoms, maxNoAtoms); delete condFPTree; } labelSeq.pop_back(); } } } /* Class ~FPTree~, Function ~retrievePatterns~ */ void FPTree::retrievePatterns(const unsigned int minNoAtoms, const unsigned int 
maxNoAtoms) { if (minNoAtoms == 1) { string pattern, atom; set commonTupleIds; vector frequentLabels; double supp = 1.0; for (unsigned int l = 0; l < agg->entries.size(); l++) { // retrieve 1-pats agg->buildAtom(l, agg->entries[l], commonTupleIds, atom); supp = double(agg->entries[l].occs.size()) / agg->noTuples; agg->results.push_back(NewPair(atom, supp)); frequentLabels.push_back(l); } cout << frequentLabels.size() << " frequent 1-patterns found" << endl; } vector initialLabels; agg->checkedSeqs.resize(maxNoAtoms + 1); agg->freqSets.resize(maxNoAtoms + 1); agg->nonfreqSets.resize(maxNoAtoms + 1); mineTree(initialLabels, minNoAtoms, maxNoAtoms); std::sort(agg->results.begin(), agg->results.end(), comparePMResults()); } /* Class ~FPTree~, Function ~computeNodesSize~ Compute storage space in bytes for ~Nodes~: noNodes + noNodes * (label + freq + noChildren + noChildren * child + nodeLink + anc) */ unsigned long long int FPTree::computeNodesSize() const { unsigned long long int result = sizeof(unsigned int); for (auto node : nodes) { result += (5 + node.children.size()) * sizeof(unsigned int); } return result; } /* Class ~FPTree~, Function ~computeNodeLinksSize~ Compute storage space in bytes for ~NodeLinks~: noNL + noNL * (label + nodePos) */ unsigned long long int FPTree::computeNodeLinksSize() const { unsigned long long int result = sizeof(unsigned int) + nodeLinks.size() * 2 * sizeof(unsigned int); return result; } /* Class ~FPTree~, functions for secondo data type */ ListExpr FPTree::Property() { return (nl->TwoElemList( nl->FourElemList( nl->StringAtom("Signature"), nl->StringAtom("Example Type List"), nl->StringAtom("List Rep"), nl->StringAtom("Example List")), nl->FourElemList ( nl->StringAtom("-> SIMPLE"), nl->StringAtom(FPTree::BasicType()), nl->StringAtom("no list representation"), nl->StringAtom("")))); } Word FPTree::In(const ListExpr typeInfo, const ListExpr instance, const int errorPos, ListExpr& errorInfo, bool& correct) { correct = false; return 
SetWord(Address(0)); } ListExpr FPTree::getNodeLinksList(unsigned int label) { unsigned int link = nodeLinks[label]; ListExpr result = nl->OneElemList(nl->IntAtom(link)); ListExpr nodeLinkList = result; link = nodes[link].nodeLink; while (link != 0) { nodeLinkList = nl->Append(nodeLinkList, nl->IntAtom(link)); link = nodes[link].nodeLink; } return result; } ListExpr FPTree::Out(ListExpr typeInfo, Word value) { FPTree *tree = (FPTree*)value.addr; ListExpr nodesList(nl->Empty()), nodeList(nl->Empty()), nodeLinksList(nl->Empty()), nodeLinkList(nl->Empty()); ListExpr noTuplesList = nl->TwoElemList(nl->SymbolAtom("noTuples"), nl->IntAtom(tree->agg->noTuples)); ListExpr minSuppList = nl->TwoElemList(nl->SymbolAtom("minSupp"), nl->RealAtom(tree->minSupp)); if (tree->hasNodes()) { nodesList = nl->OneElemList(tree->nodes[0].toListExpr (tree->agg->freqLabels)); nodeList = nodesList; } for (unsigned int i = 1; i < tree->nodes.size(); i++) { nodeList = nl->Append(nodeList, tree->nodes[i].toListExpr (tree->agg->freqLabels)); } map::iterator it = tree->nodeLinks.begin(); if (tree->hasNodeLinks()) { nodeLinksList = nl->OneElemList(nl->TwoElemList(nl->SymbolAtom (tree->agg->freqLabels[it->first]), tree->getNodeLinksList(it->first))); nodeLinkList = nodeLinksList; } it++; while (it != tree->nodeLinks.end()) { nodeLinkList = nl->Append(nodeLinkList, nl->TwoElemList(nl->SymbolAtom(tree->agg->freqLabels[it->first]), tree->getNodeLinksList(it->first))); it++; } return nl->FourElemList(noTuplesList, minSuppList, nodesList, nodeLinksList); } Word FPTree::Create(const ListExpr typeInfo) { Word w; w.addr = (new FPTree()); return w; } void FPTree::Delete(const ListExpr typeInfo, Word& w) { FPTree *tree = (FPTree*)w.addr; delete tree; w.addr = 0; } bool FPTree::Save(SmiRecord& valueRecord, size_t& offset, const ListExpr typeInfo, Word& value) { FPTree *tree = (FPTree*)value.addr; // store minSupp if (!valueRecord.Write(&tree->minSupp, sizeof(double), offset)) { return false; } offset += 
sizeof(double); // store noNodes and nodes unsigned long long int nodesSize = tree->computeNodesSize(); cout << "size of nodes is " << nodesSize << endl; if (!valueRecord.Write(&nodesSize, sizeof(unsigned long long int), offset)) { return false; } offset += sizeof(unsigned long long int); char* nodesChars = new char[nodesSize]; unsigned int noNodes = tree->getNoNodes(); size_t offsetNodes = 0; memcpy(nodesChars + offsetNodes, &noNodes, sizeof(unsigned int)); offsetNodes += sizeof(unsigned int); string label; unsigned int numLabel, frequency, noChildren, child, nodeLink, ancestor; for (unsigned int i = 0; i < noNodes; i++) { // store nodes numLabel = tree->nodes[i].label; memcpy(nodesChars + offsetNodes, &numLabel, sizeof(unsigned int)); offsetNodes += sizeof(unsigned int); frequency = tree->nodes[i].frequency; memcpy(nodesChars + offsetNodes, &frequency, sizeof(unsigned int)); offsetNodes += sizeof(unsigned int); noChildren = tree->nodes[i].children.size(); memcpy(nodesChars + offsetNodes, &noChildren, sizeof(unsigned int)); offsetNodes += sizeof(unsigned int); for (unsigned int j = 0; j < tree->nodes[i].children.size(); j++) {//childr. 
child = tree->nodes[i].children[j]; memcpy(nodesChars + offsetNodes, &child, sizeof(unsigned int)); offsetNodes += sizeof(unsigned int); } nodeLink = tree->nodes[i].nodeLink; memcpy(nodesChars + offsetNodes, &nodeLink, sizeof(unsigned int)); offsetNodes += sizeof(unsigned int); ancestor = tree->nodes[i].ancestor; memcpy(nodesChars + offsetNodes, &ancestor, sizeof(unsigned int)); offsetNodes += sizeof(unsigned int); } if (!valueRecord.Write(nodesChars, nodesSize, offset)) { return false; } offset += nodesSize; delete[] nodesChars; // store noNodeLinks unsigned long long int nodeLinksSize = tree->computeNodeLinksSize(); cout << "size of nodelinks is " << nodeLinksSize << endl; if (!valueRecord.Write(&nodeLinksSize, sizeof(unsigned long long int), offset)) { return false; } offset += sizeof(unsigned long long int); char* nodeLinksChars = new char[nodeLinksSize]; unsigned int noNodeLinks = tree->getNoNodeLinks(); size_t offsetNodeLinks = 0; memcpy(nodeLinksChars + offsetNodeLinks, &noNodeLinks, sizeof(unsigned int)); offsetNodeLinks += sizeof(unsigned int); // store nodeLinks for (map::iterator it = tree->nodeLinks.begin(); it != tree->nodeLinks.end(); it++) { // store nodeLinks numLabel = it->first; memcpy(nodeLinksChars + offsetNodeLinks, &numLabel, sizeof(unsigned int)); offsetNodeLinks += sizeof(unsigned int); nodeLink = it->second; memcpy(nodeLinksChars + offsetNodeLinks, &nodeLink, sizeof(unsigned int)); offsetNodeLinks += sizeof(unsigned int); } if (!valueRecord.Write(nodeLinksChars, nodeLinksSize, offset)) { return false; } offset += nodeLinksSize; delete[] nodeLinksChars; // store ~noTuples~, ~entries~ and ~freqLabels~ from relAgg if (RelAgg::saveToRecord(tree->agg, valueRecord, offset)) { return false; } return true; } bool FPTree::Open(SmiRecord& valueRecord, size_t& offset, const ListExpr typeInfo, Word& value) { FPTree *tree = new FPTree(); // read minSupp if (!valueRecord.Read(&tree->minSupp, sizeof(double), offset)) { return false; } offset += 
sizeof(double); unsigned int numLabel, noNodes, frequency, noChildren, child, nodeLink, ancestor, noNodeLinks; // read nodes unsigned long long int nodesSize; if (!valueRecord.Read(&nodesSize, sizeof(unsigned long long int), offset)) { return false; } offset += sizeof(unsigned long long int); // cout << "size of NODES is " << nodesSize << endl; char* nodesChars = new char[nodesSize]; size_t offsetNodes = 0; if (!valueRecord.Read(nodesChars, nodesSize, offset)) { return false; } offset += nodesSize; memcpy(&noNodes, nodesChars + offsetNodes, sizeof(unsigned int)); offsetNodes += sizeof(unsigned int); for (unsigned int i = 0; i < noNodes; i++) { // read nodes memcpy(&numLabel, nodesChars + offsetNodes, sizeof(unsigned int)); offsetNodes += sizeof(unsigned int); memcpy(&frequency, nodesChars + offsetNodes, sizeof(unsigned int)); offsetNodes += sizeof(unsigned int); memcpy(&noChildren, nodesChars + offsetNodes, sizeof(unsigned int)); offsetNodes += sizeof(unsigned int); vector children; for (unsigned int j = 0; j < noChildren; j++) { memcpy(&child, nodesChars + offsetNodes, sizeof(unsigned int)); offsetNodes += sizeof(unsigned int); children.push_back(child); } memcpy(&nodeLink, nodesChars + offsetNodes, sizeof(unsigned int)); offsetNodes += sizeof(unsigned int); memcpy(&ancestor, nodesChars + offsetNodes, sizeof(unsigned int)); offsetNodes += sizeof(unsigned int); // cout << "create node: " << numLabel << ", " << frequency << ", " // << children.size() << ", " << nodeLink << ", " << ancestor << endl; FPNode node(numLabel, frequency, children, nodeLink, ancestor); tree->nodes.push_back(node); } delete[] nodesChars; // read nodeLinks unsigned long long int nodeLinksSize = 0; if (!valueRecord.Read(&nodeLinksSize, sizeof(unsigned long long int), offset)) { return false; } offset += sizeof(unsigned long long int); char* nodeLinksChars = new char[nodeLinksSize]; if (!valueRecord.Read(nodeLinksChars, nodeLinksSize, offset)) { return false; } offset += nodeLinksSize; size_t 
offsetNodeLinks = 0; memcpy(&noNodeLinks, nodeLinksChars, sizeof(unsigned int)); offsetNodeLinks += sizeof(unsigned int); for (unsigned int i = 0; i < noNodeLinks; i++) { memcpy(&numLabel, nodeLinksChars + offsetNodeLinks, sizeof(unsigned int)); offsetNodeLinks += sizeof(unsigned int); memcpy(&nodeLink, nodeLinksChars + offsetNodeLinks, sizeof(unsigned int)); offsetNodeLinks += sizeof(unsigned int); // cout << "create nodeLink: " << numLabel << " --> " << nodeLink << endl; tree->nodeLinks.insert(tree->nodeLinks.begin(), make_pair(numLabel, nodeLink)); } delete[] nodeLinksChars; tree->agg = new RelAgg(); if (!RelAgg::readFromRecord(tree->agg, valueRecord, offset)) { return false; } value.setAddr(tree); return true; } void FPTree::Close(const ListExpr typeInfo, Word& w) { FPTree *tree = (FPTree*)w.addr; delete tree; w.addr = 0; } Word FPTree::Clone(const ListExpr typeInfo, const Word& w) { FPTree *tree = (FPTree*)w.addr; Word res; res.addr = new FPTree(*tree); return res; } int FPTree::SizeOfObj() { return sizeof(FPTree); } bool FPTree::TypeCheck(ListExpr type, ListExpr& errorInfo) { return nl->IsEqual(type, BasicType()); } /* Type constructor for secondo type ~fptree~ */ TypeConstructor fptreeTC( FPTree::BasicType(), FPTree::Property, FPTree::Out, FPTree::In, 0, 0, FPTree::Create, FPTree::Delete, FPTree::Open, FPTree::Save, FPTree::Close, FPTree::Clone, 0, FPTree::SizeOfObj, FPTree::TypeCheck); /* Class ~ProjectedDB~, function ~clear~ */ void ProjectedDB::clear() { } /* Class ~ProjectedDB~, function ~initialize~ */ void ProjectedDB::initialize(const double ms, RelAgg *ra) { minSupp = ms; agg = ra; minSuppCnt = (unsigned int)std::ceil(minSupp * ra->noTuples); agg->checkedSeqs.resize(agg->maxNoAtoms + 1); projections.resize(agg->freqLabels.size()); } /* Class ~ProjectedDB~, function ~addProjections~ */ void ProjectedDB::addProjections(vector& labelSeq, unsigned int label, const vector& prefix /* = vector() */) { if (labelSeq.empty()) { return; } // cout << "compute 
proj for seq " << agg->print(labelSeq) << endl; vector projPresent; vector reducedSeq; projPresent.resize(agg->freqLabels.size(), false); if (prefix.empty()) { for (unsigned int pos = 0; pos < labelSeq.size() - 1; pos++) { if (label == UINT_MAX || label == labelSeq[pos]) { if (!projPresent[labelSeq[pos]]) { reducedSeq.assign(labelSeq.begin() + pos + 1, labelSeq.end()); // cout << " projection(<" << labelSeq[pos] << ">) : " // << agg->print(reducedSeq) << endl; projections[labelSeq[pos]].push_back(reducedSeq); projPresent[labelSeq[pos]] = true; } } } } else { for (unsigned int pos = 0; pos < labelSeq.size() - 1; pos++) { if (label == UINT_MAX || label == labelSeq[pos]) { if (!projPresent[labelSeq[pos]]) { for (unsigned int i = pos + 1; i < labelSeq.size(); i++) { if (std::find(prefix.begin(), prefix.end(), labelSeq[i]) == prefix.end()) { reducedSeq.push_back(labelSeq[i]); } // cout << " projection(<" << labelSeq[pos] << ">) : " // << agg->print(reducedSeq) << endl; } if (!reducedSeq.empty()) { projections[labelSeq[pos]].push_back(reducedSeq); projPresent[labelSeq[pos]] = true; reducedSeq.clear(); } } } } } } /* Class ~ProjectedDB~, function ~computeSMatrix~ */ void ProjectedDB::computeSMatrix(vector& freqLabels, vector >& fPos) { // cout << "computeSMatrix - freq Labels: " << agg->print(freqLabels) << endl; smatrix.init(freqLabels.size()); vector counted; counted.resize(freqLabels.size(), false); for (unsigned int i = 0; i < freqLabels.size(); i++) { freqLabelPos.push_back(freqLabels[i]); if (!projections[i].empty()) { for (auto seq : projections[i]) { for (auto label : seq) { if (!counted[label] && label != freqLabels[i]) { smatrix.increment(i, label); if (smatrix[i][label] == minSuppCnt) { fPos.push_back(NewPair(i, label)); } counted[label] = true; } } counted.assign(freqLabels.size(), false); } } } } /* Class ~ProjectedDB~, function ~construct~ */ void ProjectedDB::construct() { // cout << "frequent labels: " << agg->print(agg->freqLabels) << endl; 
GenericRelationIterator* it = agg->rel->MakeScan(); MLabel *ml = 0; Tuple *tuple = 0; vector labelSeq; vector > projection; while ((tuple = it->GetNextTuple())) { ml = (MLabel*)(tuple->GetAttribute(agg->attrPos.first)); agg->getLabelSeqFromMLabel(ml, labelSeq); addProjections(labelSeq, UINT_MAX); // add all available projections tuple->DeleteIfAllowed(); } delete it; for (unsigned int i = 0; i < projections.size(); i++) { if (projections[i].size() < minSuppCnt) { // cout << " remove " << projections[i].size() << " projs for label " // << agg->freqLabels[i] << endl; projections[i].clear(); } } vector freqLabels; for (unsigned int i = 0; i < agg->freqLabels.size(); i++) { freqLabels.push_back(i); } computeSMatrix(freqLabels, freqMatrixPos); // cout << "flPos: " << agg->print(freqLabelPos) << endl << "SMatrix:" << endl // << smatrix.print() << endl; } /* Class ~ProjectedDB~, Function ~minePDB~ */ void ProjectedDB::minePDB(vector& prefix, string& patPrefix, unsigned int pos, const unsigned int minNoAtoms, const unsigned int maxNoAtoms) { // cout << "minePDB for prefix " << agg->print(prefix) << "; " // << projections[pos].size() << " projs for pos " << pos << endl; if (prefix.size() + 1 > maxNoAtoms || projections[pos].size() < minSuppCnt) { return; } // compute frequent labels vector labelCounter; set newFreqLabels; labelCounter.resize(agg->freqLabels.size(), 0); vector hasBeenCounted; hasBeenCounted.resize(agg->freqLabels.size(), false); for (auto seq : projections[pos]) { for (auto label : seq) { if (!hasBeenCounted[label]) { labelCounter[label]++; hasBeenCounted[label] = true; } } hasBeenCounted.assign(hasBeenCounted.size(), false); } vector > reducedSeqs; vector reducedSeq; unsigned lastLabel = UINT_MAX; // compute reduced sequences for (auto seq : projections[pos]) { for (auto label : seq) { if (labelCounter[label] >= minSuppCnt && label != lastLabel) { reducedSeq.push_back(label); lastLabel = label; newFreqLabels.insert(label); } } if (!reducedSeq.empty()) { 
reducedSeqs.push_back(reducedSeq); reducedSeq.clear(); } lastLabel = UINT_MAX; } // cout << " reduced projections for prefix " << agg->print(prefix) << ": "; // for (auto seq : reducedSeqs) { // cout << agg->print(seq) << ", "; // } // cout << "; frequent labels: " << agg->print(newFreqLabels) << endl; // build atoms for prefix set commonTupleIds; string atom, patPrefixExt; ProjectedDB *pdb = new ProjectedDB(minSupp, minSuppCnt, agg, agg->freqLabels.size()); // cout << "### NEW pdb CREATED for prefix " << agg->print(prefix) << endl; // for (unsigned int i = 0; i < prefix.size(); i++) { // agg->buildAtom(prefix[i], agg->entries[prefix[i]], commonTupleIds, atom); // patPrefix += atom + " "; // } // complete patterns, recursion with extended prefixes, TODO: use intersection for (auto flabel : newFreqLabels) { if (flabel != prefix[prefix.size() - 1]) { // cout << " result sequence " << agg->print(prefix) << ", " << flabel // << " found" << endl; agg->buildAtom(flabel, agg->entries[flabel], commonTupleIds, atom); patPrefixExt = patPrefix + " " + atom; if (prefix.size() + 1 >= minNoAtoms) { agg->results.push_back(NewPair(patPrefixExt, (double)labelCounter[flabel] / agg->noTuples)); } atom.clear(); if (prefix.size() + 1 < maxNoAtoms) { prefix.push_back(flabel); for (auto seq : reducedSeqs) { if (!projections[flabel].empty()) { // Optimization 1 from PS paper pdb->addProjections(seq, flabel); } // cout << " //// projections for seq " << agg->print(seq) // << " added; projections[" << flabel << "] has " // << pdb->projections[flabel].size() << " elements, " // << reducedSeqs.size() << " rseqs total" << endl; } for (unsigned int i = 0; i < pdb->projections.size(); i++) { if (pdb->projections[i].size() < pdb->minSuppCnt) { // cout << " remove " << pdb->projections[i].size() // << " projs for label " << pdb->agg->freqLabels[i] << endl; pdb->projections[i].clear(); } // else { TODO: restore this path if this function is invoked // pdb->projPos.push_back(i); // } } 
pdb->minePDB(prefix, patPrefixExt, flabel, minNoAtoms, maxNoAtoms); prefix.pop_back(); } } } delete pdb; } /* Class ~ProjectedDB~, function ~retrievePatterns~ compute projections, report frequent items, build S-matrix, report freq cells */ void ProjectedDB::minePDBSMatrix(vector& prefix, string& patPrefix, unsigned int minNoAtoms, unsigned int maxNoAtoms) { unsigned int secondLast(prefix[prefix.size() - 2]), last(prefix[prefix.size() - 1]); vector labelCounter; // cout << "FreqLabelPos: " << agg->print(freqLabelPos) << endl; labelCounter.resize(agg->freqLabels.size(), 0); vector counted; counted.resize(labelCounter.size(), false); string atom, pattern; vector atoms; set commonTupleIds; vector freqLabels; ProjectedDB *pdb = new ProjectedDB(minSupp, minSuppCnt, agg, projections.size()); for (auto seq : projections[secondLast]) { pdb->addProjections(seq, last, prefix); } // cout << "Prefix " << agg->print(prefix) << " : " << endl; // for (unsigned int i = 0; i < pdb->projections.size(); i++) { // cout << " pos " << i << " : " // << nl->ToString(projToListExpr(pdb->projections[i])) << endl; // } for (auto seq : pdb->projections[last]) { // cout << " count sequence " << agg->print(seq) << endl; for (auto label : seq) { if (!counted[label]) { labelCounter[label]++; counted[label] = true; } } counted.assign(counted.size(), false); } // cout << "labelCounter: " << agg->print(labelCounter) << endl // << "freqLabelPos: " << agg->print(freqLabelPos) << endl; // build patterns for frequently occurring labels in projections map matrixPos; for (unsigned int label = 0; label < labelCounter.size(); label++) { if (labelCounter[label] >= minSuppCnt) { freqLabels.push_back(label); matrixPos[label] = freqLabels.size() - 1; pdb->freqLabelPos.push_back(label); agg->buildAtom(label, agg->entries[label], commonTupleIds, atom); atoms.push_back(atom); if (prefix.size() + 1 >= minNoAtoms) { // cout << " RESULT " << agg->print(prefix) << " " << label << endl; 
agg->results.push_back(NewPair(patPrefix + " " + atom, double(labelCounter[label]) / agg->noTuples)); } } } if (prefix.size() + 2 > maxNoAtoms) { return; } // build S-matrix pdb->smatrix.init(pdb->freqLabelPos.size()); // cout << "build S-matrix for freqLabels " << agg->print(freqLabels) << endl; for (auto seq : pdb->projections[last]) { for (unsigned int i = 0; i < seq.size(); i++) { if (labelCounter[seq[i]] >= minSuppCnt) { for (unsigned int j = i + 1; j < seq.size(); j++) { if (labelCounter[seq[j]] >= minSuppCnt) { pdb->smatrix.increment(matrixPos[seq[i]], matrixPos[seq[j]]); if (pdb->smatrix(matrixPos[seq[i]], matrixPos[seq[j]]) == minSuppCnt) { pdb->freqMatrixPos.push_back(NewPair (matrixPos[seq[i]], matrixPos[seq[j]])); } } } } } } // cout << pdb->smatrix.print() << " freqMatrixPos = {"; // for (auto it : pdb->freqMatrixPos) { // cout << "(" << it.first << ", " << it.second << ") "; // } // cout << "}" << endl; // prepare projections for longer prefix if (prefix.size() + 1 <= maxNoAtoms) { for (auto seq : pdb->projections[last]) { pdb->addProjections(seq, UINT_MAX, prefix); } } pdb->projections[last].clear(); // for (unsigned int i = 0; i < pdb->projections.size(); i++) { // cout << " ### pos " << i << " : " // << nl->ToString(projToListExpr(pdb->projections[i])) << endl; // } // report frequent matrix positions for (auto fPos : pdb->freqMatrixPos) { if (std::find(prefix.begin(), prefix.end(), pdb->freqLabelPos[fPos.first]) == prefix.end() && std::find(prefix.begin(), prefix.end(), pdb->freqLabelPos[fPos.second]) == prefix.end()) { prefix.push_back(pdb->freqLabelPos[fPos.first]); prefix.push_back(pdb->freqLabelPos[fPos.second]); pattern = patPrefix + " " + atoms[fPos.first] + " " + atoms[fPos.second]; if (prefix.size() >= minNoAtoms) { agg->results.push_back(NewPair(pattern, (double)pdb->smatrix(fPos.first, fPos.second) / agg->noTuples)); } if (prefix.size() + 1 <= maxNoAtoms) { pdb->minePDBSMatrix(prefix, pattern, minNoAtoms, maxNoAtoms); } 
prefix.pop_back(); prefix.pop_back(); } } delete pdb; } /* Class ~ProjectedDB~, function ~retrievePatterns~ */ void ProjectedDB::retrievePatterns(const unsigned int minNoAtoms, const unsigned int maxNoAtoms) { vector atoms; string atom, patPrefix; atoms.resize(agg->freqLabels.size(), ""); set commonTupleIds; double supp = 1.0; vector prefix; for (unsigned int l = 0; l < agg->freqLabels.size(); l++) { agg->buildAtom(l, agg->entries[l], commonTupleIds, atoms[l]); if (minNoAtoms == 1) { supp = double(agg->entries[l].occs.size()) / agg->noTuples; agg->results.push_back(NewPair(atoms[l], supp)); } } cout << agg->results.size() << " frequent 1-patterns found" << endl; if (maxNoAtoms >= 2) { for (auto fPos : freqMatrixPos) { agg->buildAtom(freqLabelPos[fPos.second], agg->entries[freqLabelPos[fPos.second]], commonTupleIds, atom); patPrefix = atoms[freqLabelPos[fPos.first]] + " " + atom; if (minNoAtoms <= 2) { agg->results.push_back(NewPair(patPrefix, (double)smatrix(fPos.first, fPos.second) / agg->noTuples)); } if (maxNoAtoms >= 3) { prefix.push_back(fPos.first); prefix.push_back(fPos.second); minePDBSMatrix(prefix, patPrefix, minNoAtoms, maxNoAtoms); prefix.clear(); } } } // vector prefix; // for (unsigned int i = 0; i < projPos.size(); i++) { // prefix.push_back(projPos[i]); // minePDB(prefix, atoms[projPos[i]], projPos[i], minNoAtoms, maxNoAtoms); // prefix.pop_back(); // } std::sort(agg->results.begin(), agg->results.end(), comparePMResults()); } /* Class ~ProjectedDB~, Function ~computeProjSize~ Compute storage space in bytes for all projections: */ unsigned long long int ProjectedDB::computeProjSize() const { unsigned long long int result = sizeof(unsigned int); for (unsigned int i = 0; i < projections.size(); i++) { result += sizeof(unsigned int); for (unsigned int j = 0; j < projections[i].size(); j++) { result += sizeof(unsigned int) * (projections[i][j].size() + 1); } } return result; } /* Class ~ProjectedDB~, Function ~computeMatrixSize~ Compute storage space in 
bytes for smatrix: */ unsigned long long int ProjectedDB::computeMatrixSize() const { unsigned long long int result = sizeof(unsigned int) * (smatrix.size * smatrix.size + 1); return result; } /* Class ~ProjectedDB~, Function ~computeFreqLabelPosSize~ Compute storage space in bytes for freqLabelPos: */ unsigned long long int ProjectedDB::computeFreqLabelPosSize() const { unsigned long long int result = sizeof(unsigned int)*(freqLabelPos.size()+1); return result; } /* Class ~ProjectedDB~, Function ~computeFreqMatrixPosSize~ Compute storage space in bytes for freqMatrixPos: */ unsigned long long int ProjectedDB::computeFreqMatrixPosSize() const { unsigned long long int result = sizeof(unsigned int) * (2 * freqMatrixPos.size() + 1); return result; } /* Class ~ProjectedDB~, functions for secondo data type */ ListExpr ProjectedDB::seqToListExpr(vector& seq) { ListExpr seqList(nl->Empty()), seqListTemp; if (!seq.empty()) { seqList = nl->OneElemList(nl->IntAtom(seq[0])); seqListTemp = seqList; } for (unsigned int i = 1; i < seq.size(); i++) { seqListTemp = nl->Append(seqListTemp, nl->IntAtom(seq[i])); } return seqList; } ListExpr ProjectedDB::projToListExpr(vector >& proj) { ListExpr projList(nl->Empty()), projListTemp; if (!proj.empty()) { projList = nl->OneElemList(seqToListExpr(proj[0])); projListTemp = projList; } for (unsigned int i = 1; i < proj.size(); i++) { projListTemp = nl->Append(projListTemp, seqToListExpr(proj[i])); } return projList; } ListExpr ProjectedDB::Out(ListExpr typeInfo, Word value) { ProjectedDB *pdb = (ProjectedDB*)value.addr; ListExpr noTuplesList = nl->TwoElemList(nl->SymbolAtom("noTuples"), nl->IntAtom(pdb->agg->noTuples)); ListExpr minSuppList = nl->TwoElemList(nl->SymbolAtom("minSupp"), nl->RealAtom(pdb->minSupp)); ListExpr projList(nl->Empty()), projsList(nl->Empty()); if (!pdb->projections.empty()) { projsList = nl->OneElemList(nl->TwoElemList(nl->SymbolAtom( pdb->agg->freqLabels[0]), pdb->projToListExpr(pdb->projections[0]))); projList = 
projsList; } for (unsigned int i = 1; i < pdb->projections.size(); i++) { projList = nl->Append(projList, nl->TwoElemList(nl->TextAtom(pdb->agg->freqLabels[i]), pdb->projToListExpr(pdb->projections[i]))); } return nl->ThreeElemList(noTuplesList, minSuppList, projsList); } ListExpr ProjectedDB::Property() { return (nl->TwoElemList( nl->FourElemList( nl->StringAtom("Signature"), nl->StringAtom("Example Type List"), nl->StringAtom("List Rep"), nl->StringAtom("Example List")), nl->FourElemList ( nl->StringAtom("-> SIMPLE"), nl->StringAtom(ProjectedDB::BasicType()), nl->StringAtom("no list representation"), nl->StringAtom("")))); } Word ProjectedDB::In(const ListExpr typeInfo, const ListExpr instance, const int errorPos, ListExpr& errorInfo, bool& correct) { correct = false; return SetWord(Address(0)); } Word ProjectedDB::Create(const ListExpr typeInfo) { Word w; w.addr = (new ProjectedDB()); return w; } void ProjectedDB::Delete(const ListExpr typeInfo, Word& w) { ProjectedDB *pdb = (ProjectedDB*)w.addr; delete pdb; w.addr = 0; } bool ProjectedDB::Save(SmiRecord& valueRecord, size_t& offset, const ListExpr typeInfo, Word& value) { ProjectedDB *pdb = (ProjectedDB*)value.addr; // store minSupp if (!valueRecord.Write(&pdb->minSupp, sizeof(double), offset)) { return false; } offset += sizeof(double); // store agg if (!RelAgg::saveToRecord(pdb->agg, valueRecord, offset)) { return false; } // prepare storing everything else in one record unsigned long long int recordSize = pdb->computeProjSize() + pdb->computeMatrixSize() + pdb->computeFreqLabelPosSize() + pdb->computeFreqMatrixPosSize(); cout << "size of record is " << recordSize << endl; if (!valueRecord.Write(&recordSize, sizeof(unsigned long long int), offset)) { return false; } offset += sizeof(unsigned long long int); char* recordChars = new char[recordSize]; size_t recordOffset = 0; unsigned int noProjections(pdb->projections.size()), noFreqLabelPos(pdb->freqLabelPos.size()), noFreqMatrixPos(pdb->freqMatrixPos.size()), 
noSequences, noLabels, label, freqLabelPos; // store projections memcpy(recordChars + recordOffset, &noProjections, sizeof(unsigned int)); recordOffset += sizeof(unsigned int); for (unsigned int i = 0; i < noProjections; i++) { noSequences = pdb->projections[i].size(); memcpy(recordChars + recordOffset, &noSequences, sizeof(unsigned int)); recordOffset += sizeof(unsigned int); for (unsigned int j = 0; j < noSequences; j++) { noLabels = pdb->projections[i][j].size(); memcpy(recordChars + recordOffset, &noLabels, sizeof(unsigned int)); recordOffset += sizeof(unsigned int); for (unsigned int k = 0; k < noLabels; k++) { label = pdb->projections[i][j][k]; memcpy(recordChars + recordOffset, &label, sizeof(unsigned int)); recordOffset += sizeof(unsigned int); } } } // store smatrix memcpy(recordChars + recordOffset, &pdb->smatrix.size, sizeof(unsigned int)); recordOffset += sizeof(unsigned int); memcpy(recordChars + recordOffset, pdb->smatrix.values, pdb->smatrix.size * pdb->smatrix.size * sizeof(unsigned int)); recordOffset += pdb->smatrix.size * pdb->smatrix.size * sizeof(unsigned int); // for (unsigned int i = 0; i < pdb->smatrix.size; i++) { // for (unsigned int j = 0; j < pdb->smatrix.size; j++) { // matrixEntry = pdb->smatrix(i, j); // memcpy(recordChars + recordOffset, &matrixEntry, sizeof(unsigned int)); // recordOffset += sizeof(unsigned int); // } // } // store freqLabelPos memcpy(recordChars + recordOffset, &noFreqLabelPos, sizeof(unsigned int)); recordOffset += sizeof(unsigned int); for (unsigned int i = 0; i < noFreqLabelPos; i++) { freqLabelPos = pdb->freqLabelPos[i]; memcpy(recordChars + recordOffset, &freqLabelPos, sizeof(unsigned int)); recordOffset += sizeof(unsigned int); } // store freqMatrixPos memcpy(recordChars + recordOffset, &noFreqMatrixPos, sizeof(unsigned int)); recordOffset += sizeof(unsigned int); for (auto freqPos : pdb->freqMatrixPos) { memcpy(recordChars + recordOffset, &freqPos.first, sizeof(unsigned int)); recordOffset += sizeof(unsigned 
int); memcpy(recordChars + recordOffset, &freqPos.second, sizeof(unsigned int)); recordOffset += sizeof(unsigned int); } if (!valueRecord.Write(recordChars, recordSize, offset)) { return false; } offset += recordSize; delete[] recordChars; return true; } bool ProjectedDB::Open(SmiRecord& valueRecord, size_t& offset, const ListExpr typeInfo, Word& value) { // auto measureStart = high_resolution_clock::now(); ProjectedDB *pdb = new ProjectedDB(); // read minSupp if (!valueRecord.Read(&pdb->minSupp, sizeof(double), offset)) { return false; } offset += sizeof(double); pdb->agg = new RelAgg(); if (!RelAgg::readFromRecord(pdb->agg, valueRecord, offset)) { return false; } pdb->minSuppCnt = (unsigned int)std::ceil(pdb->minSupp * pdb->agg->noTuples); // open projections vector > projection; vector labelSeq; unsigned long long int recordSize; if (!valueRecord.Read(&recordSize, sizeof(unsigned long long int), offset)) { return false; } offset += sizeof(unsigned long long int); // cout << "size of record is " << recordSize << endl; char* recordChars = new char[recordSize]; if (!valueRecord.Read(recordChars, recordSize, offset)) { return false; } offset += recordSize; size_t recordOffset = 0; unsigned int noProjections, noSequences, noLabels, label, noFreqLabelPos, matrixSize, freqLabelPos, noFreqMatrixPos; memcpy(&noProjections, recordChars + recordOffset, sizeof(unsigned int)); recordOffset += sizeof(unsigned int); for (unsigned int i = 0; i < noProjections; i++) { memcpy(&noSequences, recordChars + recordOffset, sizeof(unsigned int)); recordOffset += sizeof(unsigned int); for (unsigned int j = 0; j < noSequences; j++) { memcpy(&noLabels, recordChars + recordOffset, sizeof(unsigned int)); recordOffset += sizeof(unsigned int); for (unsigned int k = 0; k < noLabels; k++) { memcpy(&label, recordChars + recordOffset, sizeof(unsigned int)); recordOffset += sizeof(unsigned int); labelSeq.push_back(label); } projection.push_back(labelSeq); labelSeq.clear(); } 
pdb->projections.push_back(projection); projection.clear(); } // read smatrix memcpy(&matrixSize, recordChars + recordOffset, sizeof(unsigned int)); recordOffset += sizeof(unsigned int); pdb->smatrix.init(matrixSize); memcpy(pdb->smatrix.values, recordChars + recordOffset, pdb->smatrix.size * pdb->smatrix.size * sizeof(unsigned int)); recordOffset += pdb->smatrix.size * pdb->smatrix.size * sizeof(unsigned int); // for (unsigned int i = 0; i < matrixSize; i++) { // for (unsigned int j = 0; j < matrixSize; j++) { // memcpy(&matrixEntry, recordChars + recordOffset, sizeof(unsigned int)); // recordOffset += sizeof(unsigned int); // pdb->smatrix.set(i, j, matrixEntry); // } // } // read freqLabelPos memcpy(&noFreqLabelPos, recordChars + recordOffset, sizeof(unsigned int)); recordOffset += sizeof(unsigned int); for (unsigned int i = 0; i < noFreqLabelPos; i++) { memcpy(&freqLabelPos, recordChars + recordOffset, sizeof(unsigned int)); recordOffset += sizeof(unsigned int); pdb->freqLabelPos.push_back(freqLabelPos); } value.setAddr(pdb); // measureStop = high_resolution_clock::now(); // ms = // (double)(duration_cast(measureStop - measureStart).count()); // read freqMatrixPos memcpy(&noFreqMatrixPos, recordChars + recordOffset, sizeof(unsigned int)); recordOffset += sizeof(unsigned int); NewPair freqPos; for (unsigned int i = 0; i < noFreqMatrixPos; i++) { memcpy(&freqPos.first, recordChars + recordOffset, sizeof(unsigned int)); recordOffset += sizeof(unsigned int); memcpy(&freqPos.second, recordChars + recordOffset, sizeof(unsigned int)); recordOffset += sizeof(unsigned int); pdb->freqMatrixPos.push_back(freqPos); } delete[] recordChars; return true; } void ProjectedDB::Close(const ListExpr typeInfo, Word& w) { ProjectedDB *pdb = (ProjectedDB*)w.addr; delete pdb; w.addr = 0; } Word ProjectedDB::Clone(const ListExpr typeInfo, const Word& w) { ProjectedDB *pdb = (ProjectedDB*)w.addr; Word res; res.addr = new ProjectedDB(*pdb); return res; } int ProjectedDB::SizeOfObj() { 
return sizeof(ProjectedDB); } bool ProjectedDB::TypeCheck(ListExpr type, ListExpr& errorInfo) { return nl->IsEqual(type, BasicType()); } /* Type constructor for secondo type ~projecteddb~ */ TypeConstructor projecteddbTC( ProjectedDB::BasicType(), ProjectedDB::Property, ProjectedDB::Out, ProjectedDB::In, 0, 0, ProjectedDB::Create, ProjectedDB::Delete, ProjectedDB::Open, ProjectedDB::Save, ProjectedDB::Close, ProjectedDB::Clone, 0, ProjectedDB::SizeOfObj, ProjectedDB::TypeCheck); GetPatternsLI::GetPatternsLI(Relation *r, const NewPair ap, double ms, int mina, int maxa, Geoid *g, const size_t mem) { tupleType = getTupleType(); agg.clear(); agg.scanRelation(r, ap, g); agg.filter(ms, mem); agg.derivePatterns(mina, maxa); } GetPatternsLI::~GetPatternsLI() { tupleType->DeleteIfAllowed(); } TupleType* GetPatternsLI::getTupleType() { SecondoCatalog* sc = SecondoSystem::GetCatalog(); ListExpr resultTupleType = nl->TwoElemList( nl->SymbolAtom(Tuple::BasicType()), nl->TwoElemList(nl->TwoElemList(nl->SymbolAtom("Pattern"), nl->SymbolAtom(FText::BasicType())), nl->TwoElemList(nl->SymbolAtom("Support"), nl->SymbolAtom(CcReal::BasicType())))); ListExpr numResultTupleType = sc->NumericType(resultTupleType); return new TupleType(numResultTupleType); } Tuple* GetPatternsLI::getNextResult(RelAgg& agg, TupleType *tt) { if (agg.results.empty()) { return 0; } Tuple *tuple = new Tuple(tt); NewPair result; result = agg.results.back(); agg.results.pop_back(); FText *pattern = new FText(true, result.first); tuple->PutAttribute(0, pattern); CcReal *support = new CcReal(true, result.second); tuple->PutAttribute(1, support); return tuple; } MineFPTreeLI::MineFPTreeLI(FPTree *t, int mina, int maxa) : tree(t), minNoAtoms(mina), maxNoAtoms(maxa) { tupleType = GetPatternsLI::getTupleType(); tree->retrievePatterns(minNoAtoms, maxNoAtoms); } MineFPTreeLI::~MineFPTreeLI() { // tree->agg->clearEntries(); delete tree->agg; tupleType->DeleteIfAllowed(); } Tuple* MineFPTreeLI::getNextResult() { return 
GetPatternsLI::getNextResult(*(tree->agg), tupleType); } PrefixSpanLI::PrefixSpanLI(ProjectedDB *db, int mina, int maxa) : pdb(db), minNoAtoms(mina), maxNoAtoms(maxa) { tupleType = GetPatternsLI::getTupleType(); pdb->retrievePatterns(minNoAtoms, maxNoAtoms); } PrefixSpanLI::~PrefixSpanLI() { delete pdb->agg; tupleType->DeleteIfAllowed(); } Tuple* PrefixSpanLI::getNextResult() { return GetPatternsLI::getNextResult(*(pdb->agg), tupleType); } /* Class ~AggEntry~, function ~sequentialJoin~, applied by operator ~spade~ keep periods of ~entry2~ if occurring later than ~entry1~'s */ void AggEntry::sequentialJoin(AggEntry& entry1, AggEntry& entry2) { if (entry1.occs.empty() || entry2.occs.empty()) { return; } unsigned int pos1(0), pos2(0); SecInterval iv1(true), iv2(true); iv1.start.SetType(instanttype); iv2.end.SetType(instanttype); Periods *per1(0), *per2(0); TupleId id1(0), id2(0); Rect rect(true); while (pos1 < entry1.occs.size() && pos2 < entry2.occs.size()) { id1 = get<0>(entry1.occs[pos1]); id2 = get<0>(entry2.occs[pos2]); if (id1 < id2) { pos1++; } else if (id1 > id2) { pos2++; } else { // tuple ids match per1 = get<1>(entry1.occs[pos1]); per2 = get<1>(entry2.occs[pos2]); if (per1->IsDefined() && per2->IsDefined()) { if (!per1->IsEmpty() && !per2->IsEmpty()) { per1->Get(0, iv1); per2->Get(per2->GetNoComponents() - 1, iv2); if (iv1.start < iv2.end) { Periods *per = new Periods(*per2); rect = get<2>(entry1.occs[pos1]).Union(get<2>(entry2.occs[pos2])); occsPos.push_back(occs.size()); occs.push_back(make_tuple(id1, per, rect)); noOccs++; // TODO: update duration } } } pos1++; pos2++; } } } /* Class ~RelAgg~, function ~combineFrom~, applied by operator ~spade~ */ void RelAgg::combineEntries(unsigned int endOfPrefix, RelAgg *ra, unsigned int label, unsigned int minSuppCnt) { AggEntry newEntry; // cout << "Join entries " << endOfPrefix << " and " << label << endl; newEntry.sequentialJoin(ra->entries[endOfPrefix], ra->entries[label]); if (newEntry.occs.size() >= 
minSuppCnt) { labelPos.insert(make_pair(ra->freqLabels[label], label)); entries[label] = newEntry; freqLabels[label] = ra->freqLabels[label]; // cout << "... added entry # " << label << " for " // << ra->freqLabels[label] << ", has " << newEntry.occs.size() // << " occs" << endl; } else { freqLabels[label] = ""; } } /* Class ~VerticalDB~, function ~mineVerticalDB~ */ void VerticalDB::mineVerticalDB(vector& prefix, string& patPrefix, RelAgg *ra, const unsigned int minNoAtoms, const unsigned int maxNoAtoms) { if (prefix.empty()) { return; } // cout << "mVDB called with prefix " << ra->print(prefix) // << ", " << ra->freqLabels.size() << " frequent labels and " // << ra->labelPos.size() << " label pos" << endl; string atom, patPrefixExt; set commonTupleIds; double supp = 0.0; RelAgg *newAgg = new RelAgg(ra); for (auto label : ra->labelPos) { bool contained = false; unsigned int prefixPos = 0; while (!contained && prefixPos < prefix.size()) { if (label.second == prefix[prefixPos]) { contained = true; } prefixPos++; } if (!contained) { newAgg->combineEntries(prefix[prefix.size() - 1], ra, label.second, minSuppCnt); // if (!newAgg->entries[label.second].occs.empty()) { // cout << "isCombFrequent completed for prefix " << ra->print(prefix) // << " and label " << label.second << "; " // << ra->freqLabels[label.second] << " | " // << newAgg->freqLabels[label.second] << " || " // << newAgg->entries[label.second].occs.size() << " occs" << endl; // } } } for (auto label : newAgg->labelPos) { newAgg->buildAtom(label.second, newAgg->entries[label.second], commonTupleIds, atom); supp = (double)newAgg->entries[label.second].occs.size() / newAgg->noTuples; patPrefixExt = patPrefix + atom; // cout << "RESULT: " << patPrefixExt << ", supp = " << supp << endl; if (minNoAtoms <= prefix.size() + 1) { agg->results.push_back(NewPair(patPrefixExt, supp)); } if (prefix.size() + 1 < maxNoAtoms) { prefix.push_back(label.second); mineVerticalDB(prefix, patPrefixExt, newAgg, minNoAtoms, 
maxNoAtoms); prefix.pop_back(); } } delete newAgg; } /* Class ~VerticalDB~, function ~retrievePatterns~ */ void VerticalDB::retrievePatterns(const unsigned int minNoAtoms, const unsigned int maxNoAtoms) { vector prefix; string pattern, atom; set commonTupleIds; double supp = 1.0; for (unsigned int l = 0; l < agg->freqLabels.size(); l++) { agg->buildAtom(l, agg->entries[l], commonTupleIds, atom); if (minNoAtoms == 1) { supp = double(agg->entries[l].occs.size()) / agg->noTuples; agg->results.push_back(NewPair(atom, supp)); } if (maxNoAtoms > 1) { prefix.push_back(l); atom += ""; mineVerticalDB(prefix, atom, agg, minNoAtoms, maxNoAtoms); prefix.pop_back(); } } std::sort(agg->results.begin(), agg->results.end(), comparePMResults()); } /* Class ~VerticalDB~, function ~construct~ */ void VerticalDB::construct() { cout << "frequent labels: " << agg->print(agg->freqLabels) << endl; } /* Class ~VerticalDB~, function ~initialize~ */ void VerticalDB::initialize(const double ms, RelAgg *ra) { minSupp = ms; agg = ra; minSuppCnt = (unsigned int)std::ceil(minSupp * ra->noTuples); } /* Class ~VerticalDB~, functions for Secondo data type */ ListExpr VerticalDB::Property() { return (nl->TwoElemList( nl->FourElemList( nl->StringAtom("Signature"), nl->StringAtom("Example Type List"), nl->StringAtom("List Rep"), nl->StringAtom("Example List")), nl->FourElemList ( nl->StringAtom("-> SIMPLE"), nl->StringAtom(VerticalDB::BasicType()), nl->StringAtom("no list representation"), nl->StringAtom("")))); } Word VerticalDB::In(const ListExpr typeInfo, const ListExpr instance, const int errorPos, ListExpr& errorInfo, bool& correct) { correct = false; return SetWord(Address(0)); } ListExpr VerticalDB::Out(ListExpr typeInfo, Word value) { VerticalDB *vdb = (VerticalDB*)value.addr; ListExpr noTuplesList = nl->TwoElemList(nl->SymbolAtom("noTuples"), nl->IntAtom(vdb->agg->noTuples)); ListExpr minSuppList = nl->TwoElemList(nl->SymbolAtom("minSupp"), nl->RealAtom(vdb->minSupp)); return 
nl->TwoElemList(noTuplesList, minSuppList); } Word VerticalDB::Create(const ListExpr typeInfo) { Word w; w.addr = (new VerticalDB()); return w; } void VerticalDB::Delete(const ListExpr typeInfo, Word& w) { VerticalDB *vdb = (VerticalDB*)w.addr; delete vdb; w.addr = 0; } bool VerticalDB::Save(SmiRecord& valueRecord, size_t& offset, const ListExpr typeInfo, Word& value) { VerticalDB *vdb = (VerticalDB*)value.addr; // store minSupp if (!valueRecord.Write(&vdb->minSupp, sizeof(double), offset)) { return false; } offset += sizeof(double); // store agg if (!RelAgg::saveToRecord(vdb->agg, valueRecord, offset)) { return false; } return true; } bool VerticalDB::Open(SmiRecord& valueRecord, size_t& offset, const ListExpr typeInfo, Word& value) { // auto measureStart = high_resolution_clock::now(); VerticalDB *vdb = new VerticalDB(); // read minSupp if (!valueRecord.Read(&vdb->minSupp, sizeof(double), offset)) { return false; } offset += sizeof(double); vdb->agg = new RelAgg(); if (!RelAgg::readFromRecord(vdb->agg, valueRecord, offset)) { return false; } vdb->minSuppCnt = (unsigned int)std::ceil(vdb->minSupp * vdb->agg->noTuples); value.setAddr(vdb); // auto measureStop = high_resolution_clock::now(); // double ms = // (double)(duration_cast(measureStop - measureStart).count()); // cout << "VDB OPEN finished after " << ms << " ms" << endl; return true; } void VerticalDB::Close(const ListExpr typeInfo, Word& w) { VerticalDB *vdb = (VerticalDB*)w.addr; delete vdb; w.addr = 0; } Word VerticalDB::Clone(const ListExpr typeInfo, const Word& w) { VerticalDB *vdb = (VerticalDB*)w.addr; Word res; res.addr = new VerticalDB(*vdb); return res; } int VerticalDB::SizeOfObj() { return sizeof(VerticalDB); } bool VerticalDB::TypeCheck(ListExpr type, ListExpr& errorInfo) { return nl->IsEqual(type, BasicType()); } /* Type constructor for secondo type ~projecteddb~ */ TypeConstructor verticaldbTC( VerticalDB::BasicType(), VerticalDB::Property, VerticalDB::Out, VerticalDB::In, 0, 0, 
VerticalDB::Create, VerticalDB::Delete, VerticalDB::Open, VerticalDB::Save, VerticalDB::Close, VerticalDB::Clone, 0, VerticalDB::SizeOfObj, VerticalDB::TypeCheck); SpadeLI::SpadeLI(VerticalDB *db, int mina, int maxa) : vdb(db), minNoAtoms(mina), maxNoAtoms(maxa) { tupleType = GetPatternsLI::getTupleType(); vdb->retrievePatterns(minNoAtoms, maxNoAtoms); } SpadeLI::~SpadeLI() { delete vdb->agg; tupleType->DeleteIfAllowed(); } Tuple* SpadeLI::getNextResult() { return GetPatternsLI::getNextResult(*(vdb->agg), tupleType); } /* Class ~SplSemTraj~, functions for Secondo data type used for splitter */ int tsPlaceCmp(const void *a, const void *b) { SplTSPlace *tsp1 = new ((void*)a)SplTSPlace, *tsp2 = new ((void*)b)SplTSPlace; if (tsp1->instDbl == tsp2->instDbl) { return 0; } return tsp1->instDbl < tsp2->instDbl ? -1 : 1; } ListExpr SplSemTraj::Property() { return nl->TwoElemList( nl->FourElemList( nl->StringAtom("Signature"), nl->StringAtom("Example Type List"), nl->StringAtom("List Rep"), nl->StringAtom("Example List")), nl->FourElemList ( nl->StringAtom("-> SIMPLE"), nl->StringAtom(SplSemTraj::BasicType()), nl->StringAtom("((t_1, l_1, c_1), ..., (t_n, l_n, c_n))"), nl->StringAtom("((2021-09-09-12:45, (7.45, 51.49), \"Stadion\"))"))); } // Word SplSemTraj::In(const ListExpr typeInfo, const ListExpr instance, // const int errorPos, ListExpr& errorInfo, bool& correct) { // correct = false; // return SetWord(Address(0)); // } bool SplSemTraj::ReadFrom(ListExpr LE, const ListExpr typeInfo) { clear(); if (listutils::isSymbolUndefined(LE)) { SetDefined(false); return true; } SetDefined(true); if (nl->IsEmpty(LE)) { return true; } if (nl->IsAtom(LE)) { return false; } ListExpr rest = LE; SplTSPlace tsp; while (!nl->IsEmpty(rest)) { ListExpr first = nl->First(rest); if (!tsp.fromList(first)) { return false; } append(tsp); rest = nl->Rest(rest); } sort(); return true; } ListExpr SplSemTraj::ToListExpr(ListExpr typeInfo) const { if (!IsDefined()) { return listutils::getUndefined(); 
} if (isEmpty()) { return nl->Empty(); } ListExpr resultList = nl->OneElemList(get(0).toListExpr()); ListExpr resultListTemp = resultList; for (int i = 1; i < size(); i++) { resultListTemp = nl->Append(resultListTemp, get(i).toListExpr()); } return resultList; } int SplSemTraj::Compare(const Attribute* arg) const { if (!IsDefined()) { return arg->IsDefined() ? -1 : 0; } if (!arg->IsDefined()) { return 1; } SplSemTraj* sst = (SplSemTraj*)arg; if (isEmpty()) { return sst->isEmpty() ? 0 : 1; } if (sst->isEmpty()) { return 1; } if (size() != sst->size()) { return size() < sst->size() ? -1 : 1; } for (int i = 0; i < size(); i++) { SplTSPlace ts1 = get(i); SplTSPlace ts2 = sst->get(i); if (ts1.instDbl != ts2.instDbl) { return ts1.instDbl < ts2.instDbl; } } return 0; } bool SplSemTraj::Adjacent(const Attribute* arg) const { return false; } Attribute* SplSemTraj::Clone() const { return new SplSemTraj(*this); } size_t SplSemTraj::HashValue() const { if (!IsDefined() || isEmpty()) { return 0; } return firstInst().GetAllMilliSeconds() * size() % 1024; } void SplSemTraj::CopyFrom(const Attribute* arg) { tsPlaces.copyFrom(((SplSemTraj*)arg)->getTSPlaces()); } bool SplSemTraj::CheckKind(ListExpr type, ListExpr& errorInfo) { return checkType(type); } size_t SplSemTraj::Sizeof() const { return sizeof(SplSemTraj); } void SplSemTraj::sort() { tsPlaces.Sort(tsPlaceCmp); } int SplSemTraj::find(const SplPlace& sp, const double tolerance, const Geoid *geoid) const { Point p1(true), p2(true, sp.x, sp.y); for (int i = 0; i < size(); i++) { SplTSPlace tsp = get(i); if (tsp.cat == sp.cat) { p1.Set(tsp.x, tsp.y); if (p1.Distance(p2, geoid) <= tolerance) { return i; } } } return -1; } set SplSemTraj::getPositions(string label) const { set result; for (int i = 0; i < size(); i++) { SplTSPlace tsp = get(i); string currentLabel(tsp.cat); if (currentLabel == label) { result.insert(i); } } return result; } void SplSemTraj::convertFromMPointMLabel(const MPoint& mp, const MLabel& ml, const double 
tolerance, const Geoid *geoid /* = 0 */) { clear(); DateTime inst(instanttype); UPoint up(true); IPoint ip(true); Point pt(true), lastpt(false); ILabel il(true); string cat, lastcat; for (int i = 0; i < mp.GetNoComponents(); i++) { mp.Get(i, up); inst = up.timeInterval.start; up.TemporalFunction(inst, pt, geoid, true); ml.AtInstant(inst, il); if (il.value.IsDefined()) { cat = il.value.GetLabel().substr(0, 48); if (i == 0 || cat != lastcat || pt.Distance(lastpt, geoid) > tolerance) { SplTSPlace tsPlace(inst, pt, cat); append(tsPlace); lastpt = pt; lastcat = cat; } } } } bool SplSemTraj::contains(const SplPlace& sp, const double deltaT, const double tolerance, const Geoid* geoid) const { Point p1(true), p2(true, sp.x, sp.y); for (int i = 0; i < size(); i++) { SplTSPlace tsp = get(i); p1.Set(tsp.x, tsp.y); if (tsp.cat == sp.cat && p1.Distance(p2, geoid) <= tolerance && tsp.instDbl < deltaT) { return true; } } return false; } SplSemTraj SplSemTraj::postfix(const int pos) const { assert(pos >= 0 && pos < size()); SplSemTraj result(1); DateTime t_m(0.0), t_i(0.0); t_m.ReadFrom(get(pos).instDbl); t_m.SetType(durationtype); for (int i = pos + 1; i < size(); i++) { SplTSPlace tsp = get(i); t_i.ReadFrom(tsp.instDbl); t_i -= t_m; tsp.instDbl = t_i.ToDouble(); result.append(tsp); } return result; } void SplSemTraj::addPostfixes(SplPlace sp, const double eps, const Geoid* geoid, vector& result) const { set labelPos = getPositions(sp.cat); for (auto it : labelPos) { if (it >= 1) { SplSemTraj pf = postfix(it); if (!pf.isEmpty() && !pf.first().almostEqual(sp, eps, geoid)) { result.push_back(pf); } } } } /* Implementation of class ~Splitter~, used for operator ~splitter~ */ Splitter::Splitter(Word& s, const double sm, DateTime& mtt, const int mna, const double e, Geoid* g, const int attrNo) : pos(0), deltaT(mtt), maxNoAtoms(mna), eps(e), geoid(g) { tupleType = getTupleType(); initialProjection(s, sm, attrNo); } Splitter::~Splitter() { tupleType->DeleteIfAllowed(); } TupleType* 
Splitter::getTupleType() { SecondoCatalog* sc = SecondoSystem::GetCatalog(); ListExpr resultTupleType = nl->TwoElemList( nl->SymbolAtom(Tuple::BasicType()), nl->TwoElemList(nl->TwoElemList(nl->SymbolAtom("Pattern"), nl->SymbolAtom(FText::BasicType())), nl->TwoElemList(nl->SymbolAtom("Support"), nl->SymbolAtom(CcReal::BasicType())))); ListExpr numResultTupleType = sc->NumericType(resultTupleType); return new TupleType(numResultTupleType); } void Splitter::initialProjection(Word& s, const double sm, const int attrNo) { map, SplPlaceSorter> freqItems(SplPlaceSorter(eps, geoid)); computeFrequentItems(s, attrNo, sm, freqItems); // cout << freqItemsToString(freqItems) << endl; if (maxNoAtoms == 1) { for (auto it : freqItems) { SplSemTraj sst(1); SplTSPlace tsp(DateTime(0.0), Point(true, it.first.x, it.first.y), it.first.cat); sst.append(tsp); addSnippets(sst, it.second.size()); } } else { for (auto it : freqItems) { vector postfixes; for (auto i : it.second) { source[i].addPostfixes(it.first, eps, geoid, postfixes); } SplSemTraj sst(1); SplTSPlace tsp(DateTime(0.0), Point(true, it.first.x, it.first.y), it.first.cat); sst.append(tsp); addSnippets(sst, it.second.size()); prefixSpan(sst, postfixes); } } } void Splitter::prefixSpan(SplSemTraj& prefix, vector pf) { if (prefix.size() >= maxNoAtoms) { return; } map, SplPlaceSorter> localFreqItems(SplPlaceSorter(eps, geoid)); // cout << "Call prefixSpan for sst " << prefix.toString() << endl; computeLocalFreqItems(prefix.last(), pf, localFreqItems); // cout << freqItemsToString(localFreqItems) << endl; map, SplPlaceSorter> newPostfixes(SplPlaceSorter(eps, geoid)); for (auto it : localFreqItems) { SplSemTraj p(prefix); SplTSPlace tsp(DateTime(0.0), Point(true, it.first.x, it.first.y), it.first.cat); p.append(tsp); vector postfixes; for (auto pos : it.second) { pf[pos].addPostfixes(it.first, eps, geoid, postfixes); } addSnippets(p, it.second.size()); // cout << postfixesToString(p, postfixes) << endl; prefixSpan(p, postfixes); } } 
string Splitter::postfixesToString(SplSemTraj pref, vector& pf) { stringstream str; str << "Postfixes of " << pref.toString() << " : " << endl; for (auto sst : pf) { str << " " << sst.toString() << endl; } return str.str(); } string Splitter::freqItemsToString(map, SplPlaceSorter>& freqItems) { stringstream str; if (freqItems.empty()) { str << "No frequent items" << endl; return str.str(); } for (auto it : freqItems) { str << "Freq item " << it.first.toString() << " : "; for (auto it2 : it.second) { str << it2 << ", "; } str << endl; } str << endl; return str.str(); } void Splitter::computeFrequentItems(Word& s, const int attrNo, const double sm, map, SplPlaceSorter>& freqItems) { // collect all SplPlaces with occurrences map, SplPlaceSorter> allItems(SplPlaceSorter(eps, geoid)); Stream stream(s); stream.open(); Tuple* tuple = stream.request(); int counter = 0; while (tuple) { SplSemTraj sst(*(SplSemTraj*)(tuple->GetAttribute(attrNo))); for (int i = 0; i < sst.size(); i++) { SplPlace sp(sst.get(i)); allItems[sp].insert(counter); } source.push_back(sst); tuple->DeleteIfAllowed(); tuple = stream.request(); counter++; } stream.close(); // retrieve frequent labels freqmin = ceil(counter * sm); for (auto it : allItems) { if (it.second.size() >= freqmin) { freqItems.insert(it); } } } void Splitter::computeLocalFreqItems(SplTSPlace tsp, vector pf, map, SplPlaceSorter >& freqItems) { map, SplPlaceSorter> allItems(SplPlaceSorter(eps, geoid)); int counter = 0; for (auto it : pf) { for (int i = 0; i < it.size(); i++) { SplTSPlace spl(it.get(i)); if (i > 0 || !spl.almostEqual(tsp, eps, geoid)) { // first place != spl if (spl.instDbl < deltaT.ToDouble()) { // allItems[spl].insert(counter); } } } counter++; } for (auto it : allItems) { if (it.second.size() >= freqmin) { freqItems.insert(it); } } } void Splitter::addSnippets(SplSemTraj sst, const int freq) { if (sst.isEmpty()) { return; } result.push_back(make_pair(sst, (double)freq / source.size())); } Tuple* Splitter::next() { 
assert(pos <= result.size()); if (pos == result.size()) { return 0; } Tuple *tuple = new Tuple(tupleType); FText* pattern = new FText(true, result[pos].first.toString()); tuple->PutAttribute(0, pattern); CcReal* ccsupp = new CcReal(true, result[pos].second); tuple->PutAttribute(1, ccsupp); pos++; return tuple; } }