Files
secondo/Algebras/SymbolicTrajectory/PatternMining.cpp
2026-01-23 17:03:45 +08:00

3411 lines
111 KiB
C++

/*
----
This file is part of SECONDO.
Copyright (C) 2004, University in Hagen, Department of Computer Science,
Database Systems for New Applications.
SECONDO is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
SECONDO is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with SECONDO; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
----
//paragraph [1] Title: [{\Large \bf \begin {center}] [\end {center}}]
//[TOC] [\tableofcontents]
Started November 2019, Fabio Vald\'{e}s
*/
#include "PatternMining.h"
#include <chrono>
using namespace std;
using namespace datetime;
using namespace temporalalgebra;
using namespace std::chrono;
namespace stj {
// Default constructor: an entry without occurrences, with an occurrence
// counter of 0 and a duration of 0.
AggEntry::AggEntry() {
  noOccs = 0;
  occsPos.clear();
  occs.clear();
  duration.SetType(datetime::durationtype);
  duration.ReadFrom((int64_t)0);
}
// Creates an entry with exactly one occurrence: tuple ~id~ with the time
// interval ~iv~ and the bounding box ~rect~. ~occsPos~ gets noTuples + 1 slots
// (all UINT_MAX except the one for ~id~), so it is indexed directly by tuple id.
AggEntry::AggEntry(const TupleId id, const temporalalgebra::SecInterval& iv,
                   Rect& rect, const unsigned int noTuples) {
  noOccs = 0;
  duration.SetType(datetime::durationtype);
  duration.ReadFrom((int64_t)0); // durations are computed at the end
  occs.clear();
  occsPos.clear();
  occsPos.resize(noTuples + 1, UINT_MAX);
  Periods *firstPeriods = new Periods(1);
  firstPeriods->Add(iv);
  // position of the new occurrence inside ~occs~ (the vector is still empty)
  occsPos[id] = occs.size();
  occs.push_back(make_tuple(id, firstPeriods, rect));
}
// Releases the periods of all occurrences and resets the entry to the state
// produced by the default constructor.
void AggEntry::clear() {
  for (size_t pos = 0; pos < occs.size(); ++pos) {
    get<1>(occs[pos])->DeleteIfAllowed();
  }
  occs.clear();
  occsPos.clear();
  noOccs = 0;
  duration.SetType(datetime::durationtype);
  duration.ReadFrom((int64_t)0);
}
void AggEntry::deletePeriods() {
for (auto it : occs) {
(get<1>(it))->DeleteIfAllowed();
}
}
// Nested list representation of the entry:
//   (noOccs duration ((tid periods rect) ...))
// The occurrence list is empty if the entry has no occurrences.
ListExpr AggEntry::toListExpr() {
  ListExpr allOccs(nl->Empty()), lastOcc(nl->Empty());
  Word periodsWord, boxWord;
  for (unsigned int pos = 0; pos < occs.size(); pos++) {
    TupleId tid = get<0>(occs[pos]);
    periodsWord.addr = get<1>(occs[pos]);
    boxWord.addr = &(get<2>(occs[pos]));
    ListExpr occExpr = nl->ThreeElemList(nl->IntAtom(tid),
                      OutRange<Instant, OutDateTime>(nl->Empty(), periodsWord),
                      OutRectangle<2>(nl->Empty(), boxWord));
    if (pos == 0) { // first element starts the list
      allOccs = nl->OneElemList(occExpr);
      lastOcc = allOccs;
    }
    else { // later elements are appended behind the last one
      lastOcc = nl->Append(lastOcc, occExpr);
    }
  }
  return nl->ThreeElemList(nl->IntAtom(noOccs), duration.ToListExpr(false),
                           allOccs);
}
unsigned int AggEntry::getNoOccs(const TupleId& id) const {
return get<1>(occs[occsPos[id]])->GetNoComponents();
}
void AggEntry::computeCommonTimeInterval(const set<TupleId>& commonTupleIds,
SecInterval& iv) {
iv.start.SetType(instanttype);
iv.end.SetType(instanttype);
iv.start.ToMaximum();
iv.end.ToMinimum();
Instant first(1.0), last(1.0);
if (commonTupleIds.empty()) { // use all occurrences
for (auto it : occs) {
if (get<1>(it)->IsDefined()) {
if (!get<1>(it)->IsEmpty()) {
get<1>(it)->Minimum(first);
get<1>(it)->Maximum(last);
if (first < iv.start) {
iv.start = first;
}
if (last > iv.end) {
iv.end = last;
}
}
}
}
}
else { // use only entries occurring in the set
for (auto it : commonTupleIds) {
if (occsPos[it] != UINT_MAX) {
Periods* per = get<1>(occs[occsPos[it]]);
if (per->IsDefined()) {
if (!per->IsEmpty()) {
per->Minimum(first);
per->Maximum(last);
if (first < iv.start) {
iv.start = first;
}
if (last > iv.end) {
iv.end = last;
}
}
}
}
}
}
if (iv.start.IsMinimum() || iv.end.IsMaximum()) {
iv.SetDefined(false);
}
}
void AggEntry::computeCommonRect(const SecInterval& iv,
const set<TupleId>& commonTupleIds, Geoid *geoid, Rect &rect) {
if (commonTupleIds.empty()) { // use all occurrences (for 1-patterns)
if (occs.empty()) {
return;
}
if (!get<2>(occs[0]).IsDefined()) {
rect.SetDefined(false);
return;
}
Rect tempRect = get<2>(occs[0]);
for (unsigned int i = 1; i < occs.size(); i++) {
if (!get<2>(occs[i]).IsDefined()) {
rect.SetDefined(false);
return;
}
tempRect = get<2>(occs[i]).Union(tempRect);
}
rect = tempRect;
}
else {
set<TupleId>::iterator it = commonTupleIds.begin();
if (!get<2>(occs[occsPos[*it]]).IsDefined()) {
rect.SetDefined(false);
return;
}
Rect tempRect = get<2>(occs[occsPos[*it]]);
while (it != commonTupleIds.end()) {
if (!get<2>(occs[occsPos[*it]]).IsDefined()) {
rect.SetDefined(false);
return;
}
tempRect = get<2>(occs[occsPos[*it]]).Union(tempRect);
it++;
}
rect = tempRect;
}
}
void AggEntry::computeSemanticTimeSpec(const set<TupleId>& commonTupleIds,
string& semanticTimeSpec) const {
semanticTimeSpec.clear();
int month = 1; // {1, ..., 12}
int weekday = 0; // {0, ..., 6}
int daytime = 0; // {0, ..., 3}
Instant first(1.0), last(1.0);
Periods *per = 0;
for (set<TupleId>::const_iterator it = commonTupleIds.begin();
it != commonTupleIds.end(); it++) {
per = get<1>(occs[occsPos[*it]]);
if (!per->IsDefined()) {
semanticTimeSpec = "";
return;
}
if (per->IsEmpty()) {
semanticTimeSpec = "";
return;
}
per->Minimum(first);
per->Maximum(last);
if (it == commonTupleIds.begin() && first.GetMonth() == last.GetMonth()) {
month = first.GetMonth();
}
else if (month != first.GetMonth()
|| first.GetMonth() != last.GetMonth()) {
month = -1;
}
if (it == commonTupleIds.begin()
&& first.GetWeekday() == last.GetWeekday()) {
weekday = first.GetWeekday();
}
else if (weekday != first.GetWeekday()
|| first.GetWeekday() != last.GetWeekday()) {
weekday = -1;
}
if (it == commonTupleIds.begin()
&& Tools::getDaytime(first.GetHour()) == Tools::getDaytime(last.GetHour())) {
daytime = Tools::getDaytime(first.GetHour());
}
else if (daytime != Tools::getDaytime(first.GetHour())
|| Tools::getDaytime(first.GetMonth()) !=
Tools::getDaytime(last.GetMonth())) {
daytime = -1;
}
if (month == -1 && weekday == -1 && daytime == -1) {
semanticTimeSpec = "";
return;
}
}
if (month > -1) {
semanticTimeSpec = Tools::getMonthStr(month - 1);
}
if (weekday > -1) {
semanticTimeSpec += (semanticTimeSpec.empty() ? "" : ", ")
+ Tools::getWeekdayStr(weekday);
}
if (daytime > -1) {
semanticTimeSpec += (semanticTimeSpec.empty() ? "" : ", ")
+ Tools::getDaytimeStr(daytime);
}
}
std::string AggEntry::print(const TupleId& id /* = 0 */) const {
std::stringstream result;
if (id == 0) { // print everything
for (auto it : occs) {
result << " TID " << get<0>(it) << ": " << get<1>(it)->GetNoComponents()
<< " occs: " << *(get<1>(it)) << get<2>(it) << endl;
}
}
else {
if (occsPos[id] == UINT_MAX) { // id not found
result << " TID " << id << " not found" << endl;
}
else {
result << " TID " << id << ": "
<< get<1>(occs[occsPos[id]])->GetNoComponents() << " occs: "
<< *(get<1>(occs[occsPos[id]])) << get<2>(occs[occsPos[id]])
<< endl;
}
}
return result.str();
}
// Formats a rectangle as "[minX maxX minY maxY]"; an undefined rectangle is
// represented by "_".
std::string AggEntry::print(const Rect& rect) const {
  if (!rect.IsDefined()) {
    return "_";
  }
  std::stringstream out;
  out << "[" << rect.MinD(0) << " " << rect.MaxD(0);
  out << " " << rect.MinD(1) << " " << rect.MaxD(1) << "]";
  return out.str();
}
/*
Class ~RelAgg~, Constructors
*/
// Default constructor: all counters and thresholds are 0, no geoid and no
// relation are attached.
RelAgg::RelAgg()
  : noTuples(0),
    minNoAtoms(0),
    maxNoAtoms(0),
    minSupp(0.0),
    geoid(0),
    rel(0) {
}
// Creates an instance with the parameters (tuple count, atom limits, minimum
// support, geoid, relation) of ~ra~. The contents are NOT copied: ~entries~ and
// ~freqLabels~ only receive the same number of (default) elements, and all
// other containers start empty.
RelAgg::RelAgg(RelAgg *ra)
  : noTuples(ra->noTuples),
    minNoAtoms(ra->minNoAtoms),
    maxNoAtoms(ra->maxNoAtoms),
    minSupp(ra->minSupp),
    geoid(ra->geoid),
    rel(ra->rel) {
  AggEntry emptyEntry;
  entries.resize(ra->entries.size(), emptyEntry);
  freqLabels.resize(ra->freqLabels.size());
  nonfreqSets.clear();
  freqSets.clear();
  checkedSeqs.clear();
  labelPos.clear();
}
/*
Class ~RelAgg~, Function ~clear~
Deletes the periods values
*/
// Clears every entry of ~entriesMap~ (releasing its periods values).
// The map is traversed by reference: iterating by value cleared only a copy
// of each entry, so the stored entries kept pointers to released periods.
void RelAgg::clear() {
  for (auto& labelAndEntry : entriesMap) {
    labelAndEntry.second.clear();
  }
}
void RelAgg::clearEntries() {
for (auto it : entries) {
it.deletePeriods();
}
for (auto it : checkedSeqs) {
for (auto it2 : it) {
it2.clear();
}
it.clear();
}
checkedSeqs.clear();
for (auto it : freqSets) {
for (auto it2 : it) {
it2.clear();
}
it.clear();
}
freqSets.clear();
for (auto it : nonfreqSets) {
for (auto it2 : it) {
it2.clear();
}
it.clear();
}
nonfreqSets.clear();
}
// Nested list of all frequent labels with their entries, in the order of
// ~labelPos~: ((label entry) ...). If ~entries~ or ~labelPos~ is empty, the
// symbol atom "Empty Container" is returned instead.
ListExpr RelAgg::entriesToListExpr() {
  if (labelPos.empty() || entries.empty()) {
    return nl->SymbolAtom("Empty Container");
  }
  ListExpr resultList, lastElem;
  bool isFirst = true;
  for (auto it = labelPos.begin(); it != labelPos.end(); it++) {
    ListExpr labelWithEntry = nl->TwoElemList(nl->SymbolAtom(it->first),
                                           entries[it->second].toListExpr());
    if (isFirst) {
      resultList = nl->OneElemList(labelWithEntry);
      lastElem = resultList;
      isFirst = false;
    }
    else {
      lastElem = nl->Append(lastElem, labelWithEntry);
    }
  }
  return resultList;
}
// Serializes ~agg~ into ~valueRecord~, starting at ~offset~; ~offset~ is
// advanced past everything that was written. Returns false as soon as a write
// fails. Layout:
//   noTuples (unsigned int), entriesSize (unsigned long long),
//   noAggEntries (unsigned int), entries block, freqLabelsSize
//   (unsigned long long), freqLabels block
// Entries block, per entry: noOccs, duration (double), number of occurrences,
//   per occurrence: tid, noComponents, noComponents * (start, lc, end, rc),
//   minD(0), minD(1), maxD(0), maxD(1), isdefined; then all ~occsPos~ values.
// FreqLabels block, per label: length (unsigned int), characters incl. '\0'.
// Compared to the former version, the temporary buffers are released on the
// failure paths as well, and no variable-length array is used.
bool RelAgg::saveToRecord(RelAgg *agg, SmiRecord& valueRecord, size_t& offset) {
  unsigned int noOccs, tid, labelLength, noComponents;
  double durD, start, end;
  SecInterval iv(true);
  if (!valueRecord.Write(&agg->noTuples, sizeof(unsigned int), offset)) {
    return false;
  }
  offset += sizeof(unsigned int);
  unsigned long long int entriesSize = agg->computeEntriesSize();
  cout << "size of entries is " << entriesSize << endl;
  if (!valueRecord.Write(&entriesSize, sizeof(unsigned long long int), offset)){
    return false;
  }
  offset += sizeof(unsigned long long int);
  unsigned int noAggEntries = agg->entries.size();
  if (!valueRecord.Write(&noAggEntries, sizeof(unsigned int), offset)) {
    return false;
  }
  offset += sizeof(unsigned int);
  // assemble all entries in one buffer, then write it with a single call
  char* entriesChars = new char[entriesSize];
  size_t offsetEntries = 0;
  auto putEntryBytes = [&](const void *src, const size_t len) {
    memcpy(entriesChars + offsetEntries, src, len);
    offsetEntries += len;
  };
  for (unsigned int i = 0; i < noAggEntries; i++) {
    AggEntry& entry = agg->entries[i];
    putEntryBytes(&entry.noOccs, sizeof(unsigned int));
    durD = entry.duration.ToDouble();
    putEntryBytes(&durD, sizeof(double));
    noOccs = entry.occs.size();
    putEntryBytes(&noOccs, sizeof(unsigned int));
    for (const auto& occ : entry.occs) {
      tid = get<0>(occ);
      putEntryBytes(&tid, sizeof(unsigned int));
      noComponents = get<1>(occ)->GetNoComponents();
      putEntryBytes(&noComponents, sizeof(unsigned int));
      for (unsigned int j = 0; j < noComponents; j++) {
        get<1>(occ)->Get(j, iv);
        start = iv.start.ToDouble();
        end = iv.end.ToDouble();
        putEntryBytes(&start, sizeof(double));
        putEntryBytes(&iv.lc, sizeof(bool));
        putEntryBytes(&end, sizeof(double));
        putEntryBytes(&iv.rc, sizeof(bool));
      }
      double coords[] = {get<2>(occ).MinD(0), get<2>(occ).MinD(1),
                         get<2>(occ).MaxD(0), get<2>(occ).MaxD(1)};
      for (int c = 0; c < 4; c++) {
        putEntryBytes(&coords[c], sizeof(double));
      }
      bool isdefined = get<2>(occ).IsDefined();
      putEntryBytes(&isdefined, sizeof(bool));
    }
    for (const auto& occPos : entry.occsPos) {
      tid = occPos;
      putEntryBytes(&tid, sizeof(unsigned int));
    }
  }
  const bool entriesWritten =
                        valueRecord.Write(entriesChars, entriesSize, offset);
  delete[] entriesChars; // release before a possible early return
  if (!entriesWritten) {
    return false;
  }
  offset += entriesSize;
  unsigned long long int freqLabelsSize = agg->computeFreqLabelsSize();
  cout << "size of freqLabels is " << freqLabelsSize << endl;
  if (!valueRecord.Write(&freqLabelsSize, sizeof(unsigned long long int),
                         offset)) {
    return false;
  }
  offset += sizeof(unsigned long long int);
  char* freqLabelsChars = new char[freqLabelsSize];
  size_t offsetFreqLabels = 0;
  // one label per entry is written (~freqLabels~ is indexed like ~entries~)
  for (unsigned int i = 0; i < noAggEntries; i++) {
    const string& label = agg->freqLabels[i];
    labelLength = label.length();
    memcpy(freqLabelsChars + offsetFreqLabels, &labelLength,
           sizeof(unsigned int));
    offsetFreqLabels += sizeof(unsigned int);
    // copy the characters including the terminating '\0'
    memcpy(freqLabelsChars + offsetFreqLabels, label.c_str(), labelLength + 1);
    offsetFreqLabels += labelLength + 1;
  }
  const bool labelsWritten =
                  valueRecord.Write(freqLabelsChars, freqLabelsSize, offset);
  delete[] freqLabelsChars; // release before a possible early return
  if (!labelsWritten) {
    return false;
  }
  offset += freqLabelsSize;
  return true;
}
// Restores ~agg~ from ~valueRecord~, starting at ~offset~; ~offset~ is
// advanced past everything that was read. The layout read here mirrors the
// reads below: noTuples, entriesSize, noAggEntries, entries block,
// freqLabelsSize, freqLabels block. Entries and labels are appended to
// ~agg->entries~, ~agg->freqLabels~, and ~agg->labelPos~.
// Returns false as soon as a read fails.
// Compared to the former version, the temporary buffers are released on the
// failure paths, the tuple id is read into a 4-byte variable first (so that
// a wider ~TupleId~ is completely initialized), the rectangle coordinates are
// kept on the stack, and no variable-length array is used.
bool RelAgg::readFromRecord(RelAgg *agg, SmiRecord& valueRecord,
                            size_t& offset) {
  unsigned int noAggEntries, noOccs, noComponents, occPos, labelLength, tidRaw;
  double durD, start, end;
  Periods *per;
  SecInterval iv(true);
  iv.start.SetType(instanttype);
  iv.end.SetType(instanttype);
  // read ~noTuples~
  if (!valueRecord.Read(&agg->noTuples, sizeof(unsigned int), offset)) {
    return false;
  }
  offset += sizeof(unsigned int);
  unsigned long long int entriesSize;
  if (!valueRecord.Read(&entriesSize, sizeof(unsigned long long int), offset)) {
    return false;
  }
  offset += sizeof(unsigned long long int);
  // read ~entries~
  if (!valueRecord.Read(&noAggEntries, sizeof(unsigned int), offset)) {
    return false;
  }
  offset += sizeof(unsigned int);
  char* entriesChars = new char[entriesSize];
  if (!valueRecord.Read(entriesChars, entriesSize, offset)) {
    delete[] entriesChars;
    return false;
  }
  offset += entriesSize;
  size_t offsetEntries = 0;
  // copies ~len~ bytes from the current buffer position and advances it
  auto getEntryBytes = [&](void *dest, const size_t len) {
    memcpy(dest, entriesChars + offsetEntries, len);
    offsetEntries += len;
  };
  for (unsigned int i = 0; i < noAggEntries; i++) {
    AggEntry entry;
    getEntryBytes(&entry.noOccs, sizeof(unsigned int));
    getEntryBytes(&durD, sizeof(double));
    entry.duration.ReadFrom(durD);
    getEntryBytes(&noOccs, sizeof(unsigned int));
    for (unsigned int j = 0; j < noOccs; j++) {
      getEntryBytes(&tidRaw, sizeof(unsigned int));
      TupleId tid = tidRaw;
      getEntryBytes(&noComponents, sizeof(unsigned int));
      per = new Periods(true);
      for (unsigned int k = 0; k < noComponents; k++) {
        getEntryBytes(&start, sizeof(double));
        iv.start.ReadFrom(start);
        getEntryBytes(&iv.lc, sizeof(bool));
        getEntryBytes(&end, sizeof(double));
        iv.end.ReadFrom(end);
        getEntryBytes(&iv.rc, sizeof(bool));
        per->MergeAdd(iv);
      }
      // stored order: min(0), min(1), max(0), max(1), isdefined
      double minCoords[2], maxCoords[2];
      for (int c = 0; c < 2; c++) {
        getEntryBytes(&minCoords[c], sizeof(double));
      }
      for (int c = 0; c < 2; c++) {
        getEntryBytes(&maxCoords[c], sizeof(double));
      }
      bool isdefined;
      getEntryBytes(&isdefined, sizeof(bool));
      Rect rect(isdefined, minCoords, maxCoords);
      entry.occs.push_back(make_tuple(tid, per, rect));
    }
    // one position per tuple id in {0, ..., noTuples}
    for (unsigned int j = 0; j <= agg->noTuples; j++) {
      getEntryBytes(&occPos, sizeof(unsigned int));
      entry.occsPos.push_back(occPos);
    }
    agg->entries.push_back(entry);
  }
  delete[] entriesChars;
  // read ~freqLabels~
  unsigned long long int freqLabelsSize = 0;
  if (!valueRecord.Read(&freqLabelsSize, sizeof(unsigned long long int),
                        offset)) {
    return false;
  }
  offset += sizeof(unsigned long long int);
  char* freqLabelsChars = new char[freqLabelsSize];
  size_t offsetFreqLabels = 0;
  if (!valueRecord.Read(freqLabelsChars, freqLabelsSize, offset)) {
    delete[] freqLabelsChars;
    return false;
  }
  offset += freqLabelsSize;
  for (unsigned int i = 0; i < noAggEntries; i++) {
    memcpy(&labelLength, freqLabelsChars + offsetFreqLabels,
           sizeof(unsigned int));
    offsetFreqLabels += sizeof(unsigned int);
    // the label was stored with its terminating '\0'
    string label(freqLabelsChars + offsetFreqLabels);
    offsetFreqLabels += labelLength + 1;
    agg->labelPos.insert(make_pair(label, agg->freqLabels.size()));
    agg->freqLabels.push_back(label);
  }
  delete[] freqLabelsChars;
  return true;
}
/*
Class ~RelAgg~, function ~getLabelSeqFromMLabel~
*/
// Translates the units of ~ml~ into the positions of their labels (according
// to ~labelPos~). Labels that are not contained in ~labelPos~ are skipped, and
// a label equal to the most recently accepted one is not added again.
void RelAgg::getLabelSeqFromMLabel(MLabel *ml, vector<unsigned int>& result) {
  result.clear();
  string current, previous("undefined");
  for (int unit = 0; unit < ml->GetNoComponents(); unit++) {
    ml->GetValue(unit, current);
    auto found = labelPos.find(current);
    if (found == labelPos.end() || current == previous) {
      continue; // consider only freq lbs, no immediate repetitions
    }
    result.push_back(found->second);
    previous = current;
  }
}
/*
Class ~RelAgg~, Function ~insertLabelAndBbox~
Insert new labels into map structure, update structure for existing labels
*/
// Registers one occurrence of ~label~ in tuple ~id~ during the time interval
// ~iv~ with the bounding box ~rect~ inside ~entriesMap~. The label "undefined"
// is ignored. Three cases: unknown label (new entry), known label but new
// tuple id (new occurrence), known label and tuple id (periods are extended
// and, if ~rect~ is defined, the stored box is united with it). Finally, the
// occurrence counter and the total duration of the label are updated.
void RelAgg::insertLabelAndBbox(const std::string& label, const TupleId& id,
                   const temporalalgebra::SecInterval& iv, Rect& rect) {
  if (label == "undefined") {
    return;
  }
  auto found = entriesMap.find(label);
  if (found == entriesMap.end()) { // new label
    AggEntry fresh(id, iv, rect, rel->GetNoTuples());
    entriesMap.insert(make_pair(label, fresh));
  }
  else { // label already present
    AggEntry& known = found->second;
    if (known.occsPos[id] == UINT_MAX) { // new id for label
      Periods *per = new Periods(1);
      per->Add(iv);
      known.occsPos[id] = known.occs.size();
      known.occs.push_back(make_tuple(id, per, rect));
    }
    else { // id already present for label
      auto& occ = known.occs[known.occsPos[id]];
      get<1>(occ)->MergeAdd(iv);
      if (rect.IsDefined()) {
        get<2>(occ) = get<2>(occ).Union(rect);
      }
    }
  }
  AggEntry& target = entriesMap[label];
  target.noOccs++;
  target.duration += iv.end - iv.start;
}
/*
Class ~RelAgg~, Function ~scanRelation~
Scan relation, call ~insertLabelAndBbox~ for all labels of the mlabel attribute
*/
// Stores the relation, the attribute positions (first: mlabel, second: mpoint)
// and the geoid, then scans all tuples. For every unit of a tuple's mlabel,
// the mpoint is restricted to the unit's time interval, the spatial bounding
// box of that part is computed, and label, tuple id, interval, and box are
// passed to ~insertLabelAndBbox~.
void RelAgg::scanRelation(Relation *r, const NewPair<int, int> ap, Geoid *g) {
  rel = r;
  attrPos = ap;
  geoid = g;
  string unitLabel;
  SecInterval unitIv(true);
  noTuples = rel->GetNoTuples();
  GenericRelationIterator* scan = rel->MakeScan();
  MPoint mpRestricted(true);
  Periods unitPeriods(true);
  Rect bbox(true);
  for (Tuple *t = scan->GetNextTuple(); t != 0; t = scan->GetNextTuple()) {
    MLabel *ml = (MLabel*)(t->GetAttribute(attrPos.first));
    MPoint *mp = (MPoint*)(t->GetAttribute(attrPos.second));
    for (int unit = 0; unit < ml->GetNoComponents(); unit++) {
      ml->GetValue(unit, unitLabel);
      ml->GetInterval(unit, unitIv);
      // periods value holding exactly the current unit's interval
      unitPeriods.Add(unitIv);
      mp->AtPeriods(unitPeriods, mpRestricted);
      bbox = mpRestricted.BoundingBoxSpatial();
      unitPeriods.Clear();
      insertLabelAndBbox(unitLabel, t->GetTupleId(), unitIv, bbox);
    }
    t->DeleteIfAllowed();
  }
  delete scan;
}
/*
Class ~RelAgg~, Function ~filter~
Filter contents; keep only labels with supp >= minSupp
*/
void RelAgg::filter(const double ms, const size_t memSize) {
minSupp = ms;
double supp = 1.0;
// scan ~entriesMap~; push entries for frequent labels into ~entries~ and
// store every label and its entry's position inside ~entries~ in ~inv~
for (auto it : entriesMap) {
supp = double(it.second.occs.size()) / noTuples;
if (supp >= minSupp) {
// cout << "INSERTED: \"" << it.first << "\", POS " << entries.size()
// << " " << it.second.print() << endl;
entries.push_back(it.second);
labelPos.insert(make_pair(it.first, freqLabels.size()));
freqLabels.push_back(it.first);
}
else {
it.second.deletePeriods();
}
}
// for (unsigned int i = 0; i < freqLabels.size(); i++) {
// cout << "<" << i << " : " << freqLabels[i] << "> ";
// }
// cout << endl << endl;
// for (auto it : labelPos) {
// cout << it.first << " |---> " << it.second << " ";
// }
// cout << endl;
}
/*
Class ~RelAgg~, Function ~buildAtom~
Build a string representing a pattern atom from an entry of ~contents~ and
compute its support
*/
bool RelAgg::buildAtom(unsigned int label, AggEntry entry,
const set<TupleId>& commonTupleIds, string& atom) {
SecInterval iv(true);
string timeSpec, semanticTimeSpec;
entry.computeCommonTimeInterval(commonTupleIds, iv);
entry.computeSemanticTimeSpec(commonTupleIds, semanticTimeSpec);
if (!semanticTimeSpec.empty()) {
if (iv.start.IsDefined() && iv.end.IsDefined()) {
timeSpec = "{" + iv.start.ToString() + "~" + iv.end.ToString() + ", "
+ semanticTimeSpec + "}";
}
else {
timeSpec = "{" + semanticTimeSpec + "}";
}
}
else {
if (iv.start.IsDefined() && iv.end.IsDefined()) {
timeSpec = iv.start.ToString() + "~" + iv.end.ToString();
}
else {
atom.clear();
return false;
}
}
Rect rect(true);
entry.computeCommonRect(iv, commonTupleIds, geoid, rect);
atom = "(" + timeSpec + " \"" + freqLabels[label] + "\" " + entry.print(rect)
+ ")";
return true;
}
// Inserts into ~result~ all permutations of all selections of ~left~ further
// elements of ~source~ (taken from position ~index~ on), prefixed by the
// elements already collected in ~labelVec~.
// The permutations are enumerated on a sorted copy: next_permutation only
// yields all permutations when started from the sorted order, and running it
// on ~labelVec~ itself would reorder the selection that the enclosing
// recursion still modifies. For a sorted ~source~ the result is unchanged.
void RelAgg::subsetperm(vector<unsigned int> source, int left, int index,
           vector<unsigned int>& labelVec, set<vector<unsigned int> >& result) {
  if (left == 0) { // selection complete
    vector<unsigned int> perm(labelVec);
    std::sort(perm.begin(), perm.end());
    do {
      result.insert(perm);
    } while (std::next_permutation(perm.begin(), perm.end()));
    return;
  }
  for (unsigned int i = index; i < source.size(); i++) {
    labelVec.push_back(source[i]);
    subsetperm(source, left - 1, i + 1, labelVec, result);
    labelVec.pop_back();
  }
}
// Inserts into ~result~ all selections of ~left~ further elements of ~source~
// (taken from position ~index~ on, in source order), prefixed by the elements
// already collected in ~labelVec~.
// A completed selection now ends the recursion. Formerly, the function kept
// recursing with a negative ~left~ through all larger selections, which can
// never reach 0 again and therefore never contributed to ~result~.
void RelAgg::subset(vector<unsigned int> source, int left, int index,
           vector<unsigned int>& labelVec, set<vector<unsigned int> >& result) {
  if (left == 0) { // selection complete
    result.insert(labelVec);
    return;
  }
  for (unsigned int i = index; i < source.size(); i++) {
    labelVec.push_back(source[i]);
    subset(source, left - 1, i + 1, labelVec, result);
    labelVec.pop_back();
  }
}
/*
Class ~RelAgg~, Function ~retrieveLabelCombs~
Computes all ordered label combinations (permutations of subsets) of size
~size~ from the labels in ~source~
*/
void RelAgg::retrieveLabelCombs(const unsigned int size,
vector<unsigned int>& source, set<vector<unsigned int> >& result) {
result.clear();
vector<unsigned int> labelVec;
subsetperm(source, size, 0, labelVec, result);
}
/*
Class ~RelAgg~, Function ~retrieveLabelSubsets~
*/
void RelAgg::retrieveLabelSubsets(const unsigned int size,
vector<unsigned int>& source, set<vector<unsigned int> >& result) {
result.clear();
vector<unsigned int> labelVec;
subset(source, size, 0, labelVec, result);
}
// Support of a frequent label: number of tuples with an occurrence of the
// label divided by the number of tuples.
double RelAgg::getSupp(unsigned int label) {
  const double noTuplesWithLabel = double(entries[label].occs.size());
  return noTuplesWithLabel / noTuples;
}
/*
Class ~RelAgg~, Function ~canLabelsBeFrequent~
Computes the set of tuple ids in which all labels of ~labelSeq~ occur and
checks whether the fraction of these tuples reaches ~minSupp~
*/
// Stores in ~intersection~ the ids of all tuples that contain every label of
// ~labelSeq~ (regardless of their order) and returns whether the fraction of
// these tuples is at least ~minSupp~. For fewer than two labels, a message is
// printed, ~intersection~ stays empty, and false is returned.
bool RelAgg::canLabelsBeFrequent(vector<unsigned int>& labelSeq,
                                 set<TupleId>& intersection) {
  intersection.clear();
  if (labelSeq.size() < 2) {
    cout << "sequence has only " << labelSeq.size() << " component(s)" << endl;
    return false;
  }
  // collect the tuple ids of every label
  vector<set<TupleId> > idsPerLabel(labelSeq.size());
  for (unsigned int pos = 0; pos < labelSeq.size(); pos++) {
    for (const auto& occ : entries[labelSeq[pos]].occs) {
      idsPerLabel[pos].insert(get<0>(occ));
    }
  }
  // intersect the first two id sets, then narrow with each further set
  set_intersection(idsPerLabel[0].begin(), idsPerLabel[0].end(),
                   idsPerLabel[1].begin(), idsPerLabel[1].end(),
                   inserter(intersection, intersection.begin()));
  for (unsigned int pos = 2; pos < labelSeq.size(); pos++) {
    set<TupleId> narrowed;
    set_intersection(intersection.begin(), intersection.end(),
                     idsPerLabel[pos].begin(), idsPerLabel[pos].end(),
                     inserter(narrowed, narrowed.begin()));
    intersection = narrowed;
  }
  // below ~minSupp~, no sequence of these labels can be frequent
  const double fraction = double(intersection.size()) / noTuples;
  return (fraction >= minSupp);
}
/*
Class ~RelAgg~, Function ~sequenceSupp~
Computes the support for a given sequence of labels
*/
// Returns the fraction of tuples (relative to ~noTuples~) from ~intersection~
// in which the labels of ~labelSeq~ are found in the given order. For one
// tuple, the labels are processed from left to right: a label is accepted if
// the tuple has an occurrence of it and the begin of the previously accepted
// label's periods (initially the minimum instant) lies before the end of the
// current label's periods. An empty ~labelSeq~ yields 0.
double RelAgg::sequenceSupp(vector<unsigned int> labelSeq,
                            set<TupleId> intersection) {
  if (labelSeq.empty()) {
    return 0.0;
  }
  Instant prevBegin(instanttype), curEnd(instanttype);
  int noMatches = 0;
  for (auto idIt = intersection.begin(); idIt != intersection.end(); ++idIt) {
    prevBegin.ToMinimum();
    curEnd.ToMaximum();
    bool inOrder = true;
    for (unsigned int pos = 0; inOrder && pos < labelSeq.size(); pos++) {
      AggEntry *entry = &(entries[labelSeq[pos]]);
      if (!(entry->occsPos[*idIt] < UINT_MAX)) { // label not in this tuple
        inOrder = false;
        continue;
      }
      Periods *per = get<1>(entry->occs[entry->occsPos[*idIt]]);
      per->Maximum(curEnd);
      if (prevBegin < curEnd) { // label found, correct order
        // continue with the begin of the periods of the current label
        per->Minimum(prevBegin);
      }
      else { // label found, but not in expected order
        inOrder = false;
      }
    }
    if (inOrder) {
      noMatches++;
    }
  }
  return double(noMatches) / noTuples;
}
/*
Class ~RelAgg~, Function ~combineApriori~
Combine sets of $k$ frequent labels to sets of $k+1$ labels, similarly to the
Apriori algorithm, e.g. {a,b,c} combined with {a,b,d} yields {a,b,c,d},
or {a,b,c} combined with {e,c,b} yields {a,b,c,e}. The resulting sets are
stored as sorted vectors.
*/
// Adds to ~labelCombs~ every label set of size k + 1 (as a sorted vector) that
// is the union of two different elements of ~frequentLabelCombs~, where k is
// the size of the first element. Nothing happens if ~frequentLabelCombs~ is
// empty or its first element is empty.
// The union is built by inserting both vectors into a set. The former call of
// set_union requires sorted input ranges, but the elements are label
// sequences in arbitrary order (e.g., permutations).
void RelAgg::combineApriori(set<vector<unsigned int> >& frequentLabelCombs,
                            set<vector<unsigned int> >& labelCombs) {
  if (frequentLabelCombs.empty()) {
    return;
  }
  if ((frequentLabelCombs.begin())->empty()) {
    return;
  }
  const unsigned int k = (frequentLabelCombs.begin())->size();
  for (set<vector<unsigned int> >::iterator it1 = frequentLabelCombs.begin();
       it1 != frequentLabelCombs.end(); ++it1) {
    set<vector<unsigned int> >::iterator it2 = it1;
    for (++it2; it2 != frequentLabelCombs.end(); ++it2) {
      set<unsigned int> union_k_inc(it1->begin(), it1->end());
      union_k_inc.insert(it2->begin(), it2->end());
      if (union_k_inc.size() == k + 1) { // exactly one new label
        vector<unsigned int> unionvec(union_k_inc.begin(), union_k_inc.end());
        labelCombs.insert(labelCombs.end(), unionvec);
      }
    }
  }
}
// Replaces the contents of ~labelPerms~ by all permutations of ~labelComb~.
// The enumeration starts from a sorted copy; ~labelComb~ is not modified.
void RelAgg::retrievePermutations(vector<unsigned int>& labelComb,
                                  set<vector<unsigned int> >& labelPerms) {
  labelPerms.clear();
  vector<unsigned int> current(labelComb);
  std::sort(current.begin(), current.end());
  bool hasNext = true;
  while (hasNext) {
    labelPerms.insert(labelPerms.end(), current);
    hasNext = std::next_permutation(current.begin(), current.end());
  }
}
/*
Class ~RelAgg~, Function ~derivePatterns~
Scan sorted representation in order to retrieve patterns
*/
// Derives all frequent patterns with ~mina~ to ~maxa~ atoms and appends them,
// together with their support, to ~results~; finally ~results~ is sorted with
// ~comparePMResults~. Three phases:
//   1. one atom per element of ~entries~ (stored only if ~minNoAtoms~ is 1),
//   2. all ordered pairs of frequent labels,
//   3. for k = 3, 4, ...: candidates of size k are generated from the frequent
//      label sequences of size k - 1 via ~combineApriori~, all permutations of
//      each candidate are checked; stops when ~maxNoAtoms~ is exceeded or no
//      frequent sequence was found in the previous round.
// The number of frequent patterns found in each phase is printed to cout.
void RelAgg::derivePatterns(const int mina, const int maxa) {
minNoAtoms = mina;
maxNoAtoms = maxa;
// retrieve patterns with one atom, ~entries~ guaranteed to fulfill minSupp
string pattern, atom;
// empty here, i.e., buildAtom is called with an empty id set for 1-patterns
set<TupleId> commonTupleIds;
double supp = 1.0;
for (unsigned int label = 0; label < entries.size(); label++) {
// note: the return value of buildAtom is not checked in this phase
buildAtom(label, entries[label], commonTupleIds, atom);
if (minNoAtoms == 1) {
supp = double(entries[label].occs.size()) / noTuples;
results.push_back(NewPair<string, double>(atom, supp));
}
}
cout << freqLabels.size() << " frequent 1-patterns found" << endl;
// retrieve patterns with two atoms
if (maxNoAtoms < 2) {
return;
}
set<vector<unsigned int> > labelCombs, frequentLabelCombs, labelPerms;
SecInterval iv(true);
// scan ~contents~; only atoms whose corresponding 1-patterns fulfill
// ~minSupp~ can be part of a frequent 2-pattern
vector<unsigned int> frequentLabels;
for (unsigned int label = 0; label < freqLabels.size(); label++) {
frequentLabels.push_back(label);
}
// all ordered pairs of frequent label positions
retrieveLabelCombs(2, frequentLabels, labelPerms);
// check all combinations for their support
bool correct = false;
for (auto labelPerm : labelPerms) {
// canLabelsBeFrequent overwrites ~commonTupleIds~ with the ids of the tuples
// containing both labels
if (canLabelsBeFrequent(labelPerm, commonTupleIds)) {
supp = sequenceSupp(labelPerm, commonTupleIds);
if (supp >= minSupp) {
if (minNoAtoms <= 2) {
pattern.clear();
// build complete 2-pattern
correct = buildAtom(labelPerm[0], entries[labelPerm[0]],
commonTupleIds, atom);
pattern += atom + " ";
correct = correct && buildAtom(labelPerm[1], entries[labelPerm[1]],
commonTupleIds, atom);
pattern += atom;
if (correct) {
results.push_back(NewPair<string, double>(pattern, supp));
}
}
// remembered as basis for the 3-pattern candidates, independent of
// ~minNoAtoms~
frequentLabelCombs.insert(frequentLabelCombs.end(), labelPerm);
}
}
}
cout << frequentLabelCombs.size() << " frequent 2-patterns found" << endl;
// retrieve patterns with three or more atoms
unsigned int k = 3;
while (k <= maxNoAtoms && !frequentLabelCombs.empty()) { // no frequent k-pat
// every candidate permutation together with the ids of the tuples that
// contain all of its labels
map<vector<unsigned int>, set<TupleId> > labelPermsWithCommonIds;
labelCombs.clear();
combineApriori(frequentLabelCombs, labelCombs);
// from here on, ~frequentLabelCombs~ collects the frequent k-sequences
frequentLabelCombs.clear();
for (auto labelComb : labelCombs) {
if (canLabelsBeFrequent(labelComb, commonTupleIds)) {
// cout << print(labelComb) << " can be frequent; occurs in "
// << print(commonTupleIds) << endl;
retrievePermutations(labelComb, labelPerms);
for (auto labelPerm : labelPerms) {
labelPermsWithCommonIds[labelPerm] = commonTupleIds;
}
}
}
for (auto it : labelPermsWithCommonIds) {
// cout << print(it.first) << " occurs in " << print(it.second);
supp = sequenceSupp(it.first, it.second);
// cout << ", supp is " << supp << endl;
if (supp >= minSupp) {
if (k >= minNoAtoms) {
pattern.clear();
correct = true;
unsigned int pos = 0;
// concatenate the atoms; unlike for 2-patterns, every atom (including the
// last one) is followed by a blank
while (correct && (pos < it.first.size())) {
correct = correct && buildAtom(it.first[pos],
entries[it.first[pos]], it.second, atom);
pattern += atom + " ";
pos++;
}
if (correct) {
results.push_back(NewPair<string, double>(pattern, supp));
}
}
frequentLabelCombs.insert(frequentLabelCombs.end(), it.first);
// cout << "k = " << k << "; sequence " << print(it.first)
// << " inserted" << endl;
}
}
cout << frequentLabelCombs.size() << " frequent " << k
<< "-patterns found" << endl;
k++;
}
std::sort(results.begin(), results.end(), comparePMResults());
}
/*
Class ~RelAgg~, Function ~computeEntriesSize~
Compute storage space in bytes for all entries:
constant: noOccs, (noTuples + 1) * tid, noOccs, duration
in every occ: tid, per->noComponents * (start, lc, end, rc),
min(0,1), max(0,1), isdefined
*/
// Number of bytes of the serialized ~entries~. Per entry, a fixed part (two
// unsigned int values, one double, and noTuples + 1 unsigned int values) plus,
// for each occurrence, two unsigned int values, two (double, bool) pairs per
// periods component, four doubles, and one bool.
unsigned long long int RelAgg::computeEntriesSize() const {
  const unsigned long long int fixedPerEntry = sizeof(unsigned int) +
                             (noTuples + 1) * sizeof(unsigned int) +
                             sizeof(unsigned int) + sizeof(double);
  unsigned long long int total = entries.size() * fixedPerEntry;
  for (const auto& entry : entries) {
    for (const auto& occ : entry.occs) {
      total += sizeof(unsigned int) + sizeof(unsigned int) +
        get<1>(occ)->GetNoComponents() * (2 * (sizeof(double) + sizeof(bool)))
        + 4 * sizeof(double) + sizeof(bool);
    }
  }
  return total;
}
/*
Class ~RelAgg~, Function ~computeFreqLabelsSize~
Compute storage space in bytes for ~freqLabels~:
size + size * (wordlength + word)
*/
// Number of bytes of the serialized ~freqLabels~: per label, one unsigned int
// plus the characters of the label plus one further byte.
unsigned long long int RelAgg::computeFreqLabelsSize() const {
  unsigned long long int total = 0;
  for (size_t pos = 0; pos < freqLabels.size(); ++pos) {
    total += freqLabels[pos].size() + 1 + sizeof(unsigned int);
  }
  return total;
}
// Prints, for every element of ~contents~, the label (looked up in
// ~freqLabels~), the occurrence counter, the total duration, and all
// occurrences, followed by a separator line.
string RelAgg::print(const map<unsigned int, AggEntry>& contents) const {
  stringstream out;
  for (auto it = contents.begin(); it != contents.end(); ++it) {
    const AggEntry& entry = it->second;
    out << "\"" << freqLabels[it->first] << "\" occurs "
        << entry.noOccs << " times with a total duration of "
        << entry.duration << endl << " " << entry.print()
        << endl << "-----------------------------------------------" << endl;
  }
  return out.str();
}
// Prints one line per tuple id: "TID <id>: " followed by the quoted labels
// (looked up in ~freqLabels~) of the tuple's label positions.
string RelAgg::print(const map<TupleId,vector<unsigned int> >& frequentLabels)
                                                                       const {
  stringstream out;
  for (auto it = frequentLabels.begin(); it != frequentLabels.end(); ++it) {
    out << "TID " << it->first << ": ";
    const vector<unsigned int>& labels = it->second;
    for (size_t pos = 0; pos < labels.size(); ++pos) {
      out << "\"" << freqLabels[labels[pos]] << "\" ";
    }
    out << endl;
  }
  return out.str();
}
// Renders a set of label combinations, enclosed in braces, delegating each
// combination to the print overload for a single label vector.
string RelAgg::print(const set<vector<unsigned int> >& labelCombs) const {
  stringstream out;
  out << "{" << endl;
  set<vector<unsigned int> >::const_iterator combIt;
  for (combIt = labelCombs.begin(); combIt != labelCombs.end(); ++combIt) {
    vector<unsigned int> comb(*combIt); // work on a private copy
    out << " " << print(comb);
  }
  out << "}" << endl;
  return out.str();
}
// Renders the entry of one label, or of all labels if ~label~ is UINT_MAX.
// A label position outside the stored labels/entries, or a label without
// occurrences, is reported as not found.
string RelAgg::print(const unsigned int label /* = UINT_MAX */) {
  stringstream result;
  if (label == UINT_MAX) { // print everything
    // index with the loop variable; indexing with ~label~ (== UINT_MAX) would
    // access the containers out of bounds
    for (unsigned int l = 0; l < freqLabels.size() && l < entries.size(); l++) {
      result << "\"" << freqLabels[l] << "\" :" << entries[l].print()
             << endl << "---------------------------------------------" << endl;
    }
  }
  else if (label >= freqLabels.size() || label >= entries.size()) {
    // no name is available for an unknown position, so report the number
    result << "Label " << label << " not found" << endl;
  }
  else {
    if (entries[label].occs.empty()) { // label not found
      result << "Label \"" << freqLabels[label] << "\" not found" << endl;
    }
    else {
      result << "\"" << freqLabels[label] << "\" :" << entries[label].print()
             << endl;
    }
  }
  return result.str();
}
/*
Class ~FPNode~, Function ~toListExpr~
*/
// Builds the nested list (label, frequency, (children), nodeLink, ancestor)
// for this node. A label equal to UINT_MAX is shown as "<ROOT>".
ListExpr FPNode::toListExpr(vector<string>& freqLabels) const {
  ListExpr childrenList = nl->Empty();
  ListExpr lastChild = childrenList;
  for (unsigned int i = 0; i < children.size(); i++) {
    if (i == 0) { // first child opens the list
      childrenList = nl->OneElemList(nl->IntAtom(children[i]));
      lastChild = childrenList;
    }
    else {
      lastChild = nl->Append(lastChild, nl->IntAtom(children[i]));
    }
  }
  string lb;
  if (label < UINT_MAX) {
    lb = freqLabels[label];
  }
  else {
    lb = "<ROOT>";
  }
  return nl->FiveElemList(nl->SymbolAtom(lb),
                          nl->IntAtom(frequency), childrenList,
                          nl->IntAtom(nodeLink), nl->IntAtom(ancestor));
}
/*
Class ~FPTree~, Function ~isChildOf~
*/
bool FPTree::isChildOf(unsigned int label, unsigned int pos,
unsigned int& nextPos) {
for (auto it : nodes[pos].children) {
if (nodes[it].label == label) {
nextPos = it;
return true;
}
}
nextPos = UINT_MAX;
return false;
}
/*
Class ~FPTree~, Function ~updateNodeLink~
*/
void FPTree::updateNodeLink(unsigned int label, unsigned int targetPos) {
map<unsigned int, unsigned int>::iterator it = nodeLinks.find(label);
if (it == nodeLinks.end()) { // no existing node link
nodeLinks.insert(make_pair(label, targetPos));
}
else { // node link for label exists
unsigned int link = it->second;
unsigned int currentPos = 0;
while (link != 0) { // find end of node link
currentPos = link;
link = nodes[link].nodeLink;
}
nodes[currentPos].nodeLink = targetPos;
}
}
/*
Class ~FPTree~, Function ~insertLabelVector~
*/
void FPTree::insertLabelVector(const vector<unsigned int>& labelsOrdered,
const unsigned int freq) {
// cout << "insert: | ";
// for (auto it : labelsOrdered) {
// cout << it << " | ";
// }
// cout << endl;
unsigned int nodePos(0), nextPos(0);
for (auto label : labelsOrdered) {
if (isChildOf(label, nodePos, nextPos)) {
nodes[nextPos].frequency += freq;
// cout << " \"" << label << "\" is child of \"" << nodes[nodePos].label
// << "\", frequency = " << nodes[nextPos].frequency << endl;
nodePos = nextPos;
}
else {
FPNode node(label, freq, nodePos);
nodes.push_back(node);
nodes[nodePos].children.push_back(nodes.size() - 1);
updateNodeLink(label, nodes.size() - 1);
// cout << " new node for \"" << label << "\" at pos "
// << nodes.size() - 1 << ", now child of " << nodePos << endl;
nodePos = nodes.size() - 1;
}
}
// cout << " ... SUCCESSFULLY inserted" << endl;
}
/*
Class ~FPTree~, Function ~construct~
*/
void FPTree::construct() {
GenericRelationIterator* it = agg->rel->MakeScan();
MLabel *ml = 0;
Tuple *tuple = 0;
string label;
set<NewPair<unsigned int, double>, compareLabelsWithSupp> labelsWithSupp;
vector<unsigned int> labelsOrdered;
while ((tuple = it->GetNextTuple())) {
labelsWithSupp.clear();
labelsOrdered.clear();
ml = (MLabel*)(tuple->GetAttribute(agg->attrPos.first));
for (int j = 0; j < ml->GetNoComponents(); j++) {
ml->GetValue(j, label);
unsigned int labelPos = agg->labelPos[label];
NewPair<unsigned int, double> labelWithSupp(labelPos,
agg->getSupp(labelPos));
if (labelWithSupp.second >= minSupp) {
labelsWithSupp.insert(labelWithSupp);
}
}
for (auto itl : labelsWithSupp) {
labelsOrdered.push_back(itl.first);
}
insertLabelVector(labelsOrdered, 1);
tuple->DeleteIfAllowed();
}
delete it;
}
/*
Class ~FPTree~, Function ~initialize~
*/
void FPTree::initialize(const double ms, RelAgg *ra) {
minSupp = ms;
FPNode node(UINT_MAX, 0, 0);
nodes.push_back(node); // create dummy node for root
agg = ra;
minSuppCnt = (unsigned int)std::ceil(minSupp * ra->noTuples);
agg->checkedSeqs.resize(agg->maxNoAtoms + 1);
}
/*
Class ~FPTree~, Function ~isOnePathTree~
*/
bool FPTree::isOnePathTree() {
for (unsigned int i = 0; i < getNoNodes(); i++) {
if (i < getNoNodes() - 1) { // inner node
if (nodes[i].children.size() != 1) {
return false; // inner nodes have more or less than one child
}
if (nodes[i].children[0] != i+1) {
return false; // wrong position of child
}
if (nodes[i].nodeLink != 0) {
return false; // node link exists
}
}
if (i == getNoNodes() - 1 && nodes[i].children.size() > 0) {
return false; // leaf node has childs (ERROR)
}
}
return true;
}
/*
Class ~FPTree~, Function ~sortNodeLinks~
*/
// Fills ~result~ with all labels having a node link, in the reverse of the
// order defined by ~compareLabelsWithSupp~ on (label, support) pairs.
void FPTree::sortNodeLinks(vector<unsigned int>& result) {
  typedef set<NewPair<unsigned int, double>, compareLabelsWithSupp> SuppSet;
  result.clear();
  SuppSet bySupport;
  for (map<unsigned int, unsigned int>::iterator linkIt = nodeLinks.begin();
       linkIt != nodeLinks.end(); ++linkIt) {
    const unsigned int label = linkIt->first;
    bySupport.insert(NewPair<unsigned int, double>(label,
                                                   agg->getSupp(label)));
  }
  SuppSet::reverse_iterator rit = bySupport.rbegin();
  while (rit != bySupport.rend()) {
    result.push_back(rit->first);
    ++rit;
  }
}
/*
Class ~FPTree~, Function ~collectPatternsFromSeq~
*/
// Derives result patterns from the labels in ~labelSeq~. For growing set
// sizes, starting at max(minNoAtoms, 2) and ending at ~maxNoAtoms~, the label
// subsets of that size are retrieved; every subset not yet known to be
// non-frequent is tested, and for each of its not yet checked permutations
// the sequence support is computed. Permutations reaching ~minSupp~ are
// converted into a pattern string and appended to agg->results. The loop
// stops early as soon as one set size yields no additional result.
void FPTree::collectPatternsFromSeq(vector<unsigned int>& labelSeq,
const unsigned int minNoAtoms, const unsigned int maxNoAtoms) {
// NOTE(review): ~labelPerms~ is only cleared below and never read
set<vector<unsigned int> > labelSubsets, labelPerms;
unsigned int minSetSize = max(minNoAtoms, (unsigned int)2);
set<TupleId> commonTupleIds;
string atom, pattern;
double supp;
// find all subsets of label sequence, having a suitable number of elements
unsigned int setSize = minSetSize;
unsigned oldResultSize = 0;
bool freqkPatFound = true;
while (setSize <= maxNoAtoms && freqkPatFound) {
agg->retrieveLabelSubsets(setSize, labelSeq, labelSubsets);
// ~subset~ is a copy on purpose: it is permuted in place further below
for (auto subset : labelSubsets) {
// skip subsets that were already classified as non-frequent
if (agg->nonfreqSets[setSize].find(subset) ==
agg->nonfreqSets[setSize].end()) {
// reuse a cached positive answer before running the frequency test
bool isSetFreq = agg->freqSets[setSize].find(subset) !=
agg->freqSets[setSize].end();
if (!isSetFreq) {
isSetFreq = agg->canLabelsBeFrequent(subset, commonTupleIds);
}
if (isSetFreq) {
agg->freqSets[setSize].insert(subset);
labelPerms.clear();
// NOTE(review): next_permutation only visits permutations that follow the
// current one lexicographically; this presumably relies on the subsets being
// delivered in ascending order -- confirm in retrieveLabelSubsets
do { // process all unchecked permutations of ~subset~
if (agg->checkedSeqs[setSize].find(subset) ==
agg->checkedSeqs[setSize].end()) {
supp = agg->sequenceSupp(subset, commonTupleIds);
if (supp >= minSupp) {
// concatenate one atom per label, each followed by a blank
for (unsigned int i = 0; i < subset.size(); i++) {
if (!agg->buildAtom(subset[i], agg->entries[subset[i]],
commonTupleIds, atom)) {
// a failing atom aborts the whole collection, not only this permutation
cout << "Error in buildAtom for " << subset[i] << endl;
return;
}
pattern += atom + " ";
}
agg->results.push_back(NewPair<string, double>(pattern, supp));
pattern.clear();
}
// remember the permutation, whether frequent or not
agg->checkedSeqs[setSize].insert(subset);
}
} while (std::next_permutation(subset.begin(), subset.end()));
}
else {
agg->nonfreqSets[setSize].insert(subset);
}
}
}
// continue with the next size only if this size produced new results
freqkPatFound = (agg->results.size() > oldResultSize);
oldResultSize = agg->results.size();
setSize++;
}
}
/*
Class ~FPTree~, Function ~computeCondPatternBase~
*/
// Computes the conditional pattern base for the last label of ~labelSeq~:
// for every node in that label's node link chain, the labels of its ancestors
// (excluding node 0) are collected in top-down order and stored together with
// the node's frequency. Nodes without such ancestors contribute nothing.
void FPTree::computeCondPatternBase(vector<unsigned int>& labelSeq,
               vector<NewPair<vector<unsigned int>, unsigned int> >& result) {
  result.clear();
  vector<unsigned int> path;
  for (unsigned int link = nodeLinks[labelSeq.back()]; link != 0;
       link = nodes[link].nodeLink) {
    path.clear();
    for (unsigned int anc = nodes[link].ancestor; anc != 0;
         anc = nodes[anc].ancestor) { // climb the branch above the node
      path.push_back(nodes[anc].label);
    }
    if (path.empty()) {
      continue;
    }
    std::reverse(path.begin(), path.end()); // bottom-up --> top-down
    result.push_back(NewPair<vector<unsigned int>, unsigned int>(path,
                                                      nodes[link].frequency));
  }
}
/*
Class ~FPTree~, Function ~constructCondTree~
*/
// Builds a conditional tree from a conditional pattern base. The support
// counts of all labels are summed up over the base, labels below the new
// tree's ~minSuppCnt~ are dropped from every path, and the remaining paths
// are inserted with their counts. Returns 0 if the base is empty or nothing
// remains; otherwise the caller has to delete the returned tree.
FPTree* FPTree::constructCondTree(
             vector<NewPair<vector<unsigned int>, unsigned int> >& condPB) {
  typedef NewPair<vector<unsigned int>, unsigned int> PathWithCnt;
  if (condPB.empty()) {
    return 0;
  }
  FPTree *condFPTree = new FPTree();
  condFPTree->initialize(minSupp, agg);
  // accumulate: label --> suppCnt
  map<unsigned int, unsigned int> labelsToSuppCnt;
  for (size_t p = 0; p < condPB.size(); p++) {
    const vector<unsigned int>& path = condPB[p].first;
    for (size_t i = 0; i < path.size(); i++) {
      labelsToSuppCnt[path[i]] += condPB[p].second;
    }
  }
  // keep only labels having suppCnt >= minSuppCnt
  vector<PathWithCnt> freqCondPB;
  for (size_t p = 0; p < condPB.size(); p++) {
    const vector<unsigned int>& path = condPB[p].first;
    vector<unsigned int> keptLabels;
    for (size_t i = 0; i < path.size(); i++) {
      if (labelsToSuppCnt[path[i]] >= condFPTree->minSuppCnt) {
        keptLabels.push_back(path[i]);
      }
    }
    if (!keptLabels.empty()) {
      freqCondPB.push_back(PathWithCnt(keptLabels, condPB[p].second));
    }
  }
  if (freqCondPB.empty()) {
    delete condFPTree;
    return 0;
  }
  for (size_t p = 0; p < freqCondPB.size(); p++) {
    condFPTree->insertLabelVector(freqCondPB[p].first, freqCondPB[p].second);
  }
  return condFPTree;
}
/*
Class ~FPTree~, Function ~mineTree~
*/
// Recursively mines the tree. For a one-path tree, the labels of all nodes
// (except node 0) are merged with ~initLabels~ and patterns are collected
// from that label set. Otherwise, every label with a node link extends the
// current label sequence in turn: patterns are collected for the extended
// sequence, its conditional pattern base and conditional tree are built, and
// the conditional tree is mined with the extended sequence.
void FPTree::mineTree(vector<unsigned int>& initLabels,
               const unsigned int minNoAtoms, const unsigned int maxNoAtoms) {
  if (!hasNodes()) {
    return;
  }
  if (isOnePathTree()) {
    set<unsigned int> pathLabels(initLabels.begin(), initLabels.end());
    for (unsigned int i = 1; i < nodes.size(); i++) {
      pathLabels.insert(nodes[i].label);
    }
    vector<unsigned int> labels(pathLabels.begin(), pathLabels.end());
    collectPatternsFromSeq(labels, minNoAtoms, maxNoAtoms);
    return;
  }
  // tree has more than one path
  vector<unsigned int> labelsByFrequency;
  sortNodeLinks(labelsByFrequency);
  vector<unsigned int> labelSeq(initLabels.begin(), initLabels.end());
  vector<NewPair<vector<unsigned int>, unsigned int> > condPatBase;
  for (size_t i = 0; i < labelsByFrequency.size(); i++) {
    labelSeq.push_back(labelsByFrequency[i]);
    if (labelSeq.size() > 1) {
      collectPatternsFromSeq(labelSeq, minNoAtoms, maxNoAtoms);
    }
    computeCondPatternBase(labelSeq, condPatBase);
    FPTree *condFPTree = constructCondTree(condPatBase);
    if (condFPTree != 0) {
      condFPTree->mineTree(labelSeq, minNoAtoms, maxNoAtoms);
      delete condFPTree;
    }
    labelSeq.pop_back(); // restore the sequence for the next label
  }
}
/*
Class ~FPTree~, Function ~retrievePatterns~
*/
void FPTree::retrievePatterns(const unsigned int minNoAtoms,
const unsigned int maxNoAtoms) {
if (minNoAtoms == 1) {
string pattern, atom;
set<TupleId> commonTupleIds;
vector<unsigned int> frequentLabels;
double supp = 1.0;
for (unsigned int l = 0; l < agg->entries.size(); l++) { // retrieve 1-pats
agg->buildAtom(l, agg->entries[l], commonTupleIds, atom);
supp = double(agg->entries[l].occs.size()) / agg->noTuples;
agg->results.push_back(NewPair<string, double>(atom, supp));
frequentLabels.push_back(l);
}
cout << frequentLabels.size() << " frequent 1-patterns found" << endl;
}
vector<unsigned int> initialLabels;
agg->checkedSeqs.resize(maxNoAtoms + 1);
agg->freqSets.resize(maxNoAtoms + 1);
agg->nonfreqSets.resize(maxNoAtoms + 1);
mineTree(initialLabels, minNoAtoms, maxNoAtoms);
std::sort(agg->results.begin(), agg->results.end(), comparePMResults());
}
/*
Class ~FPTree~, Function ~computeNodesSize~
Compute storage space in bytes for ~Nodes~:
noNodes + noNodes * (label + freq + noChildren + noChildren * child + nodeLink
+ anc)
*/
unsigned long long int FPTree::computeNodesSize() const {
unsigned long long int result = sizeof(unsigned int);
for (auto node : nodes) {
result += (5 + node.children.size()) * sizeof(unsigned int);
}
return result;
}
/*
Class ~FPTree~, Function ~computeNodeLinksSize~
Compute storage space in bytes for ~NodeLinks~:
noNL + noNL * (label + nodePos)
*/
// Byte size of the serialized node links: one unsigned int for the counter
// and two unsigned ints (label, node position) per node link.
unsigned long long int FPTree::computeNodeLinksSize() const {
  const unsigned long long int counterBytes = sizeof(unsigned int);
  const unsigned long long int linkBytes =
                                  nodeLinks.size() * 2 * sizeof(unsigned int);
  return counterBytes + linkBytes;
}
/*
Class ~FPTree~, functions for secondo data type
*/
// Returns the property list of the type: a list of four headings followed by
// a list of the four corresponding values.
ListExpr FPTree::Property() {
  ListExpr headings = nl->FourElemList(
      nl->StringAtom("Signature"), nl->StringAtom("Example Type List"),
      nl->StringAtom("List Rep"), nl->StringAtom("Example List"));
  ListExpr values = nl->FourElemList(
      nl->StringAtom("-> SIMPLE"),
      nl->StringAtom(FPTree::BasicType()),
      nl->StringAtom("no list representation"),
      nl->StringAtom(""));
  return nl->TwoElemList(headings, values);
}
// Objects of this type cannot be created from a list: ~correct~ is always set
// to false and a null address is returned.
Word FPTree::In(const ListExpr typeInfo, const ListExpr instance,
                const int errorPos, ListExpr& errorInfo, bool& correct) {
  Word noObject = SetWord(Address(0));
  correct = false;
  return noObject;
}
// Returns the node positions of the node link chain of ~label~ as a list of
// integers, beginning with the head stored in ~nodeLinks~.
ListExpr FPTree::getNodeLinksList(unsigned int label) {
  const unsigned int head = nodeLinks[label];
  ListExpr result = nl->OneElemList(nl->IntAtom(head));
  ListExpr last = result;
  for (unsigned int link = nodes[head].nodeLink; link != 0;
       link = nodes[link].nodeLink) { // 0 terminates the chain
    last = nl->Append(last, nl->IntAtom(link));
  }
  return result;
}
// Converts a tree into the nested list
//   ((noTuples n) (minSupp s) (node ...) ((label (pos ...)) ...)).
// The nodes list and the node links list are empty lists if the tree has no
// nodes or no node links, respectively.
ListExpr FPTree::Out(ListExpr typeInfo, Word value) {
  FPTree *tree = (FPTree*)value.addr;
  ListExpr nodesList(nl->Empty()), nodeList(nl->Empty()),
           nodeLinksList(nl->Empty()), nodeLinkList(nl->Empty());
  ListExpr noTuplesList = nl->TwoElemList(nl->SymbolAtom("noTuples"),
                                          nl->IntAtom(tree->agg->noTuples));
  ListExpr minSuppList = nl->TwoElemList(nl->SymbolAtom("minSupp"),
                                         nl->RealAtom(tree->minSupp));
  if (tree->hasNodes()) {
    nodesList = nl->OneElemList(tree->nodes[0].toListExpr
                                                     (tree->agg->freqLabels));
    nodeList = nodesList;
  }
  for (unsigned int i = 1; i < tree->nodes.size(); i++) {
    nodeList = nl->Append(nodeList, tree->nodes[i].toListExpr
                                                     (tree->agg->freqLabels));
  }
  // The iterator is only advanced if there is at least one node link;
  // incrementing the end iterator of an empty map is undefined behaviour.
  if (tree->hasNodeLinks()) {
    map<unsigned int, unsigned int>::iterator it = tree->nodeLinks.begin();
    nodeLinksList = nl->OneElemList(nl->TwoElemList(nl->SymbolAtom
      (tree->agg->freqLabels[it->first]), tree->getNodeLinksList(it->first)));
    nodeLinkList = nodeLinksList;
    for (++it; it != tree->nodeLinks.end(); ++it) {
      nodeLinkList = nl->Append(nodeLinkList,
            nl->TwoElemList(nl->SymbolAtom(tree->agg->freqLabels[it->first]),
                            tree->getNodeLinksList(it->first)));
    }
  }
  return nl->FourElemList(noTuplesList, minSuppList, nodesList, nodeLinksList);
}
// Creates a default-constructed tree and returns its address.
Word FPTree::Create(const ListExpr typeInfo) {
  FPTree *emptyTree = new FPTree();
  Word result;
  result.addr = emptyTree;
  return result;
}
// Destroys the tree referenced by ~w~ and resets the address.
void FPTree::Delete(const ListExpr typeInfo, Word& w) {
  delete (FPTree*)w.addr;
  w.addr = 0;
}
// Writes a tree to ~valueRecord~, starting at ~offset~, in this order:
//   minSupp, byte size of the node block, node block
//   (noNodes, then per node: label, frequency, noChildren, children,
//   nodeLink, ancestor), byte size of the node link block, node link block
//   (noNodeLinks, then per link: label, node position), contents of ~agg~.
// ~offset~ is advanced behind every successfully written part. The temporary
// buffers are released on every path, including failed writes.
bool FPTree::Save(SmiRecord& valueRecord, size_t& offset,
                  const ListExpr typeInfo, Word& value) {
  FPTree *tree = (FPTree*)value.addr;
  // appends one unsigned int to ~buffer~ and advances the write position
  auto put = [](char *buffer, size_t& pos, const unsigned int val) {
    memcpy(buffer + pos, &val, sizeof(unsigned int));
    pos += sizeof(unsigned int);
  };
  // store minSupp
  if (!valueRecord.Write(&tree->minSupp, sizeof(double), offset)) {
    return false;
  }
  offset += sizeof(double);
  // store noNodes and nodes
  unsigned long long int nodesSize = tree->computeNodesSize();
  cout << "size of nodes is " << nodesSize << endl;
  if (!valueRecord.Write(&nodesSize, sizeof(unsigned long long int), offset)) {
    return false;
  }
  offset += sizeof(unsigned long long int);
  char* nodesChars = new char[nodesSize];
  unsigned int noNodes = tree->getNoNodes();
  size_t offsetNodes = 0;
  put(nodesChars, offsetNodes, noNodes);
  for (unsigned int i = 0; i < noNodes; i++) { // store nodes
    put(nodesChars, offsetNodes, tree->nodes[i].label);
    put(nodesChars, offsetNodes, tree->nodes[i].frequency);
    put(nodesChars, offsetNodes, tree->nodes[i].children.size());
    for (unsigned int j = 0; j < tree->nodes[i].children.size(); j++) {
      put(nodesChars, offsetNodes, tree->nodes[i].children[j]);
    }
    put(nodesChars, offsetNodes, tree->nodes[i].nodeLink);
    put(nodesChars, offsetNodes, tree->nodes[i].ancestor);
  }
  const bool nodesWritten = valueRecord.Write(nodesChars, nodesSize, offset);
  delete[] nodesChars; // release before a possible early return
  if (!nodesWritten) {
    return false;
  }
  offset += nodesSize;
  // store noNodeLinks and nodeLinks
  unsigned long long int nodeLinksSize = tree->computeNodeLinksSize();
  cout << "size of nodelinks is " << nodeLinksSize << endl;
  if (!valueRecord.Write(&nodeLinksSize, sizeof(unsigned long long int),
                         offset)) {
    return false;
  }
  offset += sizeof(unsigned long long int);
  char* nodeLinksChars = new char[nodeLinksSize];
  unsigned int noNodeLinks = tree->getNoNodeLinks();
  size_t offsetNodeLinks = 0;
  put(nodeLinksChars, offsetNodeLinks, noNodeLinks);
  for (map<unsigned int, unsigned int>::iterator it = tree->nodeLinks.begin();
       it != tree->nodeLinks.end(); it++) {
    put(nodeLinksChars, offsetNodeLinks, it->first);
    put(nodeLinksChars, offsetNodeLinks, it->second);
  }
  const bool nodeLinksWritten = valueRecord.Write(nodeLinksChars,
                                                  nodeLinksSize, offset);
  delete[] nodeLinksChars; // release before a possible early return
  if (!nodeLinksWritten) {
    return false;
  }
  offset += nodeLinksSize;
  // store ~noTuples~, ~entries~ and ~freqLabels~ from relAgg
  // NOTE(review): this test is not negated, unlike the readFromRecord test in
  // Open -- confirm the return convention of RelAgg::saveToRecord
  if (RelAgg::saveToRecord(tree->agg, valueRecord, offset)) {
    return false;
  }
  return true;
}
// Reads a tree from ~valueRecord~, starting at ~offset~, in the layout
// written by Save: minSupp, node block, node link block, contents of ~agg~.
// On success, ~value~ receives the new tree and true is returned. On any
// failed read, the temporary buffers and the partially built tree are
// released, ~value~ is left untouched and false is returned.
bool FPTree::Open(SmiRecord& valueRecord, size_t& offset,
                  const ListExpr typeInfo, Word& value) {
  // reads one unsigned int from ~buffer~ and advances the read position
  auto get = [](const char *buffer, size_t& pos) -> unsigned int {
    unsigned int val;
    memcpy(&val, buffer + pos, sizeof(unsigned int));
    pos += sizeof(unsigned int);
    return val;
  };
  FPTree *tree = new FPTree();
  // read minSupp
  if (!valueRecord.Read(&tree->minSupp, sizeof(double), offset)) {
    delete tree;
    return false;
  }
  offset += sizeof(double);
  // read nodes
  unsigned long long int nodesSize;
  if (!valueRecord.Read(&nodesSize, sizeof(unsigned long long int), offset)) {
    delete tree;
    return false;
  }
  offset += sizeof(unsigned long long int);
  char* nodesChars = new char[nodesSize];
  if (!valueRecord.Read(nodesChars, nodesSize, offset)) {
    delete[] nodesChars;
    delete tree;
    return false;
  }
  offset += nodesSize;
  size_t offsetNodes = 0;
  const unsigned int noNodes = get(nodesChars, offsetNodes);
  for (unsigned int i = 0; i < noNodes; i++) { // read nodes
    const unsigned int numLabel = get(nodesChars, offsetNodes);
    const unsigned int frequency = get(nodesChars, offsetNodes);
    const unsigned int noChildren = get(nodesChars, offsetNodes);
    vector<unsigned int> children;
    for (unsigned int j = 0; j < noChildren; j++) {
      children.push_back(get(nodesChars, offsetNodes));
    }
    const unsigned int nodeLink = get(nodesChars, offsetNodes);
    const unsigned int ancestor = get(nodesChars, offsetNodes);
    FPNode node(numLabel, frequency, children, nodeLink, ancestor);
    tree->nodes.push_back(node);
  }
  delete[] nodesChars;
  // read nodeLinks
  unsigned long long int nodeLinksSize = 0;
  if (!valueRecord.Read(&nodeLinksSize, sizeof(unsigned long long int),
                        offset)) {
    delete tree;
    return false;
  }
  offset += sizeof(unsigned long long int);
  char* nodeLinksChars = new char[nodeLinksSize];
  if (!valueRecord.Read(nodeLinksChars, nodeLinksSize, offset)) {
    delete[] nodeLinksChars;
    delete tree;
    return false;
  }
  offset += nodeLinksSize;
  size_t offsetNodeLinks = 0;
  const unsigned int noNodeLinks = get(nodeLinksChars, offsetNodeLinks);
  for (unsigned int i = 0; i < noNodeLinks; i++) {
    const unsigned int numLabel = get(nodeLinksChars, offsetNodeLinks);
    const unsigned int nodeLink = get(nodeLinksChars, offsetNodeLinks);
    tree->nodeLinks.insert(tree->nodeLinks.begin(),
                           make_pair(numLabel, nodeLink));
  }
  delete[] nodeLinksChars;
  tree->agg = new RelAgg();
  if (!RelAgg::readFromRecord(tree->agg, valueRecord, offset)) {
    // NOTE(review): whether ~FPTree releases ~agg~ is not visible here; if it
    // does not, the RelAgg created above still has to be freed on this path
    delete tree;
    return false;
  }
  value.setAddr(tree);
  return true;
}
// Destroys the tree referenced by ~w~ and resets the address.
void FPTree::Close(const ListExpr typeInfo, Word& w) {
  delete (FPTree*)w.addr;
  w.addr = 0;
}
// Returns the address of a copy of the tree referenced by ~w~, created with
// the copy constructor of FPTree.
Word FPTree::Clone(const ListExpr typeInfo, const Word& w) {
  FPTree *source = (FPTree*)w.addr;
  FPTree *copy = new FPTree(*source);
  Word res;
  res.addr = copy;
  return res;
}
// Returns the size of the FPTree class itself (not of the data it refers to).
int FPTree::SizeOfObj() {
  const int objSize = sizeof(FPTree);
  return objSize;
}
// A type expression is accepted iff it equals the basic type name.
bool FPTree::TypeCheck(ListExpr type, ListExpr& errorInfo) {
  const bool isFPTreeType = nl->IsEqual(type, BasicType());
  return isFPTreeType;
}
/*
Type constructor for secondo type ~fptree~
*/
// Type constructor object for ~fptree~, assembled from the static support
// functions of FPTree defined above.
// NOTE(review): the literal 0 arguments leave their slots unset; presumably
// these are optional hooks -- confirm against the TypeConstructor signature.
TypeConstructor fptreeTC(
FPTree::BasicType(),
FPTree::Property,
FPTree::Out,
FPTree::In,
0, 0,
FPTree::Create,
FPTree::Delete,
FPTree::Open,
FPTree::Save,
FPTree::Close,
FPTree::Clone,
0,
FPTree::SizeOfObj,
FPTree::TypeCheck);
/*
Class ~ProjectedDB~, function ~clear~
*/
// Currently a no-op: the body is empty, so no member is reset or released.
void ProjectedDB::clear() {
}
/*
Class ~ProjectedDB~, function ~initialize~
*/
// Stores the support threshold and the aggregation object, derives the
// absolute support count, makes sure that ~checkedSeqs~ of the aggregation
// has maxNoAtoms + 1 slots, and provides one projection slot per frequent
// label.
void ProjectedDB::initialize(const double ms, RelAgg *ra) {
  agg = ra;
  minSupp = ms;
  minSuppCnt = (unsigned int)std::ceil(minSupp * ra->noTuples);
  projections.resize(agg->freqLabels.size());
  agg->checkedSeqs.resize(agg->maxNoAtoms + 1);
}
/*
Class ~ProjectedDB~, function ~addProjections~
*/
// Adds projections (suffixes) of ~labelSeq~ to ~projections~. Every position
// except the last one is a candidate if ~label~ is UINT_MAX or equals the
// label at that position; the elements behind the position are then stored
// under projections[labelSeq[pos]]. Within one call, at most one projection
// is stored per label (tracked by ~projPresent~). If ~prefix~ is non-empty,
// labels contained in ~prefix~ are left out of the stored suffix, and a
// suffix that becomes empty is not stored.
void ProjectedDB::addProjections(vector<unsigned int>& labelSeq, unsigned int
label, const vector<unsigned int>& prefix /* = vector<unsigned int>() */) {
// an empty sequence has no suffix; this also keeps size() - 1 below from
// wrapping around
if (labelSeq.empty()) {
return;
}
// cout << "compute proj for seq " << agg->print(labelSeq) << endl;
// projPresent[l] is true once a projection for label l was stored in this call
vector<bool> projPresent;
vector<unsigned int> reducedSeq;
projPresent.resize(agg->freqLabels.size(), false);
if (prefix.empty()) {
for (unsigned int pos = 0; pos < labelSeq.size() - 1; pos++) {
if (label == UINT_MAX || label == labelSeq[pos]) {
if (!projPresent[labelSeq[pos]]) {
// whole suffix behind ~pos~
reducedSeq.assign(labelSeq.begin() + pos + 1, labelSeq.end());
// cout << " projection(<" << labelSeq[pos] << ">) : "
// << agg->print(reducedSeq) << endl;
projections[labelSeq[pos]].push_back(reducedSeq);
projPresent[labelSeq[pos]] = true;
}
}
}
}
else {
for (unsigned int pos = 0; pos < labelSeq.size() - 1; pos++) {
if (label == UINT_MAX || label == labelSeq[pos]) {
if (!projPresent[labelSeq[pos]]) {
// suffix behind ~pos~ without the labels that occur in ~prefix~
for (unsigned int i = pos + 1; i < labelSeq.size(); i++) {
if (std::find(prefix.begin(), prefix.end(), labelSeq[i]) ==
prefix.end()) {
reducedSeq.push_back(labelSeq[i]);
}
// cout << " projection(<" << labelSeq[pos] << ">) : "
// << agg->print(reducedSeq) << endl;
}
// an empty suffix is not stored and does not mark the label as present
if (!reducedSeq.empty()) {
projections[labelSeq[pos]].push_back(reducedSeq);
projPresent[labelSeq[pos]] = true;
reducedSeq.clear();
}
}
}
}
}
}
/*
Class ~ProjectedDB~, function ~computeSMatrix~
*/
// Fills ~smatrix~ from the stored projections. For every index i of
// ~freqLabels~, freqLabels[i] is appended to ~freqLabelPos~, and for every
// sequence in projections[i] the cell (i, label) is incremented once per
// distinct label different from freqLabels[i]. A cell is appended to ~fPos~
// at the moment its counter reaches ~minSuppCnt~.
// NOTE(review): projections and ~counted~ are indexed by i and by label
// values, respectively; this presumably assumes freqLabels[i] == i, as passed
// by construct() -- confirm for other callers.
void ProjectedDB::computeSMatrix(vector<unsigned int>& freqLabels,
vector<NewPair<unsigned int, unsigned int> >& fPos) {
// cout << "computeSMatrix - freq Labels: " << agg->print(freqLabels) << endl;
smatrix.init(freqLabels.size());
// counted[l] is true if label l was already counted for the current sequence
vector<bool> counted;
counted.resize(freqLabels.size(), false);
for (unsigned int i = 0; i < freqLabels.size(); i++) {
freqLabelPos.push_back(freqLabels[i]);
if (!projections[i].empty()) {
for (auto seq : projections[i]) {
for (auto label : seq) {
if (!counted[label] && label != freqLabels[i]) {
smatrix.increment(i, label);
// equality test: the cell is reported exactly once, when it becomes frequent
if (smatrix[i][label] == minSuppCnt) {
fPos.push_back(NewPair<unsigned int, unsigned int>(i, label));
}
counted[label] = true;
}
}
// reset the flags for the next sequence
counted.assign(freqLabels.size(), false);
}
}
}
}
/*
Class ~ProjectedDB~, function ~construct~
*/
void ProjectedDB::construct() {
// cout << "frequent labels: " << agg->print(agg->freqLabels) << endl;
GenericRelationIterator* it = agg->rel->MakeScan();
MLabel *ml = 0;
Tuple *tuple = 0;
vector<unsigned int> labelSeq;
vector<vector<unsigned int> > projection;
while ((tuple = it->GetNextTuple())) {
ml = (MLabel*)(tuple->GetAttribute(agg->attrPos.first));
agg->getLabelSeqFromMLabel(ml, labelSeq);
addProjections(labelSeq, UINT_MAX); // add all available projections
tuple->DeleteIfAllowed();
}
delete it;
for (unsigned int i = 0; i < projections.size(); i++) {
if (projections[i].size() < minSuppCnt) {
// cout << " remove " << projections[i].size() << " projs for label "
// << agg->freqLabels[i] << endl;
projections[i].clear();
}
}
vector<unsigned int> freqLabels;
for (unsigned int i = 0; i < agg->freqLabels.size(); i++) {
freqLabels.push_back(i);
}
computeSMatrix(freqLabels, freqMatrixPos);
// cout << "flPos: " << agg->print(freqLabelPos) << endl << "SMatrix:" << endl
// << smatrix.print() << endl;
}
/*
Class ~ProjectedDB~, Function ~minePDB~
*/
// Recursively extends ~prefix~ by one label, using projections[pos]. Labels
// are counted once per projected sequence; labels reaching ~minSuppCnt~ are
// kept in reduced sequences. For each such label different from the last
// prefix label, the pattern ~patPrefix~ plus the label's atom is reported
// (if the extended length reaches ~minNoAtoms~), and, if ~maxNoAtoms~ allows,
// projections for the label are added to a new ProjectedDB that is mined
// with the extended prefix.
// NOTE(review): prefix[prefix.size() - 1] is read below, so ~prefix~ is
// presumably never empty on entry -- confirm against the callers.
void ProjectedDB::minePDB(vector<unsigned int>& prefix, string& patPrefix,
unsigned int pos, const unsigned int minNoAtoms,
const unsigned int maxNoAtoms) {
// cout << "minePDB for prefix " << agg->print(prefix) << "; "
// << projections[pos].size() << " projs for pos " << pos << endl;
// nothing to do if the extension would be too long or too few sequences exist
if (prefix.size() + 1 > maxNoAtoms || projections[pos].size() < minSuppCnt) {
return;
}
// compute frequent labels
vector<unsigned int> labelCounter;
set<unsigned int> newFreqLabels;
labelCounter.resize(agg->freqLabels.size(), 0);
// ensures that a label is counted at most once per sequence
vector<bool> hasBeenCounted;
hasBeenCounted.resize(agg->freqLabels.size(), false);
for (auto seq : projections[pos]) {
for (auto label : seq) {
if (!hasBeenCounted[label]) {
labelCounter[label]++;
hasBeenCounted[label] = true;
}
}
hasBeenCounted.assign(hasBeenCounted.size(), false);
}
vector<vector<unsigned int> > reducedSeqs;
vector<unsigned int> reducedSeq;
unsigned lastLabel = UINT_MAX;
// compute reduced sequences
// (only frequent labels are kept; directly repeated labels are collapsed)
for (auto seq : projections[pos]) {
for (auto label : seq) {
if (labelCounter[label] >= minSuppCnt && label != lastLabel) {
reducedSeq.push_back(label);
lastLabel = label;
newFreqLabels.insert(label);
}
}
if (!reducedSeq.empty()) {
reducedSeqs.push_back(reducedSeq);
reducedSeq.clear();
}
lastLabel = UINT_MAX;
}
// cout << " reduced projections for prefix " << agg->print(prefix) << ": ";
// for (auto seq : reducedSeqs) {
// cout << agg->print(seq) << ", ";
// }
// cout << "; frequent labels: " << agg->print(newFreqLabels) << endl;
// build atoms for prefix
set<TupleId> commonTupleIds;
string atom, patPrefixExt;
// one ProjectedDB is created here and reused for all labels of the loop below
ProjectedDB *pdb = new ProjectedDB(minSupp, minSuppCnt, agg,
agg->freqLabels.size());
// cout << "### NEW pdb CREATED for prefix " << agg->print(prefix) << endl;
// for (unsigned int i = 0; i < prefix.size(); i++) {
// agg->buildAtom(prefix[i], agg->entries[prefix[i]], commonTupleIds, atom);
// patPrefix += atom + " ";
// }
// complete patterns, recursion with extended prefixes, TODO: use intersection
for (auto flabel : newFreqLabels) {
// do not extend the prefix by its own last label
if (flabel != prefix[prefix.size() - 1]) {
// cout << " result sequence " << agg->print(prefix) << ", " << flabel
// << " found" << endl;
agg->buildAtom(flabel, agg->entries[flabel], commonTupleIds, atom);
patPrefixExt = patPrefix + " " + atom;
if (prefix.size() + 1 >= minNoAtoms) {
// support = number of projected sequences containing the label / noTuples
agg->results.push_back(NewPair<string, double>(patPrefixExt,
(double)labelCounter[flabel] / agg->noTuples));
}
atom.clear();
if (prefix.size() + 1 < maxNoAtoms) {
prefix.push_back(flabel);
for (auto seq : reducedSeqs) {
if (!projections[flabel].empty()) { // Optimization 1 from PS paper
pdb->addProjections(seq, flabel);
}
// cout << " //// projections for seq " << agg->print(seq)
// << " added; projections[" << flabel << "] has "
// << pdb->projections[flabel].size() << " elements, "
// << reducedSeqs.size() << " rseqs total" << endl;
}
// drop projection lists that are too small to yield a frequent pattern
for (unsigned int i = 0; i < pdb->projections.size(); i++) {
if (pdb->projections[i].size() < pdb->minSuppCnt) {
// cout << " remove " << pdb->projections[i].size()
// << " projs for label " << pdb->agg->freqLabels[i] << endl;
pdb->projections[i].clear();
}
// else { TODO: restore this path if this function is invoked
// pdb->projPos.push_back(i);
// }
}
pdb->minePDB(prefix, patPrefixExt, flabel, minNoAtoms, maxNoAtoms);
// restore the prefix for the next label
prefix.pop_back();
}
}
}
delete pdb;
}
/*
Class ~ProjectedDB~, function ~minePDBSMatrix~
compute projections, report frequent items, build S-matrix, report freq cells
*/
// Recursive mining step based on an S-matrix. Using the last two labels of
// ~prefix~, the sequences in projections[secondLast] are projected onto the
// last label in a new ProjectedDB. Labels occurring in at least ~minSuppCnt~
// of those sequences are reported as extensions of ~patPrefix~ (if the
// extended length reaches ~minNoAtoms~). Unless ~maxNoAtoms~ forbids a
// further extension, an S-matrix over these labels is built, the frequent
// cells are reported as extensions by two labels, and the function recurses
// with the extended prefix. ~prefix~ has the same contents on return as on
// entry. The temporary ProjectedDB is released on every path.
void ProjectedDB::minePDBSMatrix(vector<unsigned int>& prefix,
        string& patPrefix, unsigned int minNoAtoms, unsigned int maxNoAtoms) {
  if (prefix.size() < 2) { // the last two prefix labels are required below
    return;
  }
  unsigned int secondLast(prefix[prefix.size() - 2]),
               last(prefix[prefix.size() - 1]);
  vector<unsigned int> labelCounter;
  labelCounter.resize(agg->freqLabels.size(), 0);
  vector<bool> counted;
  counted.resize(labelCounter.size(), false);
  string atom, pattern;
  vector<string> atoms;
  set<TupleId> commonTupleIds;
  vector<unsigned int> freqLabels;
  ProjectedDB *pdb = new ProjectedDB(minSupp, minSuppCnt, agg,
                                     projections.size());
  for (auto seq : projections[secondLast]) {
    pdb->addProjections(seq, last, prefix);
  }
  // count every label at most once per projected sequence
  for (auto seq : pdb->projections[last]) {
    for (auto label : seq) {
      if (!counted[label]) {
        labelCounter[label]++;
        counted[label] = true;
      }
    }
    counted.assign(counted.size(), false);
  }
  // build patterns for frequently occurring labels in projections
  map<unsigned int, unsigned int> matrixPos; // label --> matrix row/column
  for (unsigned int label = 0; label < labelCounter.size(); label++) {
    if (labelCounter[label] >= minSuppCnt) {
      freqLabels.push_back(label);
      matrixPos[label] = freqLabels.size() - 1;
      pdb->freqLabelPos.push_back(label);
      agg->buildAtom(label, agg->entries[label], commonTupleIds, atom);
      atoms.push_back(atom);
      if (prefix.size() + 1 >= minNoAtoms) {
        agg->results.push_back(NewPair<string, double>(patPrefix + " " + atom,
                               double(labelCounter[label]) / agg->noTuples));
      }
    }
  }
  if (prefix.size() + 2 > maxNoAtoms) {
    delete pdb; // do not leak the temporary database on the early exit
    return;
  }
  // build S-matrix
  pdb->smatrix.init(pdb->freqLabelPos.size());
  for (auto seq : pdb->projections[last]) {
    for (unsigned int i = 0; i < seq.size(); i++) {
      if (labelCounter[seq[i]] >= minSuppCnt) {
        for (unsigned int j = i + 1; j < seq.size(); j++) {
          if (labelCounter[seq[j]] >= minSuppCnt) {
            pdb->smatrix.increment(matrixPos[seq[i]], matrixPos[seq[j]]);
            // equality test: a cell is recorded once, when it becomes frequent
            if (pdb->smatrix(matrixPos[seq[i]], matrixPos[seq[j]]) ==
                minSuppCnt) {
              pdb->freqMatrixPos.push_back(NewPair<unsigned int, unsigned int>
                                     (matrixPos[seq[i]], matrixPos[seq[j]]));
            }
          }
        }
      }
    }
  }
  // prepare projections for longer prefix
  if (prefix.size() + 1 <= maxNoAtoms) {
    for (auto seq : pdb->projections[last]) {
      pdb->addProjections(seq, UINT_MAX, prefix);
    }
  }
  pdb->projections[last].clear();
  // report frequent matrix positions
  for (auto fPos : pdb->freqMatrixPos) {
    // skip cells that involve a label already contained in the prefix
    if (std::find(prefix.begin(), prefix.end(),
                  pdb->freqLabelPos[fPos.first]) == prefix.end() &&
        std::find(prefix.begin(), prefix.end(),
                  pdb->freqLabelPos[fPos.second]) == prefix.end()) {
      prefix.push_back(pdb->freqLabelPos[fPos.first]);
      prefix.push_back(pdb->freqLabelPos[fPos.second]);
      pattern = patPrefix + " " + atoms[fPos.first] + " " + atoms[fPos.second];
      if (prefix.size() >= minNoAtoms) {
        agg->results.push_back(NewPair<string, double>(pattern,
             (double)pdb->smatrix(fPos.first, fPos.second) / agg->noTuples));
      }
      if (prefix.size() + 1 <= maxNoAtoms) {
        pdb->minePDBSMatrix(prefix, pattern, minNoAtoms, maxNoAtoms);
      }
      // restore the prefix for the next cell
      prefix.pop_back();
      prefix.pop_back();
    }
  }
  delete pdb;
}
/*
Class ~ProjectedDB~, function ~retrievePatterns~
*/
void ProjectedDB::retrievePatterns(const unsigned int minNoAtoms,
const unsigned int maxNoAtoms) {
vector<string> atoms;
string atom, patPrefix;
atoms.resize(agg->freqLabels.size(), "");
set<TupleId> commonTupleIds;
double supp = 1.0;
vector<unsigned int> prefix;
for (unsigned int l = 0; l < agg->freqLabels.size(); l++) {
agg->buildAtom(l, agg->entries[l], commonTupleIds, atoms[l]);
if (minNoAtoms == 1) {
supp = double(agg->entries[l].occs.size()) / agg->noTuples;
agg->results.push_back(NewPair<string, double>(atoms[l], supp));
}
}
cout << agg->results.size() << " frequent 1-patterns found" << endl;
if (maxNoAtoms >= 2) {
for (auto fPos : freqMatrixPos) {
agg->buildAtom(freqLabelPos[fPos.second],
agg->entries[freqLabelPos[fPos.second]], commonTupleIds, atom);
patPrefix = atoms[freqLabelPos[fPos.first]] + " " + atom;
if (minNoAtoms <= 2) {
agg->results.push_back(NewPair<string, double>(patPrefix,
(double)smatrix(fPos.first, fPos.second) / agg->noTuples));
}
if (maxNoAtoms >= 3) {
prefix.push_back(fPos.first);
prefix.push_back(fPos.second);
minePDBSMatrix(prefix, patPrefix, minNoAtoms, maxNoAtoms);
prefix.clear();
}
}
}
// vector<unsigned int> prefix;
// for (unsigned int i = 0; i < projPos.size(); i++) {
// prefix.push_back(projPos[i]);
// minePDB(prefix, atoms[projPos[i]], projPos[i], minNoAtoms, maxNoAtoms);
// prefix.pop_back();
// }
std::sort(agg->results.begin(), agg->results.end(), comparePMResults());
}
/*
Class ~ProjectedDB~, Function ~computeProjSize~
Compute storage space in bytes for all projections:
*/
unsigned long long int ProjectedDB::computeProjSize() const {
unsigned long long int result = sizeof(unsigned int);
for (unsigned int i = 0; i < projections.size(); i++) {
result += sizeof(unsigned int);
for (unsigned int j = 0; j < projections[i].size(); j++) {
result += sizeof(unsigned int) * (projections[i][j].size() + 1);
}
}
return result;
}
/*
Class ~ProjectedDB~, Function ~computeMatrixSize~
Compute storage space in bytes for smatrix:
*/
// Returns the number of bytes needed to serialize ~smatrix~: one size field
// plus size * size entries, each of size sizeof(unsigned int).
unsigned long long int ProjectedDB::computeMatrixSize() const {
  return sizeof(unsigned int) * (smatrix.size * smatrix.size + 1);
}
/*
Class ~ProjectedDB~, Function ~computeFreqLabelPosSize~
Compute storage space in bytes for freqLabelPos:
*/
// Returns the number of bytes needed to serialize ~freqLabelPos~: one counter
// plus one entry per element, each of size sizeof(unsigned int).
unsigned long long int ProjectedDB::computeFreqLabelPosSize() const {
  return sizeof(unsigned int) * (freqLabelPos.size() + 1);
}
/*
Class ~ProjectedDB~, Function ~computeFreqMatrixPosSize~
Compute storage space in bytes for freqMatrixPos:
*/
// Returns the number of bytes needed to serialize ~freqMatrixPos~: one counter
// plus two entries per pair, each of size sizeof(unsigned int).
unsigned long long int ProjectedDB::computeFreqMatrixPosSize() const {
  return sizeof(unsigned int) * (2 * freqMatrixPos.size() + 1);
}
/*
Class ~ProjectedDB~, functions for secondo data type
*/
// Converts a label sequence into a nested list of int atoms; an empty
// sequence yields the empty list.
ListExpr ProjectedDB::seqToListExpr(vector<unsigned int>& seq) {
  if (seq.empty()) {
    return nl->Empty();
  }
  ListExpr head = nl->OneElemList(nl->IntAtom(seq[0]));
  ListExpr tail = head;
  for (unsigned int pos = 1; pos < seq.size(); pos++) {
    tail = nl->Append(tail, nl->IntAtom(seq[pos]));
  }
  return head;
}
// Converts one projection (a vector of label sequences) into a nested list
// holding one sublist per sequence; an empty projection yields the empty list.
ListExpr ProjectedDB::projToListExpr(vector<vector<unsigned int> >& proj) {
  if (proj.empty()) {
    return nl->Empty();
  }
  ListExpr head = nl->OneElemList(seqToListExpr(proj[0]));
  ListExpr tail = head;
  for (unsigned int pos = 1; pos < proj.size(); pos++) {
    tail = nl->Append(tail, seqToListExpr(proj[pos]));
  }
  return head;
}
// Builds the nested list representation of a ProjectedDB:
//   ((noTuples <int>) (minSupp <real>) ((<label> <projection>) ...))
// Every label is emitted as a text atom. Previously the first label was
// emitted as a symbol atom and all others as text atoms, which made the
// output inconsistent.
// NOTE(review): assumes agg->freqLabels has at least projections.size()
// elements, exactly as the original code did -- confirm.
ListExpr ProjectedDB::Out(ListExpr typeInfo, Word value) {
  ProjectedDB *pdb = (ProjectedDB*)value.addr;
  ListExpr noTuplesList = nl->TwoElemList(nl->SymbolAtom("noTuples"),
                                          nl->IntAtom(pdb->agg->noTuples));
  ListExpr minSuppList = nl->TwoElemList(nl->SymbolAtom("minSupp"),
                                         nl->RealAtom(pdb->minSupp));
  ListExpr projsList(nl->Empty()), lastElem(nl->Empty());
  for (unsigned int i = 0; i < pdb->projections.size(); i++) {
    ListExpr entry = nl->TwoElemList(
                               nl->TextAtom(pdb->agg->freqLabels[i]),
                               pdb->projToListExpr(pdb->projections[i]));
    if (i == 0) { // start the list with the first entry
      projsList = nl->OneElemList(entry);
      lastElem = projsList;
    }
    else {
      lastElem = nl->Append(lastElem, entry);
    }
  }
  return nl->ThreeElemList(noTuplesList, minSuppList, projsList);
}
// Returns the property list of the type: a list of four headings followed by
// a list with the corresponding four values.
ListExpr ProjectedDB::Property() {
  ListExpr headings = nl->FourElemList(
    nl->StringAtom("Signature"), nl->StringAtom("Example Type List"),
    nl->StringAtom("List Rep"), nl->StringAtom("Example List"));
  ListExpr values = nl->FourElemList(
    nl->StringAtom("-> SIMPLE"),
    nl->StringAtom(ProjectedDB::BasicType()),
    nl->StringAtom("no list representation"),
    nl->StringAtom(""));
  return nl->TwoElemList(headings, values);
}
// Creating a ProjectedDB from a nested list is not supported: ~correct~ is
// always set to false and a null address is returned.
Word ProjectedDB::In(const ListExpr typeInfo, const ListExpr instance,
                     const int errorPos, ListExpr& errorInfo, bool& correct) {
  correct = false; // no list representation can be read
  return SetWord(Address(0));
}
// Returns a Word holding a newly allocated, default-constructed ProjectedDB.
Word ProjectedDB::Create(const ListExpr typeInfo) {
  Word result;
  result.addr = new ProjectedDB();
  return result;
}
// Destroys the ProjectedDB referenced by ~w~ and resets the address to 0.
void ProjectedDB::Delete(const ListExpr typeInfo, Word& w) {
  delete (ProjectedDB*)w.addr;
  w.addr = 0;
}
// Writes a ProjectedDB to ~valueRecord~, starting at ~offset~ (which is
// advanced past the written data). Layout:
//   minSupp (double) | agg (RelAgg::saveToRecord) | recordSize |
//   projections | smatrix | freqLabelPos | freqMatrixPos
// Everything after recordSize is assembled in one buffer and written at once.
// Returns false as soon as a write fails. The buffer is owned by a vector, so
// it is released on every path (the original leaked it if the final write
// failed).
bool ProjectedDB::Save(SmiRecord& valueRecord, size_t& offset,
                       const ListExpr typeInfo, Word& value) {
  ProjectedDB *pdb = (ProjectedDB*)value.addr;
  // store minSupp
  if (!valueRecord.Write(&pdb->minSupp, sizeof(double), offset)) {
    return false;
  }
  offset += sizeof(double);
  // store agg
  if (!RelAgg::saveToRecord(pdb->agg, valueRecord, offset)) {
    return false;
  }
  // prepare storing everything else in one record
  unsigned long long int recordSize = pdb->computeProjSize() +
         pdb->computeMatrixSize() + pdb->computeFreqLabelPosSize() +
         pdb->computeFreqMatrixPosSize();
  cout << "size of record is " << recordSize << endl;
  if (!valueRecord.Write(&recordSize, sizeof(unsigned long long int), offset)) {
    return false;
  }
  offset += sizeof(unsigned long long int);
  vector<char> buffer(recordSize);
  char* recordChars = buffer.data();
  size_t recordOffset = 0;
  // appends one unsigned int to the buffer and advances the write position
  auto putUInt = [recordChars, &recordOffset](const unsigned int v) {
    memcpy(recordChars + recordOffset, &v, sizeof(unsigned int));
    recordOffset += sizeof(unsigned int);
  };
  // store projections: counter, then per projection a counter and its
  // sequences, each sequence as counter followed by its labels
  putUInt(pdb->projections.size());
  for (unsigned int i = 0; i < pdb->projections.size(); i++) {
    putUInt(pdb->projections[i].size());
    for (unsigned int j = 0; j < pdb->projections[i].size(); j++) {
      putUInt(pdb->projections[i][j].size());
      for (unsigned int k = 0; k < pdb->projections[i][j].size(); k++) {
        putUInt(pdb->projections[i][j][k]);
      }
    }
  }
  // store smatrix: size, then size * size entries
  memcpy(recordChars + recordOffset, &pdb->smatrix.size, sizeof(unsigned int));
  recordOffset += sizeof(unsigned int);
  const size_t matrixBytes =
                 pdb->smatrix.size * pdb->smatrix.size * sizeof(unsigned int);
  if (matrixBytes > 0) { // do not hand a possibly null pointer to memcpy
    memcpy(recordChars + recordOffset, pdb->smatrix.values, matrixBytes);
  }
  recordOffset += matrixBytes;
  // store freqLabelPos
  putUInt(pdb->freqLabelPos.size());
  for (unsigned int i = 0; i < pdb->freqLabelPos.size(); i++) {
    putUInt(pdb->freqLabelPos[i]);
  }
  // store freqMatrixPos
  putUInt(pdb->freqMatrixPos.size());
  for (auto freqPos : pdb->freqMatrixPos) {
    putUInt(freqPos.first);
    putUInt(freqPos.second);
  }
  if (!valueRecord.Write(recordChars, recordSize, offset)) {
    return false;
  }
  offset += recordSize;
  return true;
}
// Reads a ProjectedDB from ~valueRecord~ in the layout written by Save and
// stores its address in ~value~. ~offset~ is advanced past the data read.
// Returns false if a record read fails or if the serialized buffer is too
// short for the counters it contains; in that case everything allocated here
// is released again (the original leaked the ProjectedDB, the RelAgg and the
// buffer on every failure path) and ~value~ is left untouched.
bool ProjectedDB::Open(SmiRecord& valueRecord, size_t& offset,
                       const ListExpr typeInfo, Word& value) {
  ProjectedDB *pdb = new ProjectedDB();
  // The RelAgg is attached to pdb only after complete success, so that the
  // cleanup below is correct no matter whether ~ProjectedDB touches agg.
  RelAgg *agg = 0;
  auto fail = [&pdb, &agg]() -> bool {
    delete pdb;
    delete agg;
    return false;
  };
  // read minSupp
  if (!valueRecord.Read(&pdb->minSupp, sizeof(double), offset)) {
    return fail();
  }
  offset += sizeof(double);
  // read agg
  agg = new RelAgg();
  if (!RelAgg::readFromRecord(agg, valueRecord, offset)) {
    return fail();
  }
  pdb->minSuppCnt = (unsigned int)std::ceil(pdb->minSupp * agg->noTuples);
  // read the remaining data into one buffer
  unsigned long long int recordSize;
  if (!valueRecord.Read(&recordSize, sizeof(unsigned long long int), offset)) {
    return fail();
  }
  offset += sizeof(unsigned long long int);
  vector<char> buffer(recordSize);
  if (!valueRecord.Read(buffer.data(), recordSize, offset)) {
    return fail();
  }
  offset += recordSize;
  const char* recordChars = buffer.data();
  const size_t bufferSize = buffer.size();
  size_t recordOffset = 0;
  // reads one unsigned int from the buffer; false if the buffer is exhausted
  auto readUInt = [&](unsigned int& target) -> bool {
    if (bufferSize - recordOffset < sizeof(unsigned int)) {
      return false;
    }
    memcpy(&target, recordChars + recordOffset, sizeof(unsigned int));
    recordOffset += sizeof(unsigned int);
    return true;
  };
  unsigned int noProjections(0), noSequences(0), noLabels(0), label(0),
               noFreqLabelPos(0), matrixSize(0), freqLabelPos(0),
               noFreqMatrixPos(0);
  // read projections
  if (!readUInt(noProjections)) {
    return fail();
  }
  for (unsigned int i = 0; i < noProjections; i++) {
    if (!readUInt(noSequences)) {
      return fail();
    }
    vector<vector<unsigned int> > projection;
    for (unsigned int j = 0; j < noSequences; j++) {
      if (!readUInt(noLabels)) {
        return fail();
      }
      // reject label counts exceeding the remaining buffer before reserving
      if ((bufferSize - recordOffset) / sizeof(unsigned int) < noLabels) {
        return fail();
      }
      vector<unsigned int> labelSeq;
      labelSeq.reserve(noLabels);
      for (unsigned int k = 0; k < noLabels; k++) {
        if (!readUInt(label)) {
          return fail();
        }
        labelSeq.push_back(label);
      }
      projection.push_back(labelSeq);
    }
    pdb->projections.push_back(projection);
  }
  // read smatrix
  if (!readUInt(matrixSize)) {
    return fail();
  }
  // matrixSize^2 entries must fit (division avoids an overflowing product)
  if (matrixSize != 0 &&
      (bufferSize - recordOffset) / sizeof(unsigned int) / matrixSize
      < matrixSize) {
    return fail();
  }
  pdb->smatrix.init(matrixSize);
  const size_t matrixBytes =
                 pdb->smatrix.size * pdb->smatrix.size * sizeof(unsigned int);
  if (matrixBytes > bufferSize - recordOffset) {
    return fail();
  }
  if (matrixBytes > 0) {
    memcpy(pdb->smatrix.values, recordChars + recordOffset, matrixBytes);
  }
  recordOffset += matrixBytes;
  // read freqLabelPos
  if (!readUInt(noFreqLabelPos)) {
    return fail();
  }
  for (unsigned int i = 0; i < noFreqLabelPos; i++) {
    if (!readUInt(freqLabelPos)) {
      return fail();
    }
    pdb->freqLabelPos.push_back(freqLabelPos);
  }
  // read freqMatrixPos
  if (!readUInt(noFreqMatrixPos)) {
    return fail();
  }
  NewPair<unsigned int, unsigned int> freqPos;
  for (unsigned int i = 0; i < noFreqMatrixPos; i++) {
    if (!readUInt(freqPos.first) || !readUInt(freqPos.second)) {
      return fail();
    }
    pdb->freqMatrixPos.push_back(freqPos);
  }
  pdb->agg = agg;
  value.setAddr(pdb);
  return true;
}
// Destroys the in-memory ProjectedDB referenced by ~w~ and resets the address.
void ProjectedDB::Close(const ListExpr typeInfo, Word& w) {
  delete (ProjectedDB*)w.addr;
  w.addr = 0;
}
// Returns a Word holding a copy-constructed duplicate of the ProjectedDB
// referenced by ~w~.
Word ProjectedDB::Clone(const ListExpr typeInfo, const Word& w) {
  ProjectedDB *source = (ProjectedDB*)w.addr;
  Word copy;
  copy.addr = new ProjectedDB(*source);
  return copy;
}
// Returns sizeof(ProjectedDB).
int ProjectedDB::SizeOfObj() {
  const int objSize = sizeof(ProjectedDB);
  return objSize;
}
// Accepts exactly those type expressions that equal the type's basic name.
bool ProjectedDB::TypeCheck(ListExpr type, ListExpr& errorInfo) {
  const bool isProjectedDB = nl->IsEqual(type, BasicType());
  return isProjectedDB;
}
/*
Type constructor for secondo type ~projecteddb~
*/
// Registers the support functions of class ProjectedDB as a Secondo type.
// NOTE(review): the meaning of the three 0 arguments depends on the parameter
// order of TypeConstructor, which is not visible here -- presumably optional
// support functions that this type does not provide; confirm.
TypeConstructor projecteddbTC(
ProjectedDB::BasicType(),
ProjectedDB::Property,
ProjectedDB::Out,
ProjectedDB::In,
0, 0,
ProjectedDB::Create,
ProjectedDB::Delete,
ProjectedDB::Open,
ProjectedDB::Save,
ProjectedDB::Close,
ProjectedDB::Clone,
0,
ProjectedDB::SizeOfObj,
ProjectedDB::TypeCheck);
// Prepares the result tuple type and runs the complete pipeline on the
// aggregation: clear, scan relation ~r~, filter with ~ms~ and ~mem~, and
// derive the patterns having between ~mina~ and ~maxa~ atoms.
GetPatternsLI::GetPatternsLI(Relation *r, const NewPair<int, int> ap, double ms,
                        int mina, int maxa, Geoid *g, const size_t mem) {
  tupleType = getTupleType();
  // the order of the following steps matters
  agg.clear();
  agg.scanRelation(r, ap, g);
  agg.filter(ms, mem);
  agg.derivePatterns(mina, maxa);
}
// Releases this object's reference to the result tuple type.
GetPatternsLI::~GetPatternsLI() {
  tupleType->DeleteIfAllowed();
}
// Creates the result tuple type (Pattern: text, Support: real). The caller
// receives a newly allocated TupleType.
TupleType* GetPatternsLI::getTupleType() {
  ListExpr patternAttr = nl->TwoElemList(nl->SymbolAtom("Pattern"),
                                         nl->SymbolAtom(FText::BasicType()));
  ListExpr supportAttr = nl->TwoElemList(nl->SymbolAtom("Support"),
                                         nl->SymbolAtom(CcReal::BasicType()));
  ListExpr resultTupleType = nl->TwoElemList(
                                    nl->SymbolAtom(Tuple::BasicType()),
                                    nl->TwoElemList(patternAttr, supportAttr));
  SecondoCatalog* catalog = SecondoSystem::GetCatalog();
  return new TupleType(catalog->NumericType(resultTupleType));
}
// Removes the last element of agg.results and returns it as a new tuple of
// type ~tt~ (attribute 0: pattern text, attribute 1: support). Returns 0 if
// no results are left.
Tuple* GetPatternsLI::getNextResult(RelAgg& agg, TupleType *tt) {
  if (agg.results.empty()) {
    return 0;
  }
  NewPair<string, double> current = agg.results.back();
  agg.results.pop_back();
  Tuple *resultTuple = new Tuple(tt);
  resultTuple->PutAttribute(0, new FText(true, current.first));
  resultTuple->PutAttribute(1, new CcReal(true, current.second));
  return resultTuple;
}
// Stores the arguments, prepares the result tuple type and lets the tree
// compute all patterns having between ~mina~ and ~maxa~ atoms.
MineFPTreeLI::MineFPTreeLI(FPTree *t, int mina, int maxa) :
                               tree(t), minNoAtoms(mina), maxNoAtoms(maxa) {
  tupleType = GetPatternsLI::getTupleType();
  tree->retrievePatterns(minNoAtoms, maxNoAtoms);
}
// Deletes the aggregation object of the tree and releases the tuple type.
// The tree itself is not deleted here.
MineFPTreeLI::~MineFPTreeLI() {
  delete tree->agg;
  tupleType->DeleteIfAllowed();
}
// Returns the next result tuple taken from the tree's aggregation object,
// or 0 if there is none left.
Tuple* MineFPTreeLI::getNextResult() {
  RelAgg& treeAgg = *(tree->agg);
  return GetPatternsLI::getNextResult(treeAgg, tupleType);
}
// Stores the arguments, prepares the result tuple type and lets the projected
// database compute all patterns having between ~mina~ and ~maxa~ atoms.
PrefixSpanLI::PrefixSpanLI(ProjectedDB *db, int mina, int maxa) :
                                pdb(db), minNoAtoms(mina), maxNoAtoms(maxa) {
  tupleType = GetPatternsLI::getTupleType();
  pdb->retrievePatterns(minNoAtoms, maxNoAtoms);
}
// Deletes the aggregation object of the projected database and releases the
// tuple type. The projected database itself is not deleted here.
PrefixSpanLI::~PrefixSpanLI() {
  delete pdb->agg;
  tupleType->DeleteIfAllowed();
}
// Returns the next result tuple taken from the projected database's
// aggregation object, or 0 if there is none left.
Tuple* PrefixSpanLI::getNextResult() {
  RelAgg& pdbAgg = *(pdb->agg);
  return GetPatternsLI::getNextResult(pdbAgg, tupleType);
}
/*
Class ~AggEntry~, function ~sequentialJoin~, applied by operator ~spade~
keep periods of ~entry2~ if occurring later than ~entry1~'s
*/
// Merge-joins the occurrences of ~entry1~ and ~entry2~ on their tuple ids
// (both occurrence vectors are traversed in parallel, advancing the one with
// the smaller id). For every common tuple id whose periods are both defined
// and non-empty, and where the first interval of entry1 starts before the
// last interval of entry2 ends, an occurrence is appended to this entry: it
// holds a copy of entry2's periods and the union of both rectangles.
void AggEntry::sequentialJoin(AggEntry& entry1, AggEntry& entry2) {
  if (entry1.occs.empty() || entry2.occs.empty()) {
    return;
  }
  SecInterval firstIv1(true), lastIv2(true);
  firstIv1.start.SetType(instanttype);
  lastIv2.end.SetType(instanttype);
  Rect rect(true);
  unsigned int pos1(0), pos2(0);
  while (pos1 < entry1.occs.size() && pos2 < entry2.occs.size()) {
    const TupleId id1 = get<0>(entry1.occs[pos1]);
    const TupleId id2 = get<0>(entry2.occs[pos2]);
    if (id1 < id2) {
      pos1++;
      continue;
    }
    if (id1 > id2) {
      pos2++;
      continue;
    }
    // tuple ids match
    Periods *per1 = get<1>(entry1.occs[pos1]);
    Periods *per2 = get<1>(entry2.occs[pos2]);
    if (per1->IsDefined() && per2->IsDefined() &&
        !per1->IsEmpty() && !per2->IsEmpty()) {
      per1->Get(0, firstIv1);
      per2->Get(per2->GetNoComponents() - 1, lastIv2);
      if (firstIv1.start < lastIv2.end) {
        Periods *perCopy = new Periods(*per2);
        rect = get<2>(entry1.occs[pos1]).Union(get<2>(entry2.occs[pos2]));
        occsPos.push_back(occs.size());
        occs.push_back(make_tuple(id1, perCopy, rect));
        noOccs++;
        // TODO: update duration
      }
    }
    pos1++;
    pos2++;
  }
}
/*
Class ~RelAgg~, function ~combineEntries~, applied by operator ~spade~
*/
void RelAgg::combineEntries(unsigned int endOfPrefix, RelAgg *ra,
unsigned int label, unsigned int minSuppCnt) {
AggEntry newEntry;
// cout << "Join entries " << endOfPrefix << " and " << label << endl;
newEntry.sequentialJoin(ra->entries[endOfPrefix], ra->entries[label]);
if (newEntry.occs.size() >= minSuppCnt) {
labelPos.insert(make_pair(ra->freqLabels[label], label));
entries[label] = newEntry;
freqLabels[label] = ra->freqLabels[label];
// cout << "... added entry # " << label << " for "
// << ra->freqLabels[label] << ", has " << newEntry.occs.size()
// << " occs" << endl;
}
else {
freqLabels[label] = "";
}
}
/*
Class ~VerticalDB~, function ~mineVerticalDB~
*/
// Recursively extends the pattern given by ~prefix~ / ~patPrefix~. Every label
// of ~ra~ not contained in the prefix is combined with the last prefix label
// in a new RelAgg (created from ~ra~). For each label registered in that new
// RelAgg, the extended pattern is reported in agg->results if it has at least
// ~minNoAtoms~ atoms, and mined further while it has fewer than ~maxNoAtoms~
// atoms. An empty prefix is ignored.
void VerticalDB::mineVerticalDB(vector<unsigned int>& prefix, string& patPrefix,
     RelAgg *ra, const unsigned int minNoAtoms, const unsigned int maxNoAtoms) {
  if (prefix.empty()) {
    return;
  }
  RelAgg *extended = new RelAgg(ra);
  // join the last prefix label with every label that is not in the prefix
  for (const auto& candidate : ra->labelPos) {
    const bool inPrefix = std::find(prefix.begin(), prefix.end(),
                                    candidate.second) != prefix.end();
    if (!inPrefix) {
      extended->combineEntries(prefix.back(), ra, candidate.second,
                               minSuppCnt);
    }
  }
  string atom, patPrefixExt;
  set<TupleId> commonTupleIds;
  for (const auto& frequent : extended->labelPos) {
    unsigned int labelNo = frequent.second;
    extended->buildAtom(labelNo, extended->entries[labelNo], commonTupleIds,
                        atom);
    double supp = (double)extended->entries[labelNo].occs.size() /
                  extended->noTuples;
    patPrefixExt = patPrefix + atom;
    if (minNoAtoms <= prefix.size() + 1) {
      agg->results.push_back(NewPair<string, double>(patPrefixExt, supp));
    }
    if (prefix.size() + 1 < maxNoAtoms) {
      prefix.push_back(labelNo);
      mineVerticalDB(prefix, patPrefixExt, extended, minNoAtoms, maxNoAtoms);
      prefix.pop_back();
    }
  }
  delete extended;
}
/*
Class ~VerticalDB~, function ~retrievePatterns~
*/
// Fills agg->results with the frequent patterns: every frequent label is
// reported as a 1-pattern if minNoAtoms == 1, and used as the start of longer
// patterns (via mineVerticalDB) if maxNoAtoms > 1. Finally, the results are
// sorted with comparePMResults.
void VerticalDB::retrievePatterns(const unsigned int minNoAtoms,
                                  const unsigned int maxNoAtoms) {
  set<TupleId> commonTupleIds;
  string atom;
  vector<unsigned int> prefix;
  for (unsigned int labelNo = 0; labelNo < agg->freqLabels.size(); labelNo++) {
    agg->buildAtom(labelNo, agg->entries[labelNo], commonTupleIds, atom);
    if (minNoAtoms == 1) {
      double supp = double(agg->entries[labelNo].occs.size()) / agg->noTuples;
      agg->results.push_back(NewPair<string, double>(atom, supp));
    }
    if (maxNoAtoms > 1) {
      prefix.push_back(labelNo);
      mineVerticalDB(prefix, atom, agg, minNoAtoms, maxNoAtoms);
      prefix.pop_back();
    }
  }
  std::sort(agg->results.begin(), agg->results.end(), comparePMResults());
}
/*
Class ~VerticalDB~, function ~construct~
*/
// Only prints the frequent labels of the aggregation object to cout.
void VerticalDB::construct() {
  cout << "frequent labels: " << agg->print(agg->freqLabels) << endl;
}
/*
Class ~VerticalDB~, function ~initialize~
*/
// Stores the minimum support ~ms~ and the aggregation object ~ra~, and
// derives the minimum support count as ceil(ms * number of tuples).
void VerticalDB::initialize(const double ms, RelAgg *ra) {
  agg = ra;
  minSupp = ms;
  minSuppCnt = (unsigned int)std::ceil(minSupp * ra->noTuples);
}
/*
Class ~VerticalDB~, functions for Secondo data type
*/
// Returns the property list of the type: a list of four headings followed by
// a list with the corresponding four values.
ListExpr VerticalDB::Property() {
  ListExpr headings = nl->FourElemList(
    nl->StringAtom("Signature"), nl->StringAtom("Example Type List"),
    nl->StringAtom("List Rep"), nl->StringAtom("Example List"));
  ListExpr values = nl->FourElemList(
    nl->StringAtom("-> SIMPLE"),
    nl->StringAtom(VerticalDB::BasicType()),
    nl->StringAtom("no list representation"),
    nl->StringAtom(""));
  return nl->TwoElemList(headings, values);
}
// Creating a VerticalDB from a nested list is not supported: ~correct~ is
// always set to false and a null address is returned.
Word VerticalDB::In(const ListExpr typeInfo, const ListExpr instance,
                    const int errorPos, ListExpr& errorInfo, bool& correct) {
  correct = false; // no list representation can be read
  return SetWord(Address(0));
}
// Builds the nested list ((noTuples <int>) (minSupp <real>)) for a VerticalDB.
ListExpr VerticalDB::Out(ListExpr typeInfo, Word value) {
  VerticalDB *vdb = (VerticalDB*)value.addr;
  ListExpr tupleCount = nl->TwoElemList(nl->SymbolAtom("noTuples"),
                                        nl->IntAtom(vdb->agg->noTuples));
  ListExpr support = nl->TwoElemList(nl->SymbolAtom("minSupp"),
                                     nl->RealAtom(vdb->minSupp));
  return nl->TwoElemList(tupleCount, support);
}
// Returns a Word holding a newly allocated, default-constructed VerticalDB.
Word VerticalDB::Create(const ListExpr typeInfo) {
  Word result;
  result.addr = new VerticalDB();
  return result;
}
// Destroys the VerticalDB referenced by ~w~ and resets the address to 0.
void VerticalDB::Delete(const ListExpr typeInfo, Word& w) {
  delete (VerticalDB*)w.addr;
  w.addr = 0;
}
// Writes a VerticalDB to ~valueRecord~: first minSupp (double, ~offset~ is
// advanced accordingly), then the aggregation object via
// RelAgg::saveToRecord. Returns false as soon as one of the steps fails.
bool VerticalDB::Save(SmiRecord& valueRecord, size_t& offset,
                      const ListExpr typeInfo, Word& value) {
  VerticalDB *vdb = (VerticalDB*)value.addr;
  // store minSupp
  const bool minSuppWritten =
                   valueRecord.Write(&vdb->minSupp, sizeof(double), offset);
  if (!minSuppWritten) {
    return false;
  }
  offset += sizeof(double);
  // store agg
  return RelAgg::saveToRecord(vdb->agg, valueRecord, offset) ? true : false;
}
// Reads a VerticalDB from ~valueRecord~ in the layout written by Save
// (minSupp, then the aggregation object) and stores its address in ~value~.
// Returns false if a read fails; in that case the objects allocated here are
// released again (the original leaked them) and ~value~ is left untouched.
bool VerticalDB::Open(SmiRecord& valueRecord, size_t& offset,
                      const ListExpr typeInfo, Word& value) {
  VerticalDB *vdb = new VerticalDB();
  // read minSupp
  if (!valueRecord.Read(&vdb->minSupp, sizeof(double), offset)) {
    delete vdb;
    return false;
  }
  offset += sizeof(double);
  // read agg; it is attached to vdb only after it was read successfully, so
  // the cleanup is correct no matter whether ~VerticalDB touches agg
  RelAgg *agg = new RelAgg();
  if (!RelAgg::readFromRecord(agg, valueRecord, offset)) {
    delete agg;
    delete vdb;
    return false;
  }
  vdb->agg = agg;
  vdb->minSuppCnt = (unsigned int)std::ceil(vdb->minSupp * vdb->agg->noTuples);
  value.setAddr(vdb);
  return true;
}
// Destroys the in-memory VerticalDB referenced by ~w~ and resets the address.
void VerticalDB::Close(const ListExpr typeInfo, Word& w) {
  delete (VerticalDB*)w.addr;
  w.addr = 0;
}
// Returns a Word holding a copy-constructed duplicate of the VerticalDB
// referenced by ~w~.
Word VerticalDB::Clone(const ListExpr typeInfo, const Word& w) {
  VerticalDB *source = (VerticalDB*)w.addr;
  Word copy;
  copy.addr = new VerticalDB(*source);
  return copy;
}
// Returns sizeof(VerticalDB).
int VerticalDB::SizeOfObj() {
  const int objSize = sizeof(VerticalDB);
  return objSize;
}
// Accepts exactly those type expressions that equal the type's basic name.
bool VerticalDB::TypeCheck(ListExpr type, ListExpr& errorInfo) {
  const bool isVerticalDB = nl->IsEqual(type, BasicType());
  return isVerticalDB;
}
/*
Type constructor for the secondo type of class ~VerticalDB~
*/
// Registers the support functions of class VerticalDB as a Secondo type.
// NOTE(review): the meaning of the three 0 arguments depends on the parameter
// order of TypeConstructor, which is not visible here -- presumably optional
// support functions that this type does not provide; confirm.
TypeConstructor verticaldbTC(
VerticalDB::BasicType(),
VerticalDB::Property,
VerticalDB::Out,
VerticalDB::In,
0, 0,
VerticalDB::Create,
VerticalDB::Delete,
VerticalDB::Open,
VerticalDB::Save,
VerticalDB::Close,
VerticalDB::Clone,
0,
VerticalDB::SizeOfObj,
VerticalDB::TypeCheck);
// Stores the arguments, prepares the result tuple type and lets the vertical
// database compute all patterns having between ~mina~ and ~maxa~ atoms.
SpadeLI::SpadeLI(VerticalDB *db, int mina, int maxa) :
                                vdb(db), minNoAtoms(mina), maxNoAtoms(maxa) {
  tupleType = GetPatternsLI::getTupleType();
  vdb->retrievePatterns(minNoAtoms, maxNoAtoms);
}
// Deletes the aggregation object of the vertical database and releases the
// tuple type. The vertical database itself is not deleted here.
SpadeLI::~SpadeLI() {
  delete vdb->agg;
  tupleType->DeleteIfAllowed();
}
// Returns the next result tuple taken from the vertical database's
// aggregation object, or 0 if there is none left.
Tuple* SpadeLI::getNextResult() {
  RelAgg& vdbAgg = *(vdb->agg);
  return GetPatternsLI::getNextResult(vdbAgg, tupleType);
}
/*
Class ~SplSemTraj~, functions for Secondo data type used for splitter
*/
// Comparison function for sorting SplTSPlace elements by ~instDbl~; returns
// 0 for equal values, -1 if ~a~ is smaller, and 1 otherwise.
// The elements are only read here. The original version ran a placement new
// with the default constructor on both (const) elements, which writes into
// the elements being sorted unless that constructor leaves all members
// untouched; a plain pointer cast needs no such assumption.
int tsPlaceCmp(const void *a, const void *b) {
  const SplTSPlace *tsp1 = static_cast<const SplTSPlace*>(a);
  const SplTSPlace *tsp2 = static_cast<const SplTSPlace*>(b);
  if (tsp1->instDbl == tsp2->instDbl) {
    return 0;
  }
  return tsp1->instDbl < tsp2->instDbl ? -1 : 1;
}
// Returns the property list of the type: a list of four headings followed by
// a list with the corresponding four values.
ListExpr SplSemTraj::Property() {
  ListExpr headings = nl->FourElemList(
    nl->StringAtom("Signature"), nl->StringAtom("Example Type List"),
    nl->StringAtom("List Rep"), nl->StringAtom("Example List"));
  ListExpr values = nl->FourElemList(
    nl->StringAtom("-> SIMPLE"),
    nl->StringAtom(SplSemTraj::BasicType()),
    nl->StringAtom("((t_1, l_1, c_1), ..., (t_n, l_n, c_n))"),
    nl->StringAtom("((2021-09-09-12:45, (7.45, 51.49), \"Stadion\"))"));
  return nl->TwoElemList(headings, values);
}
// Word SplSemTraj::In(const ListExpr typeInfo, const ListExpr instance,
// const int errorPos, ListExpr& errorInfo, bool& correct) {
// correct = false;
// return SetWord(Address(0));
// }
// Replaces the contents by the nested list ~LE~. The undefined symbol yields
// an undefined value, the empty list a defined, empty value. Otherwise ~LE~
// must be a list whose elements can each be read by SplTSPlace::fromList; the
// elements are appended in list order and sorted afterwards. Returns false
// if ~LE~ is an atom or an element cannot be read (elements read before the
// failure remain appended).
bool SplSemTraj::ReadFrom(ListExpr LE, const ListExpr typeInfo) {
  clear();
  if (listutils::isSymbolUndefined(LE)) {
    SetDefined(false);
    return true;
  }
  SetDefined(true);
  if (nl->IsEmpty(LE)) {
    return true;
  }
  if (nl->IsAtom(LE)) {
    return false;
  }
  SplTSPlace tsp;
  for (ListExpr rest = LE; !nl->IsEmpty(rest); rest = nl->Rest(rest)) {
    ListExpr elem = nl->First(rest);
    if (!tsp.fromList(elem)) {
      return false;
    }
    append(tsp);
  }
  sort();
  return true;
}
// Returns the nested list representation: the undefined list for an undefined
// value, the empty list for an empty value, otherwise one list element per
// stored SplTSPlace in storage order.
ListExpr SplSemTraj::ToListExpr(ListExpr typeInfo) const {
  if (!IsDefined()) {
    return listutils::getUndefined();
  }
  if (isEmpty()) {
    return nl->Empty();
  }
  ListExpr head = nl->OneElemList(get(0).toListExpr());
  ListExpr tail = head;
  for (int pos = 1; pos < size(); pos++) {
    tail = nl->Append(tail, get(pos).toListExpr());
  }
  return head;
}
// Three-way comparison with ~arg~; returns -1, 0, or 1.
// Order: undefined < defined; empty < non-empty; fewer elements < more
// elements; for equal sizes, the first differing ~instDbl~ decides.
// Two defects of the original are fixed here:
//  - an empty value compared with a non-empty one returned 1 instead of -1,
//    so both directions of the comparison yielded 1;
//  - the element comparison returned the result of operator< (1 or 0) instead
//    of -1 or 1.
int SplSemTraj::Compare(const Attribute* arg) const {
  if (!IsDefined()) {
    return arg->IsDefined() ? -1 : 0;
  }
  if (!arg->IsDefined()) {
    return 1;
  }
  SplSemTraj* sst = (SplSemTraj*)arg;
  if (isEmpty()) {
    return sst->isEmpty() ? 0 : -1;
  }
  if (sst->isEmpty()) {
    return 1;
  }
  if (size() != sst->size()) {
    return size() < sst->size() ? -1 : 1;
  }
  for (int i = 0; i < size(); i++) {
    SplTSPlace ts1 = get(i);
    SplTSPlace ts2 = sst->get(i);
    if (ts1.instDbl != ts2.instDbl) {
      return ts1.instDbl < ts2.instDbl ? -1 : 1;
    }
  }
  return 0;
}
// Adjacency is not supported for this type; always returns false.
bool SplSemTraj::Adjacent(const Attribute* arg) const {
  const bool adjacent = false;
  return adjacent;
}
// Returns a newly allocated copy of this object (copy constructor).
Attribute* SplSemTraj::Clone() const {
  SplSemTraj *copy = new SplSemTraj(*this);
  return copy;
}
// Hash value derived from the first instant (in milliseconds) and the number
// of elements, reduced modulo 1024; 0 for undefined or empty values.
size_t SplSemTraj::HashValue() const {
  if (IsDefined() && !isEmpty()) {
    return firstInst().GetAllMilliSeconds() * size() % 1024;
  }
  return 0;
}
void SplSemTraj::CopyFrom(const Attribute* arg) {
tsPlaces.copyFrom(((SplSemTraj*)arg)->getTSPlaces());
}
// Kind check; delegates to checkType.
bool SplSemTraj::CheckKind(ListExpr type, ListExpr& errorInfo) {
  const bool typeOk = checkType(type);
  return typeOk;
}
// Returns sizeof(SplSemTraj).
size_t SplSemTraj::Sizeof() const {
  const size_t objSize = sizeof(SplSemTraj);
  return objSize;
}
// Sorts the stored places using the comparison function tsPlaceCmp.
void SplSemTraj::sort() {
  tsPlaces.Sort(tsPlaceCmp);
}
int SplSemTraj::find(const SplPlace& sp, const double tolerance,
const Geoid *geoid) const {
Point p1(true), p2(true, sp.x, sp.y);
for (int i = 0; i < size(); i++) {
SplTSPlace tsp = get(i);
if (tsp.cat == sp.cat) {
p1.Set(tsp.x, tsp.y);
if (p1.Distance(p2, geoid) <= tolerance) {
return i;
}
}
}
return -1;
}
// Returns the positions of all stored places whose category equals ~label~.
set<int> SplSemTraj::getPositions(string label) const {
  set<int> positions;
  for (int pos = 0; pos < size(); pos++) {
    SplTSPlace tsp = get(pos);
    if (string(tsp.cat) == label) {
      positions.insert(pos);
    }
  }
  return positions;
}
void SplSemTraj::convertFromMPointMLabel(const MPoint& mp, const MLabel& ml,
const double tolerance, const Geoid *geoid /* = 0 */) {
clear();
DateTime inst(instanttype);
UPoint up(true);
IPoint ip(true);
Point pt(true), lastpt(false);
ILabel il(true);
string cat, lastcat;
for (int i = 0; i < mp.GetNoComponents(); i++) {
mp.Get(i, up);
inst = up.timeInterval.start;
up.TemporalFunction(inst, pt, geoid, true);
ml.AtInstant(inst, il);
if (il.value.IsDefined()) {
cat = il.value.GetLabel().substr(0, 48);
if (i == 0 || cat != lastcat || pt.Distance(lastpt, geoid) > tolerance) {
SplTSPlace tsPlace(inst, pt, cat);
append(tsPlace);
lastpt = pt;
lastcat = cat;
}
}
}
}
bool SplSemTraj::contains(const SplPlace& sp, const double deltaT,
const double tolerance, const Geoid* geoid) const {
Point p1(true), p2(true, sp.x, sp.y);
for (int i = 0; i < size(); i++) {
SplTSPlace tsp = get(i);
p1.Set(tsp.x, tsp.y);
if (tsp.cat == sp.cat && p1.Distance(p2, geoid) <= tolerance &&
tsp.instDbl < deltaT) {
return true;
}
}
return false;
}
// Returns the places stored after position ~pos~; the time value of each of
// them is reduced by the time value of the place at ~pos~.
// ~pos~ must be a valid position (checked by assert).
SplSemTraj SplSemTraj::postfix(const int pos) const {
  assert(pos >= 0 && pos < size());
  DateTime reference(0.0), shifted(0.0);
  reference.ReadFrom(get(pos).instDbl);
  reference.SetType(durationtype);
  SplSemTraj result(1);
  for (int next = pos + 1; next < size(); next++) {
    SplTSPlace tsp = get(next);
    shifted.ReadFrom(tsp.instDbl);
    shifted -= reference;
    tsp.instDbl = shifted.ToDouble();
    result.append(tsp);
  }
  return result;
}
// For every position >= 1 at which the category of ~sp~ occurs, the postfix
// following that position is appended to ~result~, unless it is empty or its
// first place is almost equal to ~sp~ (w.r.t. ~eps~ and ~geoid~).
void SplSemTraj::addPostfixes(SplPlace sp, const double eps, const Geoid* geoid,
                              vector<SplSemTraj>& result) const {
  set<int> catPositions = getPositions(sp.cat);
  for (auto catPos : catPositions) {
    if (catPos < 1) { // position 0 is not considered
      continue;
    }
    SplSemTraj pf = postfix(catPos);
    if (pf.isEmpty() || pf.first().almostEqual(sp, eps, geoid)) {
      continue;
    }
    result.push_back(pf);
  }
}
/*
Implementation of class ~Splitter~, used for operator ~splitter~
*/
// Stores the parameters, prepares the result tuple type and computes all
// results by processing the stream ~s~ (see initialProjection).
Splitter::Splitter(Word& s, const double sm, DateTime& mtt, const int mna,
                   const double e, Geoid* g, const int attrNo) :
              pos(0), deltaT(mtt), maxNoAtoms(mna), eps(e), geoid(g) {
  tupleType = getTupleType();
  initialProjection(s, sm, attrNo);
}
// Releases this object's reference to the result tuple type.
Splitter::~Splitter() {
  tupleType->DeleteIfAllowed();
}
// Creates the result tuple type (Pattern: text, Support: real). The caller
// receives a newly allocated TupleType.
TupleType* Splitter::getTupleType() {
  ListExpr patternAttr = nl->TwoElemList(nl->SymbolAtom("Pattern"),
                                         nl->SymbolAtom(FText::BasicType()));
  ListExpr supportAttr = nl->TwoElemList(nl->SymbolAtom("Support"),
                                         nl->SymbolAtom(CcReal::BasicType()));
  ListExpr resultTupleType = nl->TwoElemList(
                                    nl->SymbolAtom(Tuple::BasicType()),
                                    nl->TwoElemList(patternAttr, supportAttr));
  SecondoCatalog* catalog = SecondoSystem::GetCatalog();
  return new TupleType(catalog->NumericType(resultTupleType));
}
// Computes the frequent items of the input stream and reports each of them
// as a pattern consisting of one place. Unless maxNoAtoms is 1, the postfixes
// of each frequent item are collected from the source trajectories in which
// it occurs, and longer patterns are searched via prefixSpan.
void Splitter::initialProjection(Word& s, const double sm, const int attrNo) {
  map<SplPlace, set<int>, SplPlaceSorter> freqItems(SplPlaceSorter(eps, geoid));
  computeFrequentItems(s, attrNo, sm, freqItems);
  const bool extendPatterns = (maxNoAtoms != 1);
  for (const auto& item : freqItems) {
    vector<SplSemTraj> postfixes;
    if (extendPatterns) {
      for (auto srcPos : item.second) {
        source[srcPos].addPostfixes(item.first, eps, geoid, postfixes);
      }
    }
    SplSemTraj sst(1);
    SplTSPlace tsp(DateTime(0.0), Point(true, item.first.x, item.first.y),
                   item.first.cat);
    sst.append(tsp);
    addSnippets(sst, item.second.size());
    if (extendPatterns) {
      prefixSpan(sst, postfixes);
    }
  }
}
// Recursively extends ~prefix~ by the items that are locally frequent in the
// postfixes ~pf~. Every extended prefix is reported via addSnippets, and the
// recursion continues with the postfixes of the new item. Nothing is done if
// ~prefix~ already has maxNoAtoms (or more) places.
// Compared to the original, an unused local map was removed, the map entries
// are no longer copied per iteration, and the inner loop variable no longer
// shadows the member ~pos~.
void Splitter::prefixSpan(SplSemTraj& prefix, vector<SplSemTraj> pf) {
  if (prefix.size() >= maxNoAtoms) {
    return;
  }
  map<SplPlace, set<int>, SplPlaceSorter> localFreqItems(SplPlaceSorter(eps,
                                                                      geoid));
  computeLocalFreqItems(prefix.last(), pf, localFreqItems);
  for (const auto& item : localFreqItems) {
    SplSemTraj extended(prefix);
    SplTSPlace tsp(DateTime(0.0), Point(true, item.first.x, item.first.y),
                   item.first.cat);
    extended.append(tsp);
    // collect the postfixes of the new item from all postfixes containing it
    vector<SplSemTraj> postfixes;
    for (auto pfPos : item.second) {
      pf[pfPos].addPostfixes(item.first, eps, geoid, postfixes);
    }
    addSnippets(extended, item.second.size());
    prefixSpan(extended, postfixes);
  }
}
// Debug helper: returns a text listing the prefix ~pref~ and, one per line,
// all postfixes in ~pf~.
string Splitter::postfixesToString(SplSemTraj pref, vector<SplSemTraj>& pf) {
  stringstream out;
  out << "Postfixes of " << pref.toString() << " : " << endl;
  for (auto& postfix : pf) {
    out << "  " << postfix.toString() << endl;
  }
  return out.str();
}
// Debug helper: returns a text with one line per frequent item, listing the
// item and the numbers stored for it; a fixed message if there are no items.
string Splitter::freqItemsToString(map<SplPlace, set<int>,
                                       SplPlaceSorter>& freqItems) {
  stringstream out;
  if (freqItems.empty()) {
    out << "No frequent items" << endl;
    return out.str();
  }
  for (const auto& item : freqItems) {
    out << "Freq item " << item.first.toString() << " : ";
    for (const auto occ : item.second) {
      out << occ << ", ";
    }
    out << endl;
  }
  out << endl;
  return out.str();
}
// Reads all tuples of stream ~s~, stores a copy of attribute ~attrNo~ of each
// tuple in ~source~, and records for every place the numbers of the tuples
// containing it. ~freqmin~ is set to ceil(number of tuples * sm); all places
// occurring in at least ~freqmin~ tuples are inserted into ~freqItems~.
void Splitter::computeFrequentItems(Word& s, const int attrNo, const double sm,
                        map<SplPlace, set<int>, SplPlaceSorter>& freqItems) {
  // collect all SplPlaces with occurrences
  map<SplPlace, set<int>, SplPlaceSorter> allItems(SplPlaceSorter(eps, geoid));
  Stream<Tuple> stream(s);
  stream.open();
  int counter = 0;
  for (Tuple* tuple = stream.request(); tuple; tuple = stream.request()) {
    SplSemTraj sst(*(SplSemTraj*)(tuple->GetAttribute(attrNo)));
    for (int placeNo = 0; placeNo < sst.size(); placeNo++) {
      SplPlace sp(sst.get(placeNo));
      allItems[sp].insert(counter);
    }
    source.push_back(sst);
    tuple->DeleteIfAllowed();
    counter++;
  }
  stream.close();
  // retrieve frequent labels
  freqmin = ceil(counter * sm);
  for (const auto& item : allItems) {
    if (item.second.size() >= freqmin) {
      freqItems.insert(item);
    }
  }
}
// Determines the places that are frequent within the postfixes ~pf~. A place
// is counted for a postfix if its time value is below deltaT; the first place
// of a postfix is skipped if it is almost equal to ~tsp~. All places counted
// for at least ~freqmin~ postfixes are inserted into ~freqItems~.
void Splitter::computeLocalFreqItems(SplTSPlace tsp, vector<SplSemTraj> pf,
                       map<SplPlace, set<int>, SplPlaceSorter >& freqItems) {
  map<SplPlace, set<int>, SplPlaceSorter> allItems(SplPlaceSorter(eps, geoid));
  int pfNo = 0;
  for (auto& postfix : pf) {
    for (int i = 0; i < postfix.size(); i++) {
      SplTSPlace spl(postfix.get(i));
      const bool considered = i > 0 || !spl.almostEqual(tsp, eps, geoid);
      if (considered && spl.instDbl < deltaT.ToDouble()) {
        allItems[spl].insert(pfNo);
      }
    }
    pfNo++;
  }
  for (const auto& item : allItems) {
    if (item.second.size() >= freqmin) {
      freqItems.insert(item);
    }
  }
}
// Appends the non-empty pattern ~sst~ to the results, together with its
// relative frequency ~freq~ / number of source trajectories.
void Splitter::addSnippets(SplSemTraj sst, const int freq) {
  if (!sst.isEmpty()) {
    result.push_back(make_pair(sst, (double)freq / source.size()));
  }
}
// Returns the next result as a new tuple (attribute 0: pattern text,
// attribute 1: support) and advances the read position; returns 0 after the
// last result.
Tuple* Splitter::next() {
  assert(pos <= result.size());
  if (pos == result.size()) {
    return 0;
  }
  Tuple *resultTuple = new Tuple(tupleType);
  resultTuple->PutAttribute(0, new FText(true, result[pos].first.toString()));
  resultTuple->PutAttribute(1, new CcReal(true, result[pos].second));
  pos++;
  return resultTuple;
}
}