Files
secondo/Algebras/StreamMining/StreamMiningAlgebra.cpp
2026-01-23 17:03:45 +08:00

4081 lines
108 KiB
C++

/*
----
This file is part of SECONDO.
Copyright (C) 2004, University in Hagen, Department of Computer Science,
Database Systems for New Applications.
SECONDO is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
SECONDO is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with SECONDO; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
----
//paragraph [1] title: [{\Large \bf ] [}]
//[->] [$\rightarrow$]
[1] Stream Mining Algebra
October 2021, T. Eschbach implemented this Algebra as part of his Bachelor Thesis
0 Overview
This algebra can be used to apply different Datamining techniques to Streams.
It provides the following operators:
* reservoir: stream x int [->] (stream)
Creates a reservoir sample of size int for a stream
* tilted: stream x int [->] (stream)
Creates a tilted time frame sample for a stream.
The type (natural, logarithmic, progressive logarithmic) depends
on the provided int.
* createbloomfilter: stream(tuple(X)) x ATTR x real -> bloomfilter
Creates a Bloomfilter for a Stream with maximum error probability float and size int.
* bloomcontains: bloomfilter x T [->] bool
Checks whether the provided Argument of Type T is present in the filter
* createcountmin: stream(tuple(X)) x ATTR x real x real -> countmin
Creates a Count-Min Sketch for a given Stream
1 Preliminaries
1.1 Includes
*/
#include "Algebra.h"
#include "NestedList.h"
#include "NList.h"
#include "QueryProcessor.h"
#include "AlgebraManager.h"
#include "StandardTypes.h"
#include "Symbols.h"
#include "Stream.h"
#include "ListUtils.h"
#include "Algebras/Standard-C++/LongInt.h"
#include "Algebras/Relation-C++/RelationAlgebra.h"
#include "MurmurHash.h"
#include "BloomFilter.h"
#include "CountMinSketch.h"
#include "amsSketch.h"
#include "lossyCounter.h"
#include "cPoint.h"
#include "Cluster.h"
#include "kMeans.h"
#include <algorithm>
#include <cmath>
#include <iostream>
#include <string>
#include <vector>
#include <time.h>
using namespace std;
extern NestedList* nl;
extern QueryProcessor* qp;
extern AlgebraManager* am;
namespace eschbach {
/*
2 Algebra Implementation
2.1 Data Structures
2.1.1 Class ~BloomFilter~
*/
// Builds a filter consisting of a single, empty sub-filter that is sized
// for DEFAULT_SIZE insertions at the requested false positive probability.
ScalableBloomFilter::ScalableBloomFilter
(const double inputFP) {
  // The rolling probability starts out identical to the requested one.
  falsePositiveProbability = inputFP;
  rollingFP = inputFP;
  defined = true;
  // Nothing has been inserted yet and sub-filter 0 is the active one.
  curFilterIndex = 0;
  currentInserts = 0;
  // Start with a small filter so that not too much space is wasted.
  maxInserts = DEFAULT_SIZE;
  filterSize = optimalSize(maxInserts, inputFP);
  numHashfunctions = optimalHashes(maxInserts, filterSize);
  // The bits of the new sub-filter are value-initialized, i.e. false.
  filterList.resize(1);
  filterList.front().resize(filterSize);
  assert (numHashfunctions>0);
}
// Copy constructor. Copies every member that the other constructor sets,
// including the bookkeeping values curFilterIndex, filterSize and rollingFP.
// These three were previously left out, so a copy (e.g. one produced by
// Clone) carried indeterminate values that add() and updateFilterValues()
// read.
ScalableBloomFilter::ScalableBloomFilter(const ScalableBloomFilter& rhs) {
  defined = rhs.defined;
  falsePositiveProbability = rhs.falsePositiveProbability;
  rollingFP = rhs.rollingFP;
  maxInserts = rhs.maxInserts;
  currentInserts = rhs.currentInserts;
  curFilterIndex = rhs.curFilterIndex;
  filterSize = rhs.filterSize;
  numHashfunctions = rhs.numHashfunctions;
  ithFilterHashes = rhs.ithFilterHashes;
  filterList = rhs.filterList;
}
//Setter and Getter
bool
ScalableBloomFilter::getDefined() const {
return defined;
}
void
ScalableBloomFilter::setDefined() {
defined = true;
}
size_t
ScalableBloomFilter::getCurMaxInserts() const{
return maxInserts;
}
double
ScalableBloomFilter::getFP() const{
return falsePositiveProbability;
}
double
ScalableBloomFilter::getRolFP() {
return rollingFP;
}
// Returns a COPY of the sub-filter at the given position. Changes made to
// the returned vector do not affect this filter.
vector<bool>
ScalableBloomFilter::getSubFilter(size_t index){
  const vector<bool>& requested = filterList[index];
  return requested;
}
// Appends the given bit vector as a new, last sub-filter.
void
ScalableBloomFilter::setSubFilter(vector<bool> inputSubFilter) {
  filterList.emplace_back(inputSubFilter);
}
// Returns a COPY of all sub-filters.
vector<vector<bool>>
ScalableBloomFilter::getFilterList() {
  vector<vector<bool>> snapshot(filterList);
  return snapshot;
}
// Reads the bit at position eleIndex of sub-filter filterIndex.
// No range checking is performed on either index.
bool
ScalableBloomFilter::getElement(size_t filterIndex, size_t eleIndex) const{
  const vector<bool>& subFilter = filterList[filterIndex];
  return subFilter[eleIndex];
}
// Writes value to the bit at position eleIndex of sub-filter filterIndex
// and returns the state the bit had before. add() uses the previous state
// to decide whether an insertion changed the filter.
bool
ScalableBloomFilter::setElement(size_t filterIndex,
                                size_t eleIndex, bool value) {
  vector<bool>::reference bit = filterList[filterIndex][eleIndex];
  const bool previous = bit;
  bit = value;
  return previous;
}
// Writes value to the bit at position eleIndex of sub-filter filterIndex
// without reporting the previous state.
void
ScalableBloomFilter::setElementOpen(size_t filterIndex,
                                    size_t eleIndex, bool value) {
  vector<bool>& subFilter = filterList[filterIndex];
  subFilter[eleIndex] = value;
}
// Appends the given hash function counts to the per-sub-filter list.
// Existing entries are kept.
void
ScalableBloomFilter::setFilterHashes(vector<int> nbrHashes) {
  ithFilterHashes.insert(ithFilterHashes.end(),
                         nbrHashes.begin(), nbrHashes.end());
}
int
ScalableBloomFilter::getCurNumberHashes() const{
return numHashfunctions;
}
// Returns a COPY of the list holding the number of hash functions that
// was recorded for each sub-filter.
vector<int>
ScalableBloomFilter::getFilterHashes() const{
  vector<int> hashesPerFilter(ithFilterHashes);
  return hashesPerFilter;
}
// Returns the number of bits of the last sub-filter in the list.
size_t
ScalableBloomFilter::getCurFilterSize() const{
  const vector<bool>& newest = filterList.back();
  return newest.size();
}
// Returns the number of sub-filters this scalable filter consists of.
size_t
ScalableBloomFilter::getBloomSize() const {
  const size_t subFilterCount = filterList.size();
  return subFilterCount;
}
//Auxiliary Functions
void
ScalableBloomFilter::initialize(double fp) {
defined = true;
falsePositiveProbability = fp;
rollingFP = fp;
maxInserts = DEFAULT_SIZE;
filterSize = optimalSize(maxInserts, fp);
numHashfunctions = optimalHashes(maxInserts, filterSize);
filterList.resize(1);
filterList[0].resize(filterSize);
ithFilterHashes.push_back(numHashfunctions);
}
// Computes the number of bits m = -n * ln(p) / (ln 2)^2 for n expected
// insertions and false positive probability p. The result is at least 1.
//
// The value is checked while it is still a double: converting a negative
// or NaN double to size_t (which happens for p >= 1, p < 0 or n <= 0) is
// undefined, so the former check after the conversion came too late.
size_t
ScalableBloomFilter::optimalSize(const long expectedInserts,
                                 const double fPProb) {
  const double ln2 = log(2.0);
  const double bits =
      -static_cast<double>(expectedInserts) * log(fPProb) / (ln2 * ln2);
  // Written as a negated comparison so that NaN takes this branch too.
  if (!(bits >= 1.0)) {
    return 1;
  }
  return static_cast<size_t>(bits);
}
// Computes the number of hash functions k = (m / n) * ln 2, rounded to the
// nearest integer, for a filter of m = filterSize bits and
// n = expectedInserts insertions. The result is at least 1.
//
// The quotient is formed in floating point. The former integer division
// truncated m / n before the multiplication (e.g. 77 / 8 gave 9 instead of
// 9.625, i.e. 6 hashes instead of 7) and divided by zero for n == 0.
long
ScalableBloomFilter::optimalHashes(const long expectedInserts,
                                   const long filterSize) {
  if (expectedInserts <= 0) {
    return 1;
  }
  const double bitsPerElement = static_cast<double>(filterSize)
                                / static_cast<double>(expectedInserts);
  const long hashes = lround(bitsPerElement * log(2.0));
  return hashes < 1 ? 1 : hashes;
}
// Checks whether all bits addressed by hashResults are set in the
// sub-filter filterIndex. Returns false as soon as one unset bit is found.
// An undefined filter and an empty list of hash results both yield true.
// The indices are not range checked.
bool
ScalableBloomFilter::contains(vector<size_t> hashResults,
                              int filterIndex) const {
  if (!defined) {
    return true;
  }
  for (const size_t bitPosition : hashResults) {
    const bool bitIsSet = filterList[filterIndex][bitPosition];
    if (!bitIsSet) {
      return false;
    }
  }
  return true;
}
// Sets the bits addressed by hashResults in the active sub-filter.
// The insert counter is only increased when at least one of these bits was
// not set before, i.e. when the insertion raised the fill rate.
//
// Index 0 is a valid bit position. It used to be skipped (eleIndex > 0),
// so an element hashing to bit 0 never set that bit while contains() still
// tested it, which produced false negatives.
void
ScalableBloomFilter::add(vector<size_t> hashResults) {
  if (!defined) {
    return;
  }
  bool alreadyAdded = true;
  for (size_t eleIndex : hashResults) {
    // Positions outside of the active sub-filter are ignored.
    if (eleIndex < filterSize) {
      alreadyAdded &= setElement(curFilterIndex, eleIndex, true);
    }
  }
  if (!alreadyAdded) {
    currentInserts++;
  }
}
// True once the active sub-filter has counted at least as many inserts as
// it was sized for.
bool
ScalableBloomFilter::isSaturated() {
  const bool capacityReached = !(currentInserts < maxInserts);
  return capacityReached;
}
// Update the parameters and add a new Subfilter to our Scalable Bloom
// Appends a new, empty sub-filter and makes it the active one. The
// capacity grows by GROWTH_RATE, the rolling false positive probability
// is multiplied by TIGHTENING_RATIO and the insert counter restarts at 0.
// The new state is reported on stdout.
void
ScalableBloomFilter::updateFilterValues() {
  cout << endl << "Updating filter Values; ";

  // Bookkeeping for the next sub-filter.
  ++curFilterIndex;
  currentInserts = 0;
  maxInserts *= GROWTH_RATE;
  rollingFP *= TIGHTENING_RATIO;
  filterSize = optimalSize(maxInserts,rollingFP);
  numHashfunctions = optimalHashes(maxInserts, filterSize);

  // Create the sub-filter and remember its number of hash functions.
  filterList.resize(curFilterIndex+1);
  filterList.back().resize(filterSize);
  ithFilterHashes.push_back(numHashfunctions);

  cout << "Filter now: " << endl
       << "Current Filter Index: " << curFilterIndex << endl
       << "Subfilters: " << filterList.size() << endl
       << "Current Filter Size: " << filterSize << endl
       << "Current Hashes: " << numHashfunctions << endl
       << "Previous Amount of Hashes in NbrHashesVector: "
       << ithFilterHashes[curFilterIndex-1] << endl
       << "Total Inserts: " << currentInserts << endl;
}
//~In~/~Out~ Functions
// In-Function: turns the list representation
//   (<fp : real> <size : int> (<bool> <bool> ...))
// into a filter object. fp is the false positive probability, size is the
// number of list bits to take over, and the bool list holds the bits of
// the first sub-filter.
//
// On success correct is set to true and the result holds the new filter.
// On any malformed input an error is reported via cmsg, correct stays
// false and a null address is returned. Nothing is allocated in that case.
//
// Previously the function leaked the filter it built, wrote the bits into
// a temporary copy returned by getSubFilter() and never reported success.
Word
ScalableBloomFilter::In(const ListExpr typeInfo, const ListExpr instance,
                        const int errorPos, ListExpr& errorInfo,
                        bool& correct) {
  Word result = SetWord(Address(0));
  correct = false;
  NList list (instance);
  if(list.length() != 3){
    cmsg.inFunError("expected three arguments");
    return result;
  }
  NList first = list.first();
  NList second = list.second();
  NList third = list.third();
  if(!first.isReal() || !second.isInt() || !third.isList()) {
    cmsg.inFunError("expected three numbers");
    return result;
  }

  const double fp = first.realval();
  const int declaredBits = second.intval();
  if (!(fp > 0.0 && fp < 1.0) || declaredBits < 0) {
    cmsg.inFunError("expected a probability in (0,1) and a size >= 0");
    return result;
  }

  // Validate the complete bit list before anything is allocated.
  const ListExpr bits = nl->Third(instance);
  if (nl->IsEmpty(bits)) {
    cmsg.inFunError("expected a non-empty list of bool values");
    return result;
  }
  for (ListExpr rest = bits; !nl->IsEmpty(rest); rest = nl->Rest(rest)) {
    const ListExpr bit = nl->First(rest);
    if (!nl->IsAtom(bit) || nl->AtomType(bit) != BoolType) {
      cmsg.inFunError("expected a list of bool values");
      return result;
    }
  }

  ScalableBloomFilter* bloom = new ScalableBloomFilter(fp);
  // initialize() records the hash count of the first sub-filter, which
  // the constructor alone does not do.
  bloom->initialize(fp);

  // Take over at most as many bits as were declared, as the list holds
  // and as the first sub-filter can store.
  const size_t capacity = bloom->filterList[0].size();
  const size_t wanted = static_cast<size_t>(declaredBits);
  size_t position = 0;
  for (ListExpr rest = bits;
       position < wanted && position < capacity && !nl->IsEmpty(rest);
       rest = nl->Rest(rest), ++position) {
    bloom->setElementOpen(0, position, nl->BoolValue(nl->First(rest)));
  }

  result = SetWord(bloom);
  correct = true;
  return result;
}
// Out-Function: turns the class representation into the list
//   (<fp : real> <size : int> (<bool> <bool> ...)).
// Only the bits of the first sub-filter are exported. The size entry is
// the value of getCurFilterSize(). An undefined filter, or one without
// any bits in its first sub-filter, yields the undefined list.
//
// Previously the loop read getElement(i,i), i.e. bit i of sub-filter i
// (out of bounds), emitted bit 0 twice, and returned the LAST list node
// instead of the head of the bit list.
ListExpr
ScalableBloomFilter::Out(ListExpr typeInfo, Word value) {
  ScalableBloomFilter* bloomfilter =
      static_cast<ScalableBloomFilter*> (value.addr);
  if(!bloomfilter -> getDefined()) {
    return listutils::getUndefined();
  }
  const vector<vector<bool>>& filters = bloomfilter->filterList;
  if (filters.empty() || filters.front().empty()) {
    return listutils::getUndefined();
  }

  const vector<bool>& bits = filters.front();
  // elementList stays the head of the list, last tracks the current tail.
  ListExpr elementList = nl -> OneElemList(nl->BoolAtom(bits[0]));
  ListExpr last = elementList;
  for (size_t i = 1; i < bits.size(); i++) {
    last = nl -> Append(last, nl->BoolAtom(bits[i]));
  }

  return nl -> ThreeElemList(
      nl -> RealAtom(bloomfilter->getFP()),
      nl -> IntAtom(bloomfilter->getCurFilterSize()),
      elementList);
}
//Support Functions for Persistent Storage
// Creates a filter with a false positive probability of 0.1 and hands it
// out wrapped in a Word.
Word
ScalableBloomFilter::Create( const ListExpr typeInfo )
{
  ScalableBloomFilter* fresh = new ScalableBloomFilter(0.1);
  Word w;
  w.addr = fresh;
  return w;
}
void
ScalableBloomFilter::Delete( const ListExpr typeInfo, Word& w )
{
delete (ScalableBloomFilter*) w.addr;
w.addr = 0;
}
// Restores a filter from valueRecord, starting at offset. The layout is
// the one Save writes: fp (double), number of sub-filters (int), one hash
// count (int) per sub-filter, then the bits of every sub-filter, one bool
// per bit. The size of each sub-filter is not stored. It is recomputed
// with optimalSize() from a capacity that starts at 8 and doubles, and a
// probability that starts at fp and is multiplied by 0.8 per sub-filter.
// NOTE(review): 8 / 2 / 0.8 presumably mirror DEFAULT_SIZE, GROWTH_RATE
// and TIGHTENING_RATIO -- confirm against the header.
//
// Returns true and stores the filter in value on success. On a failed
// read, or a sub-filter count below 1, the partially built filter is
// freed, value.addr is 0 and false is returned.
//
// Changes to the former version: failure is reported instead of always
// returning true, nothing is leaked on failure, header values no longer
// start out uninitialized, calls on temporary copies that had no effect
// are gone, sub-filter 0 is resized to the computed size before it is
// filled, and the filter number in the debug output counts up.
bool
ScalableBloomFilter::Open(SmiRecord& valueRecord, size_t& offset,
                          const ListExpr typeInfo, Word& value)
{
  value.addr = 0;
  double fp = 0.0;
  int nbrSubFilters = 0;
  bool filterElement = false;

  bool ok = valueRecord.Read (&fp, sizeof(double), offset);
  offset += sizeof(double);
  cout << "Open FP: " << fp << endl;
  ok = ok && valueRecord.Read (&nbrSubFilters, sizeof(int), offset);
  offset += sizeof(int);
  cout << "Open Nbr Subfilters: " << nbrSubFilters << endl;
  if (!ok || nbrSubFilters < 1) {
    return false;
  }

  // Number of hash functions of every sub-filter.
  vector<int> hashFunctionsPerFilter;
  hashFunctionsPerFilter.reserve(nbrSubFilters);
  for (int i = 0; i < nbrSubFilters; i++) {
    int nbrHashFunctions = 0;
    ok = ok && valueRecord.Read(&nbrHashFunctions, sizeof(int), offset);
    hashFunctionsPerFilter.push_back(nbrHashFunctions);
    offset += sizeof(int);
  }
  cout << "Open Hashfunctions per filter: " << endl;
  for (size_t i = 0; i < hashFunctionsPerFilter.size(); i++) {
    cout << "Filter " << i << " has " << hashFunctionsPerFilter[i]
         << " Hashes" << endl;
  }

  ScalableBloomFilter* openBloom = new ScalableBloomFilter(fp);
  openBloom->ithFilterHashes.clear();
  openBloom -> setFilterHashes(hashFunctionsPerFilter);
  cout << "Nbr of Hashfunctions saved per Filter in OpenBloom: " << endl;
  int filterNumber = 0;
  for (int nbr : openBloom->ithFilterHashes) {
    cout << "Filter " << filterNumber << " Hashes: " << nbr << endl;
    filterNumber++;
  }

  // Sub-filter 0 already exists. Make sure it has the size that is read.
  size_t maxInserts = 8;
  size_t subFilterSize = openBloom->optimalSize(maxInserts, fp);
  openBloom->filterList[0].resize(subFilterSize);
  for (size_t j = 0; j < subFilterSize; j++) {
    ok = ok && valueRecord.Read (&filterElement, sizeof(bool), offset);
    offset += sizeof(bool);
    openBloom->setElementOpen(0, j, filterElement);
  }
  fp *= 0.8;
  maxInserts*=2;

  cout << endl;
  cout << "Beginning to Copy Subfilter values" << endl;
  vector<bool> insertionVector;
  for (int i = 1; i < nbrSubFilters; i++) {
    cout << endl;
    cout << "Beginning Work on Subfilter " << i << endl;
    cout << endl;
    subFilterSize=openBloom->optimalSize(maxInserts, fp);
    cout << "Size of Subfilter " << i << " determined to be: "
         << subFilterSize << endl;
    cout << endl;
    insertionVector.reserve(subFilterSize);
    for (size_t j = 0; j < subFilterSize; j++) {
      ok = ok && valueRecord.Read (&filterElement, sizeof(bool), offset);
      offset += sizeof(bool);
      insertionVector.push_back(filterElement);
    }
    cout << "Subfilter " << i << " has the form: " << endl;
    for (bool elem : insertionVector) {
      cout << elem;
    }
    cout << endl;
    cout << "Pushing insertion Vector into FilterList: " << endl;
    openBloom -> setSubFilter(insertionVector);
    cout << endl;
    cout << "FilterList now has " <<
        openBloom -> getBloomSize() << " SubFilters" << endl;
    cout << endl;
    insertionVector.clear();
    cout << endl;
    cout << endl;
    fp *= 0.8;
    maxInserts*=2;
  }

  cout << "The opened Bloomfilter has the values: ";
  int indiz = 0;
  for (const vector<bool>& subfilter : openBloom->filterList) {
    cout << endl;
    cout << "Opened Subfilter " << indiz << " has the form: " << endl;
    cout << endl;
    for (bool filterValue : subfilter) {
      cout << filterValue;
    }
    indiz++;
  }

  if (!ok) {
    delete openBloom;
    return false;
  }
  value.addr = openBloom;
  return true;
}
// Writes the filter in value to valueRecord, starting at offset:
// fp (double), number of sub-filters (int), every entry of the hash count
// list (int each), then the bits of all sub-filters, one bool per bit.
// The written values are echoed on stdout.
//
// Returns whether all writes succeeded. The former version returned true
// unconditionally. The sub-filters are now read by reference instead of
// through getFilterList(), which copied the complete filter twice.
bool
ScalableBloomFilter::Save(SmiRecord & valueRecord , size_t & offset ,
                          const ListExpr typeInfo , Word & value) {
  ScalableBloomFilter* bloomFilter = static_cast<ScalableBloomFilter*>
      (value.addr);
  const vector<vector<bool>>& filters = bloomFilter->filterList;
  double fp = bloomFilter->getFP();
  int nbrSubFilters = filters.size();

  cout << endl;
  cout << "Saved FP: " << fp << endl;
  bool ok = valueRecord.Write(&fp, sizeof(double), offset);
  offset+=sizeof(double);
  ok = ok && valueRecord.Write(&nbrSubFilters, sizeof(int), offset);
  offset+=sizeof(int);
  cout << "Saved Nbr Subfilters: " << nbrSubFilters << endl;

  // Save the amount of Hashfunctions each Subfilter uses
  cout << "Saved Nbr of Hashfunctions per Filter: " << endl;
  int i = 0;
  for (int nbr : bloomFilter->ithFilterHashes) {
    ok = ok && valueRecord.Write(&nbr, sizeof(int), offset);
    offset+=sizeof(int);
    cout << i << ":" << nbr << endl;
    i++;
  }
  cout << endl;

  // Save the bits of every sub-filter
  i = 0;
  for (const vector<bool>& subFilter : filters) {
    cout << "Subfilter " << i << ":" << endl;
    for (bool elem : subFilter) {
      ok = ok && valueRecord.Write(&elem, sizeof(bool), offset);
      offset+=sizeof(bool);
      cout << elem;
    }
    i++;
    cout <<endl;
  }
  return ok;
}
void
ScalableBloomFilter::Close( const ListExpr typeInfo, Word& w )
{
delete static_cast<ScalableBloomFilter*>( w.addr );
w.addr = 0;
}
// Returns a Word holding a copy-constructed duplicate of the filter in w.
Word
ScalableBloomFilter::Clone( const ListExpr typeInfo, const Word& w ) {
  const ScalableBloomFilter* source =
      static_cast<ScalableBloomFilter*>(w.addr);
  ScalableBloomFilter* duplicate = new ScalableBloomFilter(*source);
  return SetWord( duplicate );
}
//Type Description
// Descriptive properties of the bloom filter type constructor.
struct scalableBloomFilterInfo : ConstructorInfo {
  scalableBloomFilterInfo() {
    // Type name, also used as the type example
    name = ScalableBloomFilter::BasicType();
    typeExample = ScalableBloomFilter::BasicType();
    signature = "-> " + Kind::SIMPLE();
    // List representation and example value
    listRep = "()";
    valueExample = "(4 12 2 8)";
    remarks = "";
  }
};
//Creation of the Type Constructor Instance
// Wires the static support functions of ScalableBloomFilter into the
// function table of the type constructor.
struct scalableBloomFilterFunctions :
    ConstructorFunctions<ScalableBloomFilter> {
  scalableBloomFilterFunctions()
  {
    // list conversion
    in = ScalableBloomFilter::In;
    out = ScalableBloomFilter::Out;
    // object life cycle
    create = ScalableBloomFilter::Create;
    clone = ScalableBloomFilter::Clone;
    close = ScalableBloomFilter::Close;
    deletion = ScalableBloomFilter::Delete;
    // persistence
    save = ScalableBloomFilter::Save;
    open = ScalableBloomFilter::Open;
  }
};
// Type constructor instance of the bloom filter type. It is built from the
// property object bi and the function table bf, which therefore have to be
// defined before it.
scalableBloomFilterInfo bi;
scalableBloomFilterFunctions bf;
TypeConstructor scalableBloomFilterTC( bi, bf );
/*
2.1.2 Class ~CountMinSketch~
*/
// Builds a sketch with width = ceil(e / epsilon) counters per row and
// depth = ceil(ln(1 / delta)) rows, and generates one pair of hash
// constants per row.
//
// totalCount is now set to 0 here. The constructor did not assign it, so
// increaseCount() could increment an indeterminate value.
CountMinSketch::CountMinSketch
(const float epsilon, const float delta) {
  defined = true;
  eps = epsilon;
  this->delta = delta;
  totalCount = 0;
  width = ceil(exp(1)/eps);
  depth = ceil(log(1/delta));
  // resize rows and columns to prevent possible
  // memory Fragmentation later on, since we already
  // know the required number of counters
  matrix.resize(depth);
  for (size_t i = 0; i < depth; i++) {
    matrix[i].resize(width);
  }
  cout << "Hashconstants vector size before resize is "
       << hashConstants.size() << endl;
  hashConstants.resize(depth);
  cout << "Hashconstants vector size after resize is "
       << hashConstants.size() << endl;
  for (size_t i = 0; i < depth; i++) {
    hashConstants[i].resize(2);
    generateConstants(i);
  }
}
// Copy constructor: takes over parameters, dimensions, counters, hash
// constants and the total count of rhs.
CountMinSketch::CountMinSketch
(const CountMinSketch& rhs) {
  // parameters
  eps = rhs.eps;
  delta = rhs.delta;
  defined = rhs.defined;
  // dimensions
  depth = rhs.depth;
  width = rhs.width;
  // content
  totalCount = rhs.totalCount;
  hashConstants = rhs.hashConstants;
  matrix = rhs.matrix;
}
//Setter and Getter
bool
CountMinSketch::getDefined() {
return defined;
}
size_t
CountMinSketch::getTotalCount() {
return totalCount;
}
size_t
CountMinSketch::getWidth() {
return width;
}
size_t
CountMinSketch::getDepth() {
return depth;
}
float
CountMinSketch::getEpsilon() {
return eps;
}
float
CountMinSketch::getDelta() {
return delta;
}
int
CountMinSketch::getElement(int counterNumber, int index) {
return matrix[counterNumber][index];
}
void
CountMinSketch::setElement(int counterNumber, int index, int value) {
matrix[counterNumber][index] = value;
}
// Returns a COPY of the complete counter matrix.
vector<vector<int>>
CountMinSketch::getMatrix() {
  vector<vector<int>> counters(matrix);
  return counters;
}
long
CountMinSketch::getConstantA(int index) {
return hashConstants[index][0];
}
long
CountMinSketch::getConstantB(int index) {
return hashConstants[index][1];
}
void
CountMinSketch::setConstants(int counterNumber, long a, long b) {
hashConstants[counterNumber][0] = a;
hashConstants[counterNumber][1] = b;
}
// Returns a COPY of the hash constants of all rows.
vector<vector<long>>
CountMinSketch::getConstants() {
  vector<vector<long>> constants(hashConstants);
  return constants;
}
//Auxiliary Functions
void
CountMinSketch::initialize(float eps, float delt) {
defined = true;
this->eps = eps;
this->delta = delt;
width = ceil(exp(1)/eps);
depth = ceil(log(1/delta));
matrix.resize(depth);
for (size_t i = 0; i < depth; i++)
matrix[i].resize(width);
totalCount = 0;
// set seed for the generation of constants
// for the choice of hashfunctions from the
// pairwise independent family (ax + b % p)
srand(time(NULL)+getpid());
cout << "Hashconstants vector size in initialize before resize is "
<< hashConstants.size() << endl;
hashConstants.resize(depth);
cout << "Hashconstants vector size in initialize after resize is "
<< hashConstants.size() << endl;
for (size_t i = 0; i < depth; i++) {
hashConstants[i].resize(2);
generateConstants(i);
}
cout << "Constants in Constant Vector are: " << endl;
int j = 0;
for (vector<long> counterHashes : hashConstants) {
cout << "For Vector " << j << endl;
for (long constant : counterHashes) {
cout << constant << endl;
}
j++;
}
}
// Pairwise independent hash functions are easy to generate as
// h(x) = (a*x + b) % p, with p being a big prime and a, b being
// constants. This function draws a and b for the row with the given
// index by scaling two rand() values to the range of LONG_PRIME, prints
// them and stores them.
void
CountMinSketch::generateConstants(int index) {
  const long a = static_cast<long>(static_cast<float>(rand())
                                   * static_cast<float>(LONG_PRIME)
                                   / static_cast<float>(RAND_MAX));
  const long b = static_cast<long>(static_cast<float>(rand())
                                   * static_cast<float>(LONG_PRIME)
                                   / static_cast<float>(RAND_MAX));
  cout << "generateConstants() a: " << a << " b: " << b << endl;
  setConstants(index, a, b);
}
// Registers one occurrence of the element with the given hash value:
// the total count is increased and, in every row, the counter selected by
// that row's hash function ((a*x + b) % LONG_PRIME) % width is
// incremented by one.
void
CountMinSketch::increaseCount(long hashedEleValue) {
  totalCount++;
  for (size_t row = 0; row < depth; row++) {
    const long a = getConstantA(row);
    const long b = getConstantB(row);
    // The second modulo maps the hash onto the counters of the row.
    const size_t column = ((a*hashedEleValue+b) % LONG_PRIME) % width;
    matrix[row][column] = matrix[row][column] + 1;
  }
}
// Estimates how often the element with the given hash value was counted:
// looks up the counter the element maps to in every row and returns the
// smallest of them. The result is also printed to stdout.
int
CountMinSketch::estimateFrequency(long hashedEleValue) {
  // Row 0 provides the initial candidate for the minimum.
  long a = getConstantA(0);
  long b = getConstantB(0);
  size_t column = ((a*hashedEleValue+b) % LONG_PRIME) % width;
  int smallest = getElement(0, column);

  for (size_t row = 1; row < depth; row++) {
    a = getConstantA(row);
    b = getConstantB(row);
    column = ((a*hashedEleValue+b) % LONG_PRIME) % width;
    const int candidate = getElement(row, column);
    if (!(smallest < candidate)) {
      smallest = candidate;
    }
  }
  cout << "Found minum value to be " << smallest << endl;
  return smallest;
}
//~In~/~Out~ Functions
//Currently a Dummy
//In-Function to turn List Representation into Class Representation
// In-Function (dummy): the list is checked for the shape
// (<real> <int> <list>), but no sketch is built from it. The function
// always returns a null address with correct == false.
//
// The former body was a copy of the bloom filter import: it allocated a
// ScalableBloomFilter that was never freed, indexed sub-filters that do
// not exist (getSubFilter(i) for i >= 1) and continued after reporting
// that the third argument is not a list. The result for the caller is the
// same as before, without the leak and the out-of-range access.
Word
CountMinSketch::In(const ListExpr typeInfo, const ListExpr instance,
                   const int errorPos, ListExpr& errorInfo, bool& correct) {
  Word result = SetWord(Address(0));
  correct = false;
  NList list (instance);
  if(list.length() != 3){
    cmsg.inFunError("expected three arguments");
    return result;
  }
  NList first = list.first();
  NList second = list.second();
  NList third = list.third();
  if(!first.isReal() || !second.isInt()) {
    cmsg.inFunError("expected two numbers");
    return result;
  }
  if (!third.isList()) {
    cmsg.inFunError("Expected a List of Boolvalues");
    return result;
  }
  cmsg.inFunError("a countmin sketch cannot be created from a list");
  return result;
}
// Out-Function (dummy): an undefined sketch yields the undefined list,
// every other sketch the one-element list holding the bool atom FALSE.
ListExpr
CountMinSketch::Out(ListExpr typeInfo, Word value) {
  CountMinSketch* cms = static_cast<CountMinSketch*> (value.addr);
  const bool isDefined = cms -> getDefined();
  if (isDefined) {
    return nl -> OneElemList(nl->BoolAtom(0));
  }
  return listutils::getUndefined();
}
//Support Functions for Persistent Storage
// Creates a sketch with epsilon 0.1 and delta 0.5 and hands it out
// wrapped in a Word.
Word
CountMinSketch::Create( const ListExpr typeInfo )
{
  CountMinSketch* fresh = new CountMinSketch(0.1, 0.5);
  Word w;
  w.addr = fresh;
  return w;
}
void
CountMinSketch::Delete( const ListExpr typeInfo, Word& w )
{
delete (CountMinSketch*) w.addr;
w.addr = 0;
}
//Save and Open
// Restores a sketch from valueRecord, starting at offset. The layout is
// the one Save writes: epsilon and delta (float), width and depth
// (size_t), the two hash constants (long) of every row, then all
// counters (int) row by row. The restored content is echoed on stdout.
//
// Returns true and stores the sketch in value on success. If a read
// fails, or if the stored dimensions do not match the ones the
// constructor derives from epsilon and delta, the sketch is freed,
// value.addr is 0 and false is returned.
//
// Changes to the former version: the result of the first read is no
// longer overwritten by the second one, failure is reported instead of
// always returning true, nothing is leaked on failure, the header values
// no longer start out uninitialized, the stored dimensions are validated
// before they are used as indices, and totalCount (which is not part of
// the record) is set to 0.
bool
CountMinSketch::Open(SmiRecord& valueRecord, size_t& offset,
                     const ListExpr typeInfo, Word& value)
{
  value.addr = 0;
  float epsilon = 0.0f;
  float delta = 0.0f;
  size_t width = 0;
  size_t depth = 0;
  long constantA = 0;
  long constantB = 0;
  int counterEle = 0;

  bool ok = valueRecord.Read (&epsilon, sizeof(float), offset);
  offset += sizeof(float);
  ok = ok && valueRecord.Read (&delta, sizeof(float), offset);
  offset += sizeof(float);
  ok = ok && valueRecord.Read (&width, sizeof(size_t), offset);
  offset += sizeof(size_t);
  ok = ok && valueRecord.Read (&depth, sizeof(size_t), offset);
  offset += sizeof(size_t);
  if (!ok) {
    return false;
  }

  CountMinSketch* openCMS = new CountMinSketch(epsilon, delta);
  openCMS->totalCount = 0;
  // The loops below index rows and columns with the stored dimensions.
  if (openCMS->getWidth() != width || openCMS->getDepth() != depth) {
    delete openCMS;
    return false;
  }

  for (size_t i = 0; i < depth; i++) {
    ok = ok && valueRecord.Read (&constantA, sizeof(long), offset);
    offset+=sizeof(long);
    ok = ok && valueRecord.Read (&constantB, sizeof(long), offset);
    offset+=sizeof(long);
    openCMS->setConstants(i, constantA, constantB);
  }
  int i = 0;
  cout << "After Opening HashConstants for Counter " << i << " are: ";
  for (const vector<long>& constants : openCMS->hashConstants) {
    for (long constant : constants) {
      cout << constant << endl;
    }
    cout << endl;
    i++;
  }

  for (size_t i = 0; i < depth; i++) {
    for (size_t j = 0; j < width; j++) {
      ok = ok && valueRecord.Read (&counterEle, sizeof(int), offset);
      offset+=sizeof(int);
      openCMS -> setElement(i,j,counterEle);
    }
  }
  i = 0;
  for (const vector<int>& counter : openCMS->matrix) {
    cout << "After Opening Counter Number " << i
         << " has the following elements: " << endl;
    for (int count : counter) {
      cout << count;
    }
    cout << endl;
    cout << endl;
    i++;
  }

  if (!ok) {
    delete openCMS;
    return false;
  }
  value.addr = openCMS;
  return true;
}
// Writes the sketch in value to valueRecord, starting at offset:
// epsilon and delta (float), width and depth (size_t), the two hash
// constants (long) of every row, then all counters (int) row by row.
//
// Returns whether all writes succeeded. The former version returned true
// unconditionally, so a failed write went unnoticed.
bool
CountMinSketch::Save(SmiRecord & valueRecord , size_t & offset ,
                     const ListExpr typeInfo , Word & value) {
  CountMinSketch* cms = static_cast<CountMinSketch*>
      (value.addr);
  float epsilon = cms->getEpsilon();
  float delta = cms -> getDelta();
  size_t width = cms -> getWidth();
  size_t depth = cms -> getDepth();

  // Header
  bool ok = valueRecord.Write(&epsilon, sizeof(float), offset);
  offset+=sizeof(float);
  ok = ok && valueRecord.Write(&delta, sizeof(float), offset);
  offset+=sizeof(float);
  ok = ok && valueRecord.Write(&width, sizeof(size_t), offset);
  offset+=sizeof(size_t);
  ok = ok && valueRecord.Write(&depth, sizeof(size_t), offset);
  offset+=sizeof(size_t);

  // Hash constants of every row
  for (size_t i = 0; i < depth; i++) {
    long hashConstantA = cms -> getConstantA(i);
    long hashConstantB = cms -> getConstantB(i);
    ok = ok && valueRecord.Write(&hashConstantA, sizeof(long), offset);
    offset+=sizeof(long);
    ok = ok && valueRecord.Write(&hashConstantB, sizeof(long), offset);
    offset+=sizeof(long);
  }

  // Counters, row by row
  for (size_t i = 0; i < depth; i++) {
    for (size_t j = 0; j < width; j++) {
      int counterEle = cms->getElement(i,j);
      ok = ok && valueRecord.Write(&counterEle, sizeof(int), offset);
      offset+=sizeof(int);
    }
  }
  cout << endl;
  return ok;
}
void
CountMinSketch::Close( const ListExpr typeInfo, Word& w )
{
delete static_cast<CountMinSketch*>( w.addr );
w.addr = 0;
}
// Returns a Word holding a copy-constructed duplicate of the sketch in w.
Word
CountMinSketch::Clone( const ListExpr typeInfo, const Word& w )
{
  const CountMinSketch* source = static_cast<CountMinSketch*>(w.addr);
  CountMinSketch* duplicate = new CountMinSketch(*source);
  return SetWord( duplicate );
}
//Type Description
// Descriptive properties of the count-min sketch type constructor.
struct countMinSketchInfo : ConstructorInfo {
  countMinSketchInfo() {
    // Type name, also used as the type example
    name = CountMinSketch::BasicType();
    typeExample = CountMinSketch::BasicType();
    signature = "-> " + Kind::SIMPLE();
    // List representation and example value
    listRep = "()";
    valueExample = "(4 12 2 8)";
    remarks = "";
  }
};
//Creation of the Type Constructor Instance
// Wires the static support functions of CountMinSketch into the function
// table of the type constructor.
struct countMinSketchFunctions :
    ConstructorFunctions<CountMinSketch> {
  countMinSketchFunctions()
  {
    // list conversion
    in = CountMinSketch::In;
    out = CountMinSketch::Out;
    // object life cycle
    create = CountMinSketch::Create;
    clone = CountMinSketch::Clone;
    close = CountMinSketch::Close;
    deletion = CountMinSketch::Delete;
    // persistence
    save = CountMinSketch::Save;
    open = CountMinSketch::Open;
  }
};
// Type constructor instance of the count-min sketch type. It is built
// from the property object ci and the function table cf, which therefore
// have to be defined before it.
countMinSketchInfo ci;
countMinSketchFunctions cf;
TypeConstructor countMinSketchTC( ci, cf );
/*
2.1.3 Class ~amsSketch~
*/
// Builds a sketch with width = ceil(e / epsilon) counters per row and
// depth = ceil(ln(1 / delta)) rows. For every row one pair of constants
// for the two-wise independent hash function and four constants for the
// four-wise independent hash function are generated.
//
// totalCount is now set to 0 here. The constructor did not assign it, so
// changeWeight() could increment an indeterminate value.
amsSketch::amsSketch
(const float epsilon, const float delta) {
  defined = true;
  eps = epsilon;
  this->delta = delta;
  totalCount = 0;
  width = ceil(exp(1)/eps);
  depth = ceil(log(1/delta));
  matrix.resize(depth);
  for (size_t i = 0; i < depth; i++) {
    matrix[i].resize(width);
  }
  twConstants.resize(depth);
  for (size_t i = 0; i < depth; i++) {
    twConstants[i].resize(2);
    generateConstants(i);
  }
  // The only difference between CMS and AMS is the
  // requirement for a 4-wise independent Hashfamily
  // the constants required are saved here
  fwConstants.resize(depth);
  for (size_t i = 0; i < depth; i++) {
    fwConstants[i].resize(4);
    generateFwConstants(i);
  }
}
// Copy constructor: takes over parameters, dimensions, counters, both
// sets of hash constants and the total count of rhs.
amsSketch::amsSketch
(const amsSketch& rhs) {
  // parameters
  eps = rhs.eps;
  delta = rhs.delta;
  defined = rhs.defined;
  // dimensions
  depth = rhs.depth;
  width = rhs.width;
  // content
  totalCount = rhs.totalCount;
  fwConstants = rhs.fwConstants;
  twConstants = rhs.twConstants;
  matrix = rhs.matrix;
}
//Setter and Getter
bool
amsSketch::getDefined() {
return defined;
}
size_t
amsSketch::getTotalCount() {
return totalCount;
}
size_t
amsSketch::getWidth() {
return width;
}
size_t
amsSketch::getDepth() {
return depth;
}
float
amsSketch::getEpsilon() {
return eps;
}
float
amsSketch::getDelta() {
return delta;
}
int
amsSketch::getElement(int counterNumber, int index) {
return matrix[counterNumber][index];
}
void
amsSketch::updateElement(int counterNumber, int index, int value) {
matrix[counterNumber][index] += value;
}
long
amsSketch::getConstantTwA(int index) {
return twConstants[index][0];
}
long
amsSketch::getConstantTwB(int index) {
return twConstants[index][1];
}
void
amsSketch::setConstantsTw(int index, long a, long b) {
twConstants[index][0] = a;
twConstants[index][1] = b;
}
long
amsSketch::getConstantFwA(int index) {
return fwConstants[index][0];
}
long
amsSketch::getConstantFwB(int index) {
return fwConstants[index][1];
}
long
amsSketch::getConstantFwC(int index) {
return fwConstants[index][2];
}
long
amsSketch::getConstantFwD(int index) {
return fwConstants[index][3];
}
void
amsSketch::setConstantsFw(int index, long a, long b,
long c, long d) {
fwConstants[index][0] = a;
fwConstants[index][1] = b;
fwConstants[index][2] = c;
fwConstants[index][3] = d;
}
// Returns a COPY of the four-wise hash constants of all rows.
vector<vector<long>>
amsSketch::getConstantsFw() {
  vector<vector<long>> constants(fwConstants);
  return constants;
}
// Returns a COPY of the two-wise hash constants of all rows.
vector<vector<long>>
amsSketch::getConstantsTw() {
  vector<vector<long>> constants(twConstants);
  return constants;
}
// Returns a COPY of the complete counter matrix.
vector<vector<int>>
amsSketch::getMatrix() {
  vector<vector<int>> counters(matrix);
  return counters;
}
//Auxiliary Functions
void
amsSketch::initialize(float eps, float delt) {
defined = true;
this->eps = eps;
this->delta = delt;
width = ceil(exp(1)/eps);
depth = ceil(log(1/delta));
matrix.resize(depth);
for (size_t i = 0; i < depth; i++)
matrix[i].resize(width);
totalCount = 0;
srand(time(NULL)+getpid());
twConstants.resize(depth);
for (size_t i = 0; i < depth; i++) {
twConstants[i].resize(2);
generateConstants(i);
}
fwConstants.resize(depth);
for (size_t i = 0; i < depth; i++) {
fwConstants[i].resize(4);
generateFwConstants(i);
}
}
// Pairwise independent hash functions are easy to generate as
// h(x) = (a*x + b) % p, with p being a big prime and a, b being
// constants. This function draws a and b for the row with the given
// index by scaling two rand() values to the range of LONG_PRIME and
// stores them as that row's two-wise constants.
void
amsSketch::generateConstants(int index) {
  const long a = static_cast<long>(static_cast<float>(rand())
                                   * static_cast<float>(LONG_PRIME)
                                   / static_cast<float>(RAND_MAX));
  const long b = static_cast<long>(static_cast<float>(rand())
                                   * static_cast<float>(LONG_PRIME)
                                   / static_cast<float>(RAND_MAX));
  setConstantsTw(index, a, b);
}
// In contrast to Count-Min a four-wise independent hash function is
// needed as well. It is given by h(x) = (ax^3 + bx^2 + cx + d) % p.
// This function draws a, b, c and d for the row with the given index by
// scaling four rand() values to the range of LONG_PRIME, prints them and
// stores them.
void
amsSketch::generateFwConstants(int index) {
  const long a = static_cast<long>(static_cast<float>(rand())
                                   * static_cast<float>(LONG_PRIME)
                                   / static_cast<float>(RAND_MAX));
  const long b = static_cast<long>(static_cast<float>(rand())
                                   * static_cast<float>(LONG_PRIME)
                                   / static_cast<float>(RAND_MAX));
  const long c = static_cast<long>(static_cast<float>(rand())
                                   * static_cast<float>(LONG_PRIME)
                                   / static_cast<float>(RAND_MAX));
  const long d = static_cast<long>(static_cast<float>(rand())
                                   * static_cast<float>(LONG_PRIME)
                                   / static_cast<float>(RAND_MAX));
  cout << "FW-Constants generated for Row " << index << " are: " << endl
       << "a: " << a << " b: " << b << " c: " << c << " d: " << d << endl
       << endl;
  setConstantsFw(index, a, b, c, d);
}
// Exchanges the two integers a and b point to. Helper of the median
// selection below.
void
amsSketch::swap(int* a, int* b)
{
  const int valueOfA = *a;
  const int valueOfB = *b;
  *a = valueOfB;
  *b = valueOfA;
}
// Partitions arr[l..r] around the value of arr[r]: all elements smaller
// than that value are moved in front of it. Returns the final position
// of the pivot value.
int
amsSketch::partition(int arr[], int l, int r)
{
  const int pivotValue = arr[r];
  int boundary = l;
  for (int scan = l; scan < r; ++scan) {
    if (arr[scan] < pivotValue) {
      swap(&arr[boundary], &arr[scan]);
      ++boundary;
    }
  }
  // Put the pivot between the smaller and the remaining elements.
  swap(&arr[boundary], &arr[r]);
  return boundary;
}
// Picks a random element of arr[l..r] as pivot, moves it to position r
// and partitions the range around it. Returns the pivot's final position.
int
amsSketch::randomPartition(int arr[], int l, int r)
{
  const int rangeLength = r - l + 1;
  const int pivotOffset = rand() % rangeLength;
  swap(&arr[l + pivotOffset], &arr[r]);
  return partition(arr, l, r);
}
// Selection step of the median computation on arr[l..r].
// arr is reordered by the partitioning. k is the index looked for.
// a receives the element that ends up at index k - 1 and b the element
// that ends up at index k. The value -1 in a or b is used as the marker
// for "not found yet", so callers pass both initialized to -1.
void
amsSketch::medianDecider(int arr[], int l, int r, int k, int& a, int& b) {
// Only non-empty ranges (l <= r) are processed
if (l <= r) {
// Partition around a random pivot and get the pivot's final index
int partitionIndex = randomPartition(arr, l, r);
// The pivot landed on index k: remember it in b.
// Stop only if a has been found already.
if (partitionIndex == k) {
b = arr[partitionIndex];
if (a != -1)
return;
}
// The pivot landed on index k - 1: remember it in a.
// Stop only if b has been found already.
else if (partitionIndex == k - 1) {
a = arr[partitionIndex];
if (b != -1)
return;
}
// If partitionIndex >= k, continue in the part
// left of the pivot
if (partitionIndex >= k)
return medianDecider(arr, l, partitionIndex - 1,
k, a, b);
// Otherwise (partitionIndex < k) continue in the
// part right of the pivot
else
return medianDecider(arr, partitionIndex + 1,
r, k, a, b);
}
return;
}
int
amsSketch::findMedian(int medianArray[]) {
int a = -1;
int b = -1;
int median;
int n = *(&medianArray + 1) - medianArray;
if (n % 2 == 1) {
medianDecider(medianArray, 0, n - 1, n / 2, a, b);
median = b;
} else {
medianDecider(medianArray, 0, n - 1, n / 2, a, b);
median = (a + b) / 2;
}
return median;
}
// Processes one element: the total count is increased and, in every row,
// the counter selected by the row's two-wise hash function is changed by
// +1 or -1. The sign is the parity of the row's four-wise hash function
// h(x) = (a*x^3 + b*x^2 + c*x + d) % LONG_PRIME (odd: +1, even: -1).
//
// The polynomial is evaluated with Horner's rule in exact modular integer
// arithmetic. The former expression used pow() in double precision,
// truncated b to int and applied % to a possibly negative long, so the
// update could become -3 and the double-to-long conversion could be out
// of range.
void
amsSketch::changeWeight(size_t value) {
  totalCount++;

  const long prime = LONG_PRIME;
  const unsigned long long p = static_cast<unsigned long long>(prime);

  // Maps a (possibly negative) constant to its residue in [0, p).
  auto residue = [prime](long c) -> unsigned long long {
    long r = c % prime;
    if (r < 0) {
      r += prime;
    }
    return static_cast<unsigned long long>(r);
  };
  // (x * y) % p for x < p by double-and-add. No intermediate value
  // exceeds 2p, so nothing overflows as long as p fits into a long.
  auto mulMod = [p](unsigned long long x,
                    unsigned long long y) -> unsigned long long {
    unsigned long long acc = 0;
    while (y != 0) {
      if (y & 1ULL) {
        acc += x;
        if (acc >= p) acc -= p;
      }
      x += x;
      if (x >= p) x -= p;
      y >>= 1;
    }
    return acc;
  };

  const unsigned long long x = static_cast<unsigned long long>(value) % p;

  for (size_t i = 0; i < depth; i++) {
    // Find the Index of the Element whose counter we will change
    const long twa = getConstantTwA(i);
    const long twb = getConstantTwB(i);
    const int hashIndex = ((twa*value+twb) % LONG_PRIME) % width;

    // Compute the Value we will use to update the counter:
    // ((a*x + b)*x + c)*x + d  (mod p)
    unsigned long long poly = residue(getConstantFwA(i));
    poly = (mulMod(poly, x) + residue(getConstantFwB(i))) % p;
    poly = (mulMod(poly, x) + residue(getConstantFwC(i))) % p;
    poly = (mulMod(poly, x) + residue(getConstantFwD(i))) % p;
    const int updateValue = (poly & 1ULL) ? 1 : -1;

    // Commit the change
    updateElement(i, hashIndex, updateValue);
  }
}
float
amsSketch::estimateInnerProduct() {
int medianArray[depth];
int joinSize;
int sum;
//Calculate the sum of the squares of all row Elements for each row
for (size_t i = 0; i < depth; i++) {
for (size_t j = 0; j < width; j++) {
sum += pow(getElement(i, j), 2);
}
medianArray[i] = sum;
}
joinSize = findMedian(medianArray);
//Return the Median of sum of squares of elements of the rows
return joinSize;
}
//~In~/~Out~ Functions
//In-Function to turn List Representation into Class Representation
// In-Function (dummy): the list is checked for the shape
// (<real> <int> <list>), but no sketch is built from it. The function
// always returns a null address with correct == false.
//
// The former body allocated an amsSketch that was never freed or
// returned, using the int argument as delta, and continued after
// reporting that the third argument is not a list. The result for the
// caller is the same as before, without the leak.
Word
amsSketch::In(const ListExpr typeInfo, const ListExpr instance,
              const int errorPos, ListExpr& errorInfo, bool& correct) {
  Word result = SetWord(Address(0));
  correct = false;
  NList list (instance);
  if(list.length() != 3){
    cmsg.inFunError("expected three arguments");
    return result;
  }
  NList first = list.first();
  NList second = list.second();
  NList third = list.third();
  if(!first.isReal() || !second.isInt()) {
    cmsg.inFunError("expected two numbers");
    return result;
  }
  if (!third.isList()) {
    cmsg.inFunError("Expected a List of Boolvalues");
    return result;
  }
  cmsg.inFunError("an ams sketch cannot be created from a list");
  return result;
}
//Out-Function (Dummy)
//Placeholder list representation of a sketch: an undefined sketch maps to
//the undefined list, every defined one to a one element list holding a
//single bool atom.
ListExpr
amsSketch::Out(ListExpr typeInfo, Word value) {
  amsSketch* sketch = static_cast<amsSketch*>(value.addr);
  if (sketch->getDefined()) {
    return nl->OneElemList(nl->BoolAtom(0));
  }
  return listutils::getUndefined();
}
//Support Functions for Persistent Storage
//Creates a fresh sketch with the fixed parameters 0.1 and 0.5 and hands
//it back wrapped in a Word.
Word
amsSketch::Create( const ListExpr typeInfo )
{
  Word created;
  created.addr = new amsSketch(0.1, 0.5);
  return created;
}
void
amsSketch::Delete( const ListExpr typeInfo, Word& w )
{
delete (amsSketch*) w.addr;
w.addr = 0;
}
//Save and Open
//Restores a sketch from valueRecord, starting at offset.
//Layout (as written by Save): epsilon, delta (float), width, depth
//(size_t), then per row the six hash constants (long) and finally
//depth * width counters (int).
//Returns true and stores the sketch in value on success. On any read
//error value.addr is 0, the partially built sketch is freed and false is
//returned. Previously the result of the first read was overwritten, the
//function always reported success, values of failed reads were used and
//the sketch leaked on failure.
//NOTE(review): width and depth from the record are used as loop bounds
//while the sketch is built from epsilon and delta; this assumes the
//constructor derives the same dimensions - confirm.
bool
amsSketch::Open(SmiRecord& valueRecord, size_t& offset,
                const ListExpr typeInfo, Word& value)
{
  float epsilon = 0;
  float delta = 0;
  size_t width = 0;
  size_t depth = 0;
  value.addr = 0;

  bool ok = valueRecord.Read (&epsilon, sizeof(float), offset);
  offset += sizeof(float);
  ok = ok && valueRecord.Read (&delta, sizeof(float), offset);
  offset += sizeof(float);
  ok = ok && valueRecord.Read (&width, sizeof(size_t), offset);
  offset += sizeof(size_t);
  ok = ok && valueRecord.Read (&depth, sizeof(size_t), offset);
  offset += sizeof(size_t);
  if (!ok) {
    //without a valid header nothing sensible can be constructed
    return false;
  }

  amsSketch* openAMS = new amsSketch(epsilon, delta);
  openAMS -> getConstantsTw().clear();
  openAMS -> getConstantsFw().clear();

  for (size_t i = 0; ok && i < depth; i++) {
    long constantTwA = 0, constantTwB = 0, constantFwA = 0,
         constantFwB = 0, constantFwC = 0, constantFwD = 0;
    ok = valueRecord.Read (&constantTwA, sizeof(long), offset);
    offset+=sizeof(long);
    ok = ok && valueRecord.Read (&constantTwB, sizeof(long), offset);
    offset+=sizeof(long);
    ok = ok && valueRecord.Read (&constantFwA, sizeof(long), offset);
    offset+=sizeof(long);
    ok = ok && valueRecord.Read (&constantFwB, sizeof(long), offset);
    offset+=sizeof(long);
    ok = ok && valueRecord.Read (&constantFwC, sizeof(long), offset);
    offset+=sizeof(long);
    ok = ok && valueRecord.Read (&constantFwD, sizeof(long), offset);
    offset+=sizeof(long);
    if (!ok) {
      break;
    }
    openAMS->setConstantsTw(i, constantTwA, constantTwB);
    openAMS->setConstantsFw(i, constantFwA, constantFwB,
                            constantFwC, constantFwD);
    cout << "For Counter " << i << " the opened constants Vaues are: "
         << endl;
    cout << "TwA: " << constantTwA << " TwB: " << constantTwB << endl;
    cout << "FwA: " << constantFwA << " FwB: " << constantFwB << " FwC: "
         << constantFwC << " FwD: " << constantFwD << endl;
  }

  for (size_t i = 0; ok && i < depth; i++) {
    cout << "Opened Counter Number " << i
         << " has the following elements: ";
    cout << endl;
    for (size_t j = 0; j < width; j++) {
      int counterEle = 0;
      ok = valueRecord.Read (&counterEle, sizeof(int), offset);
      offset+=sizeof(int);
      if (!ok) {
        break;
      }
      openAMS -> updateElement(i,j,counterEle);
      cout << counterEle;
    }
    cout << endl;
    cout << endl;
  }

  if (!ok) {
    delete openAMS;
    return false;
  }
  value.addr = openAMS;
  return true;
}
//Writes the sketch referenced by value to valueRecord, starting at
//offset: epsilon, delta (float), width, depth (size_t), per row the six
//hash constants (long) and finally all counters row by row (int).
//Returns whether every write succeeded (it used to return true
//unconditionally, hiding write errors); a null sketch is rejected.
bool
amsSketch::Save(SmiRecord & valueRecord , size_t & offset ,
                const ListExpr typeInfo , Word & value) {
  amsSketch* ams = static_cast<amsSketch*>(value.addr);
  if (!ams) {
    return false;
  }

  float epsilon = ams -> getEpsilon();
  float delta = ams -> getDelta();
  size_t width = ams -> getWidth();
  size_t depth = ams -> getDepth();

  bool ok = valueRecord.Write(&epsilon, sizeof(float), offset);
  offset+=sizeof(float);
  ok = ok && valueRecord.Write(&delta, sizeof(float), offset);
  offset+=sizeof(float);
  ok = ok && valueRecord.Write(&width, sizeof(size_t), offset);
  offset+=sizeof(size_t);
  ok = ok && valueRecord.Write(&depth, sizeof(size_t), offset);
  offset+=sizeof(size_t);

  for (size_t i = 0; i < depth; i++) {
    long hashConstantTwA = ams -> getConstantTwA(i);
    long hashConstantTwB = ams -> getConstantTwB(i);
    long hashConstantFwA = ams -> getConstantFwA(i);
    long hashConstantFwB = ams -> getConstantFwB(i);
    long hashConstantFwC = ams -> getConstantFwC(i);
    long hashConstantFwD = ams -> getConstantFwD(i);
    ok = ok && valueRecord.Write(&hashConstantTwA, sizeof(long), offset);
    offset+=sizeof(long);
    ok = ok && valueRecord.Write(&hashConstantTwB, sizeof(long), offset);
    offset+=sizeof(long);
    ok = ok && valueRecord.Write(&hashConstantFwA, sizeof(long), offset);
    offset+=sizeof(long);
    ok = ok && valueRecord.Write(&hashConstantFwB, sizeof(long), offset);
    offset+=sizeof(long);
    ok = ok && valueRecord.Write(&hashConstantFwC, sizeof(long), offset);
    offset+=sizeof(long);
    ok = ok && valueRecord.Write(&hashConstantFwD, sizeof(long), offset);
    offset+=sizeof(long);
    cout << "For Counter " << i << " the saved constants Values are: "
         << endl;
    cout << "TwA: " << hashConstantTwA << " TwB: " << hashConstantTwB
         << endl;
    cout << "FwA: " << hashConstantFwA << " FwB: " << hashConstantFwB
         << " FwC: " << hashConstantFwC << " FwD: " << hashConstantFwD
         << endl;
  }

  for (size_t i = 0; i < depth; i++) {
    cout << "Saving Counter Values of Counter " << i << endl;
    for (size_t j = 0; j < width; j++) {
      int counterEle = ams->getElement(i,j);
      ok = ok && valueRecord.Write(&counterEle, sizeof(int), offset);
      offset+=sizeof(int);
      cout << counterEle;
    }
    cout << endl;
    cout << endl;
  }
  cout << endl;
  return ok;
}
void
amsSketch::Close( const ListExpr typeInfo, Word& w )
{
delete static_cast<amsSketch*>( w.addr );
w.addr = 0;
}
//Produces a copy of the sketch referenced by w via the copy constructor.
Word
amsSketch::Clone( const ListExpr typeInfo, const Word& w )
{
  amsSketch* original = static_cast<amsSketch*>(w.addr);
  amsSketch* duplicate = new amsSketch(*original);
  return SetWord(duplicate);
}
/*
Type Description
*/
struct amsSketchInfo : ConstructorInfo {
amsSketchInfo() {
name = amsSketch::BasicType();
signature = "-> " + Kind::SIMPLE();
typeExample = amsSketch::BasicType();
listRep = "()";
valueExample = "(4 12 2 8)";
remarks = "";
}
};
/*
Creation of the Type Constructor Instance
*/
//Function table of the amsSketch type constructor: every slot is bound
//to the static member function of amsSketch with the matching role.
struct amsSketchFunctions :
  ConstructorFunctions<amsSketch> {
  amsSketchFunctions()
  {
    //list conversion
    in = &amsSketch::In;
    out = &amsSketch::Out;
    //object life cycle
    create = &amsSketch::Create;
    deletion = &amsSketch::Delete;
    close = &amsSketch::Close;
    clone = &amsSketch::Clone;
    //persistence
    open = &amsSketch::Open;
    save = &amsSketch::Save;
  }
};
//Global instances of the description and the function table of amsSketch
amsSketchInfo ai;
amsSketchFunctions af;
//Type constructor object built from the two instances above
TypeConstructor amsSketchTC( ai, af );
/*
2.1.4 Class ~lossyCounter~
*/
//Sets up an empty, defined counter for the error bound epsilon.
//The window size is ceil(1/epsilon); counting starts in window 1 with no
//elements seen.
template<class T>
lossyCounter<T>::lossyCounter
(const float epsilon) {
  this->defined = true;
  this->epsilon = epsilon;
  this->eleCounter = 0;
  this->windowIndex = 1;
  this->windowSize = ceil(1/epsilon);
}
//Setter and Getter
//Reports the defined flag of the counter.
template<class T> bool
lossyCounter<T>::getDefined() {
  return this->defined;
}
//Overwrites the defined flag of the counter with value.
template<class T> void
lossyCounter<T>::setDefined(bool value) {
  this->defined = value;
}
//Returns the current value of the element counter.
template<class T> int
lossyCounter<T>::getEleCounter() {
  return this->eleCounter;
}
//Returns the error bound epsilon the counter was set up with.
template<class T> float
lossyCounter<T>::getEpsilon() {
  return this->epsilon;
}
//Returns the index of the window that is currently in use.
template<class T> long
lossyCounter<T>::getCurrentWindowIndex() {
  return this->windowIndex;
}
//Returns the number of elements that make up one window.
template<class T> int
lossyCounter<T>::getWindowSize() {
  return this->windowSize;
}
//Returns the item stored in the frequency list entry whose key equals
//index; at() throws std::out_of_range if there is no such key.
template<class T> T
lossyCounter<T>::getElement(int index){
  auto& entry = frequencyList.at(index);
  return entry.getItem();
}
//Returns a copy of the complete frequency list.
template<class T> std::unordered_map<T, counterPair<T>>
lossyCounter<T>::getFrequencyList() {
  return this->frequencyList;
}
//Auxiliary Functions
//(Re)initialises the counter for the error bound epsilon: marks it as
//defined, sets the window size to ceil(1/epsilon) and restarts at
//window 1.
//NOTE(review): unlike the constructor, which starts eleCounter at 0 and
//leaves the list empty, this function starts at 1 and seeds the list
//with a placeholder entry - confirm that the difference is intended.
template<class T> void
lossyCounter<T>::initialize(const float epsilon) {
defined = true;
this->epsilon = epsilon;
eleCounter = 1;
windowSize = ceil(1/epsilon);
windowIndex = 1;
//placeholder entry: key 0 with item 0, frequency 0 and max error 0
frequencyList.insert({0, counterPair(0,0,0)});
}
///Handles incoming Streamelements
//Processes one stream element: a known element gets its count raised, an
//unknown one is inserted. Whenever the element counter is a multiple of
//the window size the list is pruned and the window index is updated.
template<class T> void
lossyCounter<T>::addElement(T element) {
  if (!elementPresent(element)) {
    insertElement(element);
  } else {
    incrCount(element);
  }
  const bool windowComplete = (eleCounter % windowSize == 0);
  if (!windowComplete) {
    return;
  }
  reduce();
  updateWindowIndex();
}
//Increase the Frequencycount of a Streamelement which
//was already present
//Raises the frequency of an element that is already in the list and
//counts the element as seen. at() throws if the element is missing.
template<class T> void
lossyCounter<T>::incrCount(T element) {
  auto& entry = frequencyList.at(element);
  entry.setFrequency();
  ++eleCounter;
}
//Inserts previously unencountered Elements into our element list
//Adds an element that is not yet tracked. The new entry starts with
//frequency 1 and a maximum error of (current window index - 1); the
//element is counted as seen afterwards.
template<class T> void
lossyCounter<T>::insertElement(T element) {
  int maxError = windowIndex-1;
  frequencyList.insert(
    {element, counterPair<T>(element, 1, maxError)});
  ++eleCounter;
}
//Checks whether a streamelement is already present in the list of elements
//Tells whether the frequency list holds an entry for element.
template<class T> bool
lossyCounter<T>::elementPresent(T element) {
  return frequencyList.find(element) != frequencyList.end();
}
//Updates the currently used Window
//Recomputes the window index from the number of elements seen so far.
//NOTE(review): if eleCounter and windowSize are both integers the
//division already truncates and ceil has no effect - confirm intent.
template<class T> void
lossyCounter<T>::updateWindowIndex() {
  const auto windowsSeen = eleCounter/windowSize;
  windowIndex = ceil(windowsSeen);
}
//Removes the Items below the Frequency Threshold
//Prunes the frequency list: every entry whose frequency plus maximum
//error is below the current window index is removed.
template<class T> void
lossyCounter<T>::reduce() {
  auto it = frequencyList.begin();
  while (it != frequencyList.end()) {
    const bool belowThreshold =
      (it->second.getFrequency() + it->second.getMaxError())
      < windowIndex;
    if (belowThreshold) {
      //erase hands back the iterator following the removed entry
      it = frequencyList.erase(it);
    } else {
      ++it;
    }
  }
}
//Get the frequent items which surpass the minsupport threshold
//Min Frequency to get returned is minSupport*eleCounter
//While max deviation is epsilon*eleCounter
//Collects the items of all entries whose frequency reaches
//(minSupport - epsilon) * eleCounter.
template<class T> vector<T>
lossyCounter<T>::getFrequentElements(double minSupport) {
  vector<T> resultList;
  //the bound does not change while the list is scanned
  const auto bound = (minSupport - epsilon) * eleCounter;
  for (auto& entry : frequencyList) {
    counterPair<T>& data = entry.second;
    if (data.frequency >= bound) {
      resultList.push_back(data.getItem());
    }
  }
  return resultList;
}
//Get the frequency of a single Element
//Returns the stored frequency of elem, or 0 if elem is not tracked.
template<class T> long
lossyCounter<T>::estimateCount(T elem) {
  const auto hit = frequencyList.find(elem);
  if (hit == frequencyList.end()) {
    return 0;
  }
  long elemFrequency = hit->second.getFrequency();
  return elemFrequency;
}
/*
2.1.4.1 Class ~counterPair~
*/
//Bundles an item with its frequency count and its maximum error.
template<class T>
counterPair<T>::counterPair
(T item, long frequency, long maxError) {
  this->item = item;
  this->frequency = frequency;
  this->maxError = maxError;
}
//Returns the item this pair belongs to.
template<class T> T
counterPair<T>::getItem() {
  return this->item;
}
//Returns the frequency recorded for the item.
template<class T> int
counterPair<T>::getFrequency() {
  return this->frequency;
}
//Raises the recorded frequency by one (there is no value parameter).
template<class T> void
counterPair<T>::setFrequency() {
  ++(this->frequency);
}
//Returns the maximum error recorded for the item.
template<class T> int
counterPair<T>::getMaxError() {
  return this->maxError;
}
/*
2.1.5 Class ~streamCluster~
2.1.5.1 Class ~cPoint~
*/
//Builds a point with the given id from its textual coordinates.
//The number of dimensions is the number of parsed values; cluster id 0
//marks a point that has not been assigned to a cluster yet.
cPoint::cPoint(int id, string coordinates) {
  pointId = id;
  clusterId = 0;
  values = stringToVec(coordinates);
  dimensions = values.size();
}
int
cPoint::getDimensions() {
return dimensions;
}
int
cPoint::getCluster() {
return clusterId;
}
int
cPoint::getId() {
return pointId;
}
void
cPoint::setCluster(int index) {
clusterId = index;
}
//Returns the coordinate at position index (no range check is made).
double
cPoint::getVal(int index) {
  const double coordinate = values[index];
  return coordinate;
}
//implement how to read data from stream
//Placeholder: parsing of the coordinate string is not implemented yet,
//so an empty vector is returned for every input.
vector<double>
cPoint::stringToVec(string &coordinates) {
  return vector<double>();
}
/*
2.1.5.2 Class ~Cluster~
*/
//Creates the cluster id around an initial point: the coordinates of that
//point become the centroid and the point itself the first member.
Cluster::Cluster(int id, cPoint centroid) {
  clusterId = id;
  int dim = 0;
  while (dim < centroid.getDimensions()) {
    clusterCentroid.push_back(centroid.getVal(dim));
    ++dim;
  }
  addPoint(centroid);
}
//Default constructor: an empty cluster with id 0 and no centroid.
//The definition had no body at all, which made the compiler read the
//following function as part of it; an explicit body closes it.
//NOTE(review): assumes Cluster() is declared in the class - confirm.
Cluster::Cluster() {
  clusterId = 0;
}
/*
2.2 Type Mapping Functions
These functions check whether the correct argument types are supplied for an
operator; if so, they return a list expression for the result type, otherwise
the symbol ~typeerror~.
Type mapping for ~reservoir~ is
---- (stream T) x int -> (stream T)
Type mapping for ~tilted~ is
---- (stream T) x int -> (stream T)
Type mapping for ~createbloomfilter~ is
---- stream(tuple(X)) x ATTR x real -> bloomfilter
Type mapping for ~bloomcontains~ is
---- bloomfilter x T -> bool
*/
/*
2.2.1 Operator ~reservoir~
*/
//Type mapping of reservoir: (stream T) x int -> (stream T), where T is a
//tuple type or a type of kind DATA. The result type is the type of the
//first argument.
ListExpr
reservoirTM( ListExpr args ) {
  NList type(args);
  ListExpr errorInfo = nl->OneElemList(nl->SymbolAtom("ErrorInfo"));

  //exactly two arguments
  if (type.length() != 2) {
    return NList::typeError("Operator reservoir expects two arguments");
  }

  //first argument: (stream ...)
  const bool firstIsStream =
    type.first().hasLength(2)
    && type.first().first().isSymbol(sym.STREAM());
  if (!firstIsStream) {
    return NList::typeError( "Operator reservoir expects a stream "
                             "as first argument");
  }

  //second argument: int
  if (type.second() != NList(CcInt::BasicType())) {
    return NList::typeError("Operator reservoir expects an int "
                            "as second argument");
  }

  //element type: (tuple X) or something of kind DATA
  NList streamtype = type.first().second();
  const bool isTupleType =
    streamtype.hasLength(2)
    && streamtype.first().isSymbol(sym.TUPLE())
    && IsTupleDescription(streamtype.second().listExpr());
  if (!isTupleType
      && !(am->CheckKind(Kind::DATA(),streamtype.listExpr(),errorInfo))) {
    return NList::typeError("Operator reservoir expects a "
                            "stream of DATA or TUPLE.");
  }

  return type.first().listExpr();
}
/*
2.2.2 Operator ~createbloomfilter~
*/
//Type mapping of createbloomfilter:
//  stream(tuple(X)) x ATTR x real -> bloomfilter
//The index of the named attribute within the tuple is appended to the
//arguments for the value mapping.
//The argument list is validated before its parts are accessed (the
//stream's element type used to be extracted before any check), and the
//error messages name the actual argument count and positions.
ListExpr
createbloomfilterTM( ListExpr args ) {
  NList type(args);

  // three arguments must be supplied
  if (type.length() != 3){
    return NList::typeError("Operator createbloomfilter expects "
                            "three arguments");
  }

  // test first argument for being a tuple stream
  ListExpr a = nl->First(args);
  if(!Stream<Tuple>::checkType(a)){
    return NList::typeError( "Operator createbloomfilter expects a "
                             "Tuple Stream as first argument");
  }

  //test second argument for a valid Attribute Name
  if (!type.second().isSymbol()){
    return NList::typeError("Operator createbloomfilter expects a valid "
                            "Attribute Name as second argument");
  }

  // test third argument for real
  if(type.third() != NList(CcReal::BasicType())) {
    return NList::typeError("Operator createbloomfilter expects a real "
                            "value as third argument");
  }

  // stream elements must be in kind tuple (X) with X in DATA;
  // safe to take apart now that the first argument is a tuple stream
  NList streamtype = type.first().second();
  ListExpr errorInfo = nl->OneElemList(nl->SymbolAtom("ErrorInfo"));
  if(!(streamtype.hasLength(2)
       && streamtype.first().isSymbol(sym.TUPLE())
       && IsTupleDescription(streamtype.second().listExpr())
      )
     && !(am->CheckKind(Kind::DATA(),streamtype.listExpr(),errorInfo))){
    return NList::typeError("Operator createbloomfilter can only handle "
                            "Attributetype Tuple values");
  }

  //extract index of the attribute we intend to hash
  NList attrList = streamtype.second();
  ListExpr type2 = nl->TheEmptyList();
  string attrName = type.second().str();
  int attrIndex = listutils::findAttribute(attrList.listExpr(),
                                           attrName, type2) - 1;
  if (attrIndex < 0) {
    return NList::typeError("Attribute " + attrName + " "
                            "not found in tuple");
  }

  //result is a bloomfilter; the attribute index is appended
  NList appendList;
  appendList.append(NList().intAtom(attrIndex));
  return NList(Symbols::APPEND(), appendList,
               ScalableBloomFilter::BasicType()).listExpr();
}
/*
2.2.3 Operator ~bloomcontains~
*/
//Type mapping of bloomcontains: bloomfilter x T -> bool, where the
//second argument has to be given as an atomic type.
ListExpr
bloomcontainsTM(ListExpr args) {
  NList type(args);

  if (type.length() != 2) {
    return NList::typeError("Operator bloomcontains expects two arguments");
  }

  //first argument has to be a scalable bloom filter
  if (type.first() != NList(ScalableBloomFilter::BasicType())) {
    return NList::typeError("Operator bloomcontains expects a "
                            "Bloomfilter as first argument");
  }

  //second argument has to be an atom
  if (!type.second().isAtom()) {
    return NList::typeError("Operator bloomcontains expects an "
                            "ATTRIBUTE as second argument");
  }

  return NList(CcBool::BasicType()).listExpr();
}
/*
2.2.4 Operator ~createcountmin~
*/
//Type mapping of createcountmin:
//  stream(tuple(X)) x ATTR x real x real -> countmin
//Appended for the value mapping: the index of the named attribute and a
//flag telling whether its type is numeric.
//The argument list is validated before its parts are accessed, the
//numeric test only runs once the attribute has been found (its type is
//not set otherwise), and the messages name the right argument positions.
ListExpr
createcountminTM( ListExpr args ) {
  NList type(args);

  // four arguments must be supplied
  if (type.length() != 4){
    return NList::typeError("Operator createcountmin expects "
                            "four arguments");
  }

  // test first argument for being a tuple stream
  ListExpr streamType = nl->First(args);
  if(!Stream<Tuple>::checkType(streamType)){
    return NList::typeError( "Operator createcountmin expects a "
                             "Tuple Stream as first argument");
  }

  //test second argument for a valid Attribute Name
  if (!type.second().isSymbol()){
    return NList::typeError("Operator createcountmin expects a valid "
                            "Attribute Name as second argument");
  }

  // test third argument for real
  if(type.third() != NList(CcReal::BasicType())) {
    return NList::typeError("Operator createcountmin expects a real "
                            "value as third argument");
  }

  // test fourth argument for real
  if(type.fourth() != NList(CcReal::BasicType())) {
    return NList::typeError("Operator createcountmin expects a real "
                            "value as fourth argument");
  }

  // stream elements must be in kind tuple (X) with X in DATA
  NList streamtype = type.first().second();
  ListExpr errorInfo = nl->OneElemList(nl->SymbolAtom("ErrorInfo"));
  if(!(streamtype.hasLength(2)
       && streamtype.first().isSymbol(sym.TUPLE())
       && IsTupleDescription(streamtype.second().listExpr())
      )
     && !(am->CheckKind(Kind::DATA(),streamtype.listExpr(),errorInfo))){
    return NList::typeError("Operator createcountmin can only handle "
                            "Attributetype Tuple values");
  }

  //extract index of the attribute we intend to hash
  NList attrList = streamtype.second();
  ListExpr attrType = nl->TheEmptyList();
  string attrName = type.second().str();
  int attrIndex = listutils::findAttribute(attrList.listExpr(),
                                           attrName, attrType) - 1;
  if (attrIndex < 0) {
    return NList::typeError("Attribute " + attrName + " "
                            "not found in tuple");
  }

  //Remember whether the attribute to hash is a number so the value
  //mapping can choose the way it hashes
  bool isNumeric = listutils::isNumericType(attrType);

  //result is a Count Min Sketch; index and numeric flag are appended
  NList appendList;
  appendList.append(NList().intAtom(attrIndex));
  appendList.append(NList().boolAtom(isNumeric));
  return NList(Symbols::APPEND(), appendList,
               CountMinSketch::BasicType()).listExpr();
}
/*
2.2.5 Operator ~cmscount~
*/
//Type mapping of cmscount: countmin x T -> int, with T given as an
//atomic type. A flag telling whether T is numeric is appended for the
//value mapping.
ListExpr
cmscountTM(ListExpr args) {
  NList type(args);
  NList appendList;

  if (type.length() != 2) {
    return NList::typeError("Operator cmscount expects two arguments");
  }

  //first argument has to be a count min sketch
  if (type.first() != NList(CountMinSketch::BasicType())) {
    return NList::typeError("Operator cmscount expects a "
                            "Count Min Sketch as first argument");
  }

  //second argument has to be an atom
  if (!type.second().isAtom()) {
    return NList::typeError("Operator cmscount expects an "
                            "ATTRIBUTE as second argument");
  }

  //tell the value mapping whether the search element is numeric
  const bool isNumeric =
    listutils::isNumericType(type.second().listExpr());
  cout << "cms Count identifies search element as numeric: "
       << isNumeric << endl;
  appendList.append(NList().boolAtom(isNumeric));
  return NList(Symbols::APPEND(), appendList,
               CcInt::BasicType()).listExpr();
}
/*
2.2.6 Operator ~createams~
*/
//Type mapping of createams:
//  stream(tuple(X)) x ATTR x real x real -> amsSketch
//Appended for the value mapping: the index of the named attribute and a
//flag telling whether its type is numeric.
//The argument list is validated before its parts are accessed, the
//numeric test only runs once the attribute has been found (its type is
//not set otherwise), and the messages name the right argument positions.
ListExpr
createamsTM( ListExpr args ) {
  NList type(args);

  // four arguments must be supplied
  if (type.length() != 4){
    return NList::typeError("Operator createams expects "
                            "four arguments");
  }

  // test first argument for being a tuple stream
  ListExpr streamType = nl->First(args);
  if(!Stream<Tuple>::checkType(streamType)){
    return NList::typeError( "Operator createams expects a "
                             "Tuple Stream as first argument");
  }

  //test second argument for a valid Attribute Name
  if (!type.second().isSymbol()){
    return NList::typeError("Operator createams expects a valid "
                            "Attribute Name as second argument");
  }

  // test third argument for real
  if(type.third() != NList(CcReal::BasicType())) {
    return NList::typeError("Operator createams expects a real "
                            "value as third argument");
  }

  // test fourth argument for real
  if(type.fourth() != NList(CcReal::BasicType())) {
    return NList::typeError("Operator createams expects a real "
                            "value as fourth argument");
  }

  // stream elements must be in kind tuple (X) with X in DATA
  NList streamtype = type.first().second();
  ListExpr errorInfo = nl->OneElemList(nl->SymbolAtom("ErrorInfo"));
  if(!(streamtype.hasLength(2)
       && streamtype.first().isSymbol(sym.TUPLE())
       && IsTupleDescription(streamtype.second().listExpr())
      )
     && !(am->CheckKind(Kind::DATA(),streamtype.listExpr(),errorInfo))){
    return NList::typeError("Operator createams can only handle "
                            "Attributetype Tuple values");
  }

  //extract index of the attribute we intend to hash
  NList attrList = streamtype.second();
  ListExpr attrType = nl->TheEmptyList();
  string attrName = type.second().str();
  int attrIndex = listutils::findAttribute(attrList.listExpr(),
                                           attrName, attrType) - 1;
  if (attrIndex < 0) {
    return NList::typeError("Attribute " + attrName + " "
                            "not found in tuple");
  }

  //Remember whether the attribute to hash is a number so the value
  //mapping can choose the way it hashes
  bool isNumeric = listutils::isNumericType(attrType);

  //result is an AMS Sketch; index and numeric flag are appended
  NList appendList;
  appendList.append(NList().intAtom(attrIndex));
  appendList.append(NList().boolAtom(isNumeric));
  return NList(Symbols::APPEND(), appendList,
               amsSketch::BasicType()).listExpr();
}
/*
2.2.7 Operator ~amsestimate~
*/
//Type mapping of amsestimate: amsSketch -> real.
ListExpr
amsestimateTM(ListExpr args) {
  NList type(args);

  //exactly one argument
  if (type.length() != 1) {
    return NList::typeError("Operator amsestimate expects one arguments");
  }

  //which has to be an AMS sketch
  if (type.first() != NList(amsSketch::BasicType())) {
    return NList::typeError("Operator amsestimate expects a "
                            "AMS Sketch as argument");
  }

  return NList(CcReal::BasicType()).listExpr();
}
/*
2.2.8 Operator ~createlossycounter~
*/
//Type mapping of createlossycounter:
//  stream(tuple(X)) x ATTR x real
//    -> stream(tuple((Value T)(Frequency int)(Delta int)
//                    (Epsilon real)(EleCount int)))
//where T is the type of the named attribute. The attribute index and the
//textual form of its type are appended for the value mapping.
//The argument list is validated before its parts are accessed, and the
//message about the argument count states the required three arguments.
ListExpr
createlossycounterTM( ListExpr args ) {
  NList type(args);

  // three arguments must be supplied
  if (type.length() != 3){
    return NList::typeError("Operator createlossycounter expects "
                            "three arguments");
  }

  // test first argument for being a tuple stream
  ListExpr streamType = nl->First(args);
  if(!Stream<Tuple>::checkType(streamType)){
    return NList::typeError( "Operator createlossycounter expects a "
                             "Tuple Stream as first argument");
  }

  //test second argument for a valid Attribute Name
  if (!type.second().isSymbol()){
    return NList::typeError("Operator createlossycounter expects a valid "
                            "Attribute Name as second argument");
  }

  // test third argument for real
  if(type.third() != NList(CcReal::BasicType())) {
    return NList::typeError("Operator createlossycounter expects a real "
                            "value as third argument");
  }

  // stream elements must be in kind tuple (X) with X in DATA
  NList streamtype = type.first().second();
  ListExpr errorInfo = nl->OneElemList(nl->SymbolAtom("ErrorInfo"));
  if(!(streamtype.hasLength(2)
       && streamtype.first().isSymbol(sym.TUPLE())
       && IsTupleDescription(streamtype.second().listExpr())
      )
     && !(am->CheckKind(Kind::DATA(),streamtype.listExpr(),errorInfo))){
    return NList::typeError("Operator createlossycounter can only handle "
                            "Attributetype Tuple values");
  }

  //extract index of the attribute we intend to hash
  NList attrList = streamtype.second();
  ListExpr attrType = nl->TheEmptyList();
  string attrName = type.second().str();
  int attrIndex = listutils::findAttribute(attrList.listExpr(),
                                           attrName, attrType) - 1;
  if (attrIndex < 0) {
    return NList::typeError("Attribute " + attrName + " "
                            "not found in tuple");
  }

  NList appendList;
  appendList.append(NList().intAtom(attrIndex));
  appendList.append(NList().stringAtom(nl->ToString(attrType)));

  //Result is a tuple stream of (Value, Frequency, Delta, Epsilon,
  //EleCount)
  ListExpr outlist =
    nl->TwoElemList(
      nl->SymbolAtom(Symbol::STREAM()),
      nl->TwoElemList(
        nl->SymbolAtom(Tuple::BasicType()),
        nl->FiveElemList(
          nl->TwoElemList(
            nl->SymbolAtom("Value"),
            nl->SymbolAtom(nl->SymbolValue(attrType))
          ),
          nl->TwoElemList(
            nl->SymbolAtom("Frequency"),
            nl->SymbolAtom(CcInt::BasicType())
          ),
          nl->TwoElemList(
            nl->SymbolAtom("Delta"),
            nl->SymbolAtom(CcInt::BasicType())
          ),
          nl->TwoElemList(
            nl->SymbolAtom("Epsilon"),
            nl->SymbolAtom(CcReal::BasicType())
          ),
          nl->TwoElemList(
            nl->SymbolAtom("EleCount"),
            nl->SymbolAtom(CcInt::BasicType())
          )
        )
      )
    );

  cout << endl;
  cout << "Createlossycounter outlist looks like: " << endl;
  cout << nl->ToString(outlist) << endl;
  return NList(Symbols::APPEND(), appendList,
               outlist).listExpr();
}
/*
2.2.9 Operator ~lcfrequent~
*/
//Type mapping of lcfrequent:
//  stream(tuple((Value T)(Frequency int)(Delta int)
//               (Epsilon real)(EleCount int))) x real
//    -> stream(tuple((Value T)(Frequency int)(Delta int)))
//The textual form of T is appended for the value mapping.
//Fixes: the attribute list is only indexed after it is known to be a
//tuple stream with at least five attributes, and the name check now
//rejects the input if ANY of the five names differs (the negation used
//to apply to the first comparison only).
ListExpr
lcfrequentTM(ListExpr args) {
  NList type(args);

  // two arguments must be supplied
  if (type.length() != 2){
    return NList::typeError("Operator lcfrequent expects two arguments");
  }

  // test first argument for stream
  NList streamArg = type.first();
  if(!( streamArg.hasLength(2)
        && streamArg.first().isSymbol(sym.STREAM()))){
    return NList::typeError( "Operator lcfrequent expects a stream "
                             "as first argument");
  }

  const string lcError =
    "Operator lcfrequent expects a Stream generated "
    "from a Lossy Counter as first argument";

  //the attribute list may only be taken apart for a tuple stream
  if(!Stream<Tuple>::checkType(streamArg.listExpr())){
    return NList::typeError(lcError);
  }
  NList attrList = streamArg.second().second();
  if(attrList.length() < 5){
    return NList::typeError(lcError);
  }

  //Every Stream of LossyCounter type will consist of
  //(Value, Frequency, Delta, Epsilon, EleCount) so we check for that
  string value = nl->ToString(attrList.first().first().listExpr());
  string frq = nl->ToString(attrList.second().first().listExpr());
  string dlt = nl->ToString(attrList.third().first().listExpr());
  string err = nl->ToString(attrList.fourth().first().listExpr());
  string cnt = nl->ToString(attrList.fifth().first().listExpr());
  cout << "In lcfrequent TM Checking for the Attribute types gave me: "
       << endl;
  cout << value << " " << frq << " " << dlt << " " << err << " " << cnt
       << endl;
  cout << endl;

  if(!((value == "Value") && (frq == "Frequency") && (dlt == "Delta") &&
       (err == "Epsilon") && (cnt == "EleCount"))){
    return NList::typeError(lcError);
  }

  // test second argument for real
  if(type.second() != NList(CcReal::BasicType())){
    return NList::typeError("Operator lcfrequent expects a "
                            "real value as second argument");
  }

  //extract the attributetype of the "Value" attribute
  ListExpr attrType = nl->TheEmptyList();
  string attrName = "Value";
  listutils::findAttribute(attrList.listExpr(),
                           attrName, attrType);

  NList appendList;
  appendList.append(NList().stringAtom(nl->ToString(attrType)));

  //Result is a tuple stream of (Value, Frequency, Delta)
  ListExpr outlist =
    nl->TwoElemList(
      nl->SymbolAtom(Symbol::STREAM()),
      nl->TwoElemList(
        nl->SymbolAtom(Tuple::BasicType()),
        nl->ThreeElemList(
          nl->TwoElemList(
            nl->SymbolAtom("Value"),
            nl->SymbolAtom(nl->SymbolValue(attrType))),
          nl->TwoElemList(
            nl->SymbolAtom("Frequency"),
            nl->SymbolAtom(CcInt::BasicType())),
          nl->TwoElemList(
            nl->SymbolAtom("Delta"),
            nl->SymbolAtom(CcInt::BasicType())
          )
        )
      )
    );

  cout << endl;
  cout << "Return type of lcfrequent() is: " << endl;
  cout << nl->ToString(NList(Symbols::APPEND(), appendList,
                             outlist).listExpr());
  cout << endl;
  return NList(Symbols::APPEND(), appendList,
               outlist).listExpr();
}
/*
2.2.10 Operator ~outlier~
*/
//Type mapping of outlier:
//  stream(tuple(X)) x ATTR x int
//    -> stream(tuple((Value T)(StreamIndex int)))
//where T, the type of the named attribute, has to be int or real. The
//attribute index and the textual form of T are appended for the value
//mapping.
//The first argument is only accessed after the argument count has been
//checked, and the third-argument message got its missing blank
//("an" "int" used to be concatenated to "anint").
ListExpr
outlierTM(ListExpr args) {
  NList type(args);

  // three arguments must be supplied
  if (type.length() != 3){
    return NList::typeError("Operator outlier expects three arguments");
  }

  // test first argument for being a tuple stream
  NList stream = type.first();
  if(!Stream<Tuple>::checkType(stream.listExpr())){
    return NList::typeError( "Operator outlier expects a "
                             "Tuple Stream as first argument");
  }

  //test second argument for a valid Attribute Name
  if (!type.second().isSymbol()){
    return NList::typeError("Operator outlier expects a valid "
                            "Attribute Name as second argument");
  }

  // test third argument for int
  if(type.third() != NList(CcInt::BasicType())){
    return NList::typeError("Operator outlier expects an "
                            "int value as third argument");
  }

  //extract index of the attribute we intend to examine
  NList attrList = stream.second().second();
  ListExpr attrType = nl->TheEmptyList();
  string attrName = type.second().str();
  int attrIndex = listutils::findAttribute(attrList.listExpr(),
                                           attrName, attrType) - 1;
  if (attrIndex < 0) {
    return NList::typeError("Attribute " + attrName + " "
                            "not found in tuple");
  }

  if ((nl->ToString(attrType) != CcInt::BasicType()) &&
      nl->ToString(attrType) != CcReal::BasicType()) {
    return NList::typeError("Operator outlier only works with "
                            "int and real attributes");
  }

  ListExpr outlist =
    nl->TwoElemList(
      nl->SymbolAtom(Symbol::STREAM()),
      nl->TwoElemList(
        nl->SymbolAtom(Tuple::BasicType()),
        nl->TwoElemList(
          nl->TwoElemList(
            nl->SymbolAtom("Value"),
            nl->SymbolAtom(nl->SymbolValue(attrType))),
          nl->TwoElemList(
            nl->SymbolAtom("StreamIndex"),
            nl->SymbolAtom(CcInt::BasicType()))
        )
      )
    );

  cout << endl;
  cout << "In outlierTM outlist has the form: " << endl;
  cout << nl->ToString(outlist) << endl;

  //Result type is a stream of tuples with the queried attribute value
  //and its index in the stream; attribute index and type are appended
  NList appendList;
  appendList.append(NList().intAtom(attrIndex));
  appendList.append(NList().stringAtom(nl->ToString(attrType)));
  return NList(Symbols::APPEND(), appendList,
               outlist).listExpr();
}
/*
2.3 Value Mapping Functions
2.3.1 Operator ~reservoir~
Creates a reservoir Sample (stream) of the passed stream.
*/
/*
Templates are used to deal with the different Types of Streams
the operator handles
*/
//Local info of the reservoir operator.
//The constructor opens the argument stream and consumes it completely,
//keeping a sample of at most sampleSize elements. next() then hands out
//the sampled elements one by one; ownership of a returned element moves
//to the caller. Elements that were never handed out are released in the
//destructor, which also closes the stream.
//Fix: the replacement index is now drawn from [0, counter-1], where
//counter already includes the current element. The former range
//[0, counter] had one value too many, so late elements were kept with
//probability sampleSize/(counter+1) instead of sampleSize/counter.
template<class T> class reservoirInfo{
 public:
  reservoirInfo(Word inputStream, size_t inputSampleSize):
    stream(inputStream), sampleSize(inputSampleSize),
    counter(0),lastOut(-1) {
    stream.open();
    init();
  }

  ~reservoirInfo() {
    //everything behind lastOut is still owned by the reservoir
    for(size_t index = lastOut+1; index < reservoir.size(); index++) {
      reservoir[index]->DeleteIfAllowed();
    }
    stream.close();
  }

  //Returns the next sampled element, or 0 once all have been handed out.
  //lastOut only advances while elements are left, so repeated calls at
  //the end cannot push it beyond the reservoir.
  T* next() {
    if(lastOut + 1 >= (int)reservoir.size()) {
      return 0;
    }
    lastOut++;
    T* resElement = reservoir[lastOut];
    reservoir[lastOut] = 0;
    return resElement;
  }

 private:
  Stream<T> stream;
  size_t sampleSize;
  size_t counter;          //number of stream elements seen so far
  int lastOut;             //position of the element handed out last
  std::vector<T*> reservoir;

  //Drains the argument stream and builds the sample
  void init() {
    T* data;
    while ((data = stream.request()) != nullptr) {
      //count the element first; insert() relies on counter >= 1
      counter++;
      insert(data);
    }
  }

  //Decides whether data becomes part of the sample (Algorithm R)
  void insert(T* data) {
    //the first sampleSize elements are taken unconditionally
    if(reservoir.size() < sampleSize) {
      reservoir.push_back(data);
      return;
    }
    //afterwards the counter-th element replaces a random slot with
    //probability sampleSize/counter
    size_t rnd = rand() % counter;
    if (rnd < reservoir.size()) {
      reservoir[rnd]->DeleteIfAllowed();
      reservoir[rnd] = data;
    } else {
      data -> DeleteIfAllowed();
    }
  }
};
//Value mapping of reservoir for element type T.
//OPEN builds the local info (and with it the complete sample) if the
//size argument is defined and positive, REQUEST hands out the sampled
//elements one at a time, CLOSE disposes of the local info.
template<class T>
int reservoirVMT(Word* args, Word& result,
                 int message, Word& local, Supplier s){
  reservoirInfo<T>* info = static_cast<reservoirInfo<T>*>(local.addr);

  if (message == OPEN) {
    //drop the state of a previous run, if any
    if (info) {
      delete info;
      local.addr = 0;
    }
    CcInt* reservoirSize = static_cast<CcInt*>(args[1].addr);
    if (reservoirSize->IsDefined()) {
      int size = reservoirSize->GetValue();
      if (size > 0) {
        local.addr = new reservoirInfo<T>(args[0], size);
      }
    }
    return 0;
  }

  if (message == REQUEST) {
    result.addr = info ? info->next() : 0;
    return result.addr ? YIELD : CANCEL;
  }

  if (message == CLOSE) {
    if (info) {
      delete info;
      local.addr = 0;
    }
    return 0;
  }

  return -1;
}
//value Mapping Array
//Index 0 handles tuple streams, index 1 attribute streams; the indices
//are the values returned by reservoirSelect
ValueMapping reservoirVM[] = {
reservoirVMT<Tuple>,
reservoirVMT<Attribute>
};
// Selection Function
//Selects the value mapping of reservoir: 1 for a stream of attributes,
//0 for a stream of tuples, -1 for anything else.
int reservoirSelect(ListExpr args){
  ListExpr streamArg = nl->First(args);
  if (Stream<Attribute>::checkType(streamArg)) {
    return 1;
  }
  if (Stream<Tuple>::checkType(streamArg)) {
    return 0;
  }
  return -1;
}
/*
2.3.2 Operator ~createbloomfilter~
*/
int createbloomfilterVM(Word* args, Word& result,
           int message, Word& local, Supplier s){
  // Value mapping of ~createbloomfilter~.
  // Reads the false positive probability from args[2] and the index of the
  // attribute to hash from args[3], initializes the scalable Bloom filter
  // in the operator's result storage and inserts the chosen attribute of
  // every tuple of the stream args[0]. Progress and the final filter
  // contents are written to cout. Always returns 0.
  // NOTE(review): the hash input is the first sizeof(Attribute) bytes of
  // the attribute object, not an explicit serialization of its value -
  // confirm that this is intended. bloomcontainsVM hashes its search
  // element the same way, so both must be changed together.
  CcReal* fpProb = (CcReal*) args[2].addr;
  CcInt* attrIndexPointer = (CcInt*) args[3].addr;
  int attrIndex = attrIndexPointer->GetIntval();

  //Get the Resultstorage provided by the Query Processor
  result = qp -> ResultStorage(s);
  ScalableBloomFilter* bloomFilter = (ScalableBloomFilter*) result.addr;

  // TODO: clear the filter instance first if it already holds data, e.g.
  //   if (bloomFilter->getFilter.size() > 0) { bloomFilter->Clear(); }

  //initialize the Filter with the values provided by the operator
  bloomFilter->initialize(fpProb->GetValue());

  cout << "After init() Bloom Filter Values are: " << endl;
  cout << "Defined: " << bloomFilter->getDefined() << endl;
  cout << "Nbr of Hashes: " << bloomFilter->getCurNumberHashes() << endl;
  cout << "Hashvector saves: " << bloomFilter->getFilterHashes().size()
       << endl;
  cout << "FP: " << + bloomFilter->getFP() << endl;
  cout << "Filter currently made for Inserts " <<
          bloomFilter->getCurMaxInserts() << endl;
  cout << "Current Filter Size: " << bloomFilter->getCurFilterSize() << endl;
  cout << "Overall having " << bloomFilter->getFilterList().size()
       << " Filters" << endl;

  Stream<Tuple> stream(args[0]);
  stream.open();

  Tuple* streamTuple = (Tuple*) stream.request();
  Attribute* streamElement;

  //Size of the current subfilter; hash results are reduced modulo this
  size_t filterSize = bloomFilter->getCurFilterSize();
  //Number of hash functions of the current subfilter
  int nbrHashes = bloomFilter->getCurNumberHashes();
  vector<size_t> hashvalues;
  //Output buffer of MurmurHash3 (128 bit)
  uint64_t mHash[2];

  while ((streamTuple != 0)) {
    if (bloomFilter->isSaturated()) {
      cout << "Filter " << bloomFilter->getFilterList().size() - 1
           << " is full" << endl;
      cout << endl;
      //Switch to the next subfilter and pick up its parameters
      bloomFilter->updateFilterValues();
      nbrHashes = bloomFilter->getCurNumberHashes();
      filterSize = bloomFilter ->getCurFilterSize();
    }
    hashvalues.reserve(nbrHashes);

    streamElement = (Attribute*) streamTuple->GetAttribute(attrIndex);
    MurmurHash3_x64_128(streamElement, sizeof(*streamElement), 0, mHash);

    //The two 64 bit halves serve as the base hashes h1 and h2; all further
    //positions are derived as h1 + i*h2 + i*i
    size_t h1 = mHash[0] % filterSize;
    hashvalues.push_back(h1);
    if (nbrHashes > 1) {
      size_t h2 = mHash[1] % filterSize;
      hashvalues.push_back(h2);
      for (int i = 2; i < nbrHashes; i++) {
        size_t h_i = (h1 + i * h2 + i * i) % filterSize;
        //order of elements is irrelevant; must only be set in the filter
        hashvalues.push_back(h_i);
      }
    }

    //set the bits belonging to the hashed positions
    bloomFilter->add(hashvalues);
    hashvalues.clear();

    streamTuple->DeleteIfAllowed();
    streamTuple = stream.request();
  }

  cout << endl;
  cout << "Bloomfilters have the values: " << endl;
  int indiz = 0;
  //Iterate by reference: copying each subfilter only for printing is
  //unnecessary work
  for (const auto& subfilter : bloomFilter -> getFilterList()) {
    cout << endl;
    cout << "Subfilter " << indiz << " has the form: " << endl;
    for (bool filterValue : subfilter) {
      cout << filterValue;
    }
    indiz++;
    cout << endl;
    cout << endl;
  }

  stream.close();
  result.setAddr(bloomFilter);
  return 0;
}
/*
2.3.3 Operator ~bloomcontains~
*/
int bloomcontainsVM(Word* args, Word& result,
           int message, Word& local, Supplier s){
  // Value mapping of ~bloomcontains~.
  // args[0] is the scalable Bloom filter, args[1] the search element.
  // The element is hashed once; for every subfilter the probe positions
  // are derived with that subfilter's size and number of hash functions.
  // The result (CcBool in the result storage) is true as soon as one
  // subfilter reports the element. Always returns 0.
  // NOTE(review): as in createbloomfilterVM the hash input is the raw
  // object (sizeof(Attribute) bytes), not a serialization of the value -
  // confirm that this is intended.
  ScalableBloomFilter* bloomFilter = (ScalableBloomFilter*) args[0].addr;
  Attribute* searchEle = (Attribute*) args[1].addr;

  //Get the Resultstorage provided by the Query Processor
  result = qp -> ResultStorage(s);
  CcBool* res = (CcBool*) result.addr;

  //Number of hash functions used by each subfilter
  const vector<int>& hashIterations = bloomFilter -> getFilterHashes();

  //The hash does not depend on the subfilter, so it is computed only once
  uint64_t cmHash[2];
  MurmurHash3_x64_128(searchEle, sizeof(*searchEle), 0, cmHash);

  const size_t nbrOfFilters = bloomFilter -> getFilterList().size();
  vector<size_t> hashValues;
  bool included = false;

  for (size_t i = 0; i < nbrOfFilters; i++) {
    const int nbrHashes = hashIterations[i];
    const size_t filterSize = bloomFilter->getFilterList()[i].size();

    hashValues.clear();
    hashValues.reserve(nbrHashes);

    const size_t h1 = cmHash[0] % filterSize;
    hashValues.push_back(h1);
    if (nbrHashes >= 2) {
      const size_t h2 = cmHash[1] % filterSize;
      hashValues.push_back(h2);
      //The positions must be derived from the hash index j exactly as in
      //createbloomfilterVM; using the filter index here would probe bits
      //that were never set for this element
      for (int j = 2; j < nbrHashes; j++) {
        size_t h_j = (h1 + j * h2 + j * j) % filterSize;
        hashValues.push_back(h_j);
      }
    }

    //Element is contained in one of the Subfilters
    if (bloomFilter->contains(hashValues, i)) {
      included = true;
      break;
    }
  }

  res->Set(true, included);
  return 0;
}
/*
2.3.4 Operator ~createcountmin~
*/
int createcountminVM(Word* args, Word& result,
           int message, Word& local, Supplier s){
  // Value mapping of ~createcountmin~.
  // args[2]/args[3]: epsilon and delta (real), args[4]: index of the
  // attribute to count (int), args[5]: flag telling whether that attribute
  // is numeric (bool). The Count-Min sketch in the result storage is
  // initialized and every tuple of the stream args[0] is counted. The
  // sketch parameters and the final matrix are written to cout.
  // Always returns 0.
  // NOTE(review): for non-numeric attributes the hash input is the raw
  // attribute object (sizeof(Attribute) bytes), not a serialization of its
  // value - confirm that this is intended; cmscountVM hashes the same way.
  CcReal* epsilon = (CcReal*) args[2].addr;
  CcReal* delta = (CcReal*) args[3].addr;
  CcInt* attrIndexPointer = (CcInt*) args[4].addr;
  CcBool* attrIsNumeric = (CcBool*) args[5].addr;
  int attrIndex = attrIndexPointer->GetIntval();
  bool attrNumeric = attrIsNumeric->GetValue();
  cout << "In the VM AttrIsNumeric has value: " << attrNumeric << endl;

  //Get the Resultstorage provided by the Query Processor
  result = qp -> ResultStorage(s);
  CountMinSketch* cms = (CountMinSketch*) result.addr;

  //initialize the sketch with the values provided by the operator
  cms->initialize(epsilon->GetValue(),delta->GetValue());

  cout << "After init() CMS Values are: " << endl;
  cout << endl;
  //The value has to be streamed with <<; adding it to the string literal
  //would only move the literal's start pointer
  cout << "Defined: " << cms->getDefined() << endl;
  cout << "Epsilon: " << + cms->getEpsilon() << endl;
  cout << "Delta: " << + cms->getDelta() << endl;
  cout << "Width: " << endl;
  cout << cms->getWidth();
  cout << endl;
  cout << "Depth: " << + cms->getDepth() << endl;
  cout << "Total Count " << + cms->getTotalCount() << endl;
  cout << "VV enthält: " << cms->getMatrix().size() << endl;
  for (size_t i = 0; i < cms->getDepth(); i++) {
    cout << "V" << i << " length is: " << cms->getMatrix()[i].size() << endl;
  }

  Stream<Tuple> stream(args[0]);
  stream.open();
  Tuple* streamTuple = (Tuple*) stream.request();

  if (!attrNumeric) {
    //Text-like attribute: count the first 64 bit of its MurmurHash3 value
    cout << "Identified Attribute Type as String" << endl;
    Attribute* streamElement;
    uint64_t mHash[2];
    while ((streamTuple != 0)) {
      streamElement = (Attribute*) streamTuple->GetAttribute(attrIndex);
      MurmurHash3_x64_128(streamElement, sizeof(*streamElement), 0, mHash);
      long h1 = mHash[0];
      cms->increaseCount(h1);
      streamTuple->DeleteIfAllowed();
      streamTuple = stream.request();
    }
  } else {
    //Numeric attribute: the integer value itself is counted
    while ((streamTuple != 0)) {
      CcInt* intElement = (CcInt*) streamTuple->GetAttribute(attrIndex);
      cms->increaseCount(intElement -> GetIntval());
      streamTuple->DeleteIfAllowed();
      streamTuple = stream.request();
    }
  }

  for (size_t i = 0; i < cms->getDepth(); i++) {
    cout << "Vector " << i << " looks like: " << endl;
    for (size_t j = 0; j < cms->getWidth(); j++) {
      cout << cms->getMatrix()[i][j];
    }
    cout << endl;
  }
  cout << endl;
  cout << "Total Elements processed: " << cms->getTotalCount();
  cout << "----------------------------------------------" << endl;
  cout << endl;

  stream.close();
  result.setAddr(cms);
  return 0;
}
/*
2.3.5 Operator ~cmscount~
*/
int cmscountVM(Word* args, Word& result,
           int message, Word& local, Supplier s){
  // Value mapping of ~cmscount~.
  // args[0]: Count-Min sketch, args[1]: search element, args[2]: flag
  // telling whether the search element is numeric. The estimated frequency
  // is stored as a defined CcInt in the result storage. Always returns 0.
  // NOTE(review): the flag is read through a CcInt pointer here, whereas
  // createcountminVM reads its flag as CcBool - verify against the type
  // mapping which type is really appended.
  CountMinSketch* cms = (CountMinSketch*) args[0].addr;
  CcInt* searchEle = (CcInt*) args[1].addr;
  CcInt* attrIsNumeric = (CcInt*) args[2].addr;
  const bool attrNumeric = attrIsNumeric->GetValue();

  //Get the Resultstorage provided by the Query Processor
  result = qp -> ResultStorage(s);
  CcInt* res = (CcInt*) result.addr;

  int estimate = 0;
  if (attrNumeric) {
    //Numeric search elements are looked up by their value
    estimate = cms->estimateFrequency(searchEle->GetValue());
  } else {
    //Text search elements are looked up by the first 64 bit of their
    //MurmurHash3 value
    cout << "cmsCount identified the Searchelement as Text" << endl;
    uint64_t cmHash[2];
    MurmurHash3_x64_128(searchEle, sizeof(*searchEle), 0, cmHash);
    long h1 = cmHash[0];
    estimate = cms->estimateFrequency(h1);
  }

  res->Set(true, estimate);
  return 0;
}
/*
2.3.5 Operator ~createams~
*/
int createamsVM(Word* args, Word& result,
           int message, Word& local, Supplier s){
  // Value mapping of ~createams~.
  // args[2]/args[3]: epsilon and delta (real), args[4]: index of the
  // attribute to process (int), args[5]: flag telling whether that
  // attribute is numeric (bool). The AMS sketch in the result storage is
  // initialized and updated with every tuple of the stream args[0]. The
  // sketch parameters and the final matrix are written to cout.
  // Always returns 0.
  // NOTE(review): for non-numeric attributes the hash input is the raw
  // attribute object (sizeof(Attribute) bytes), not a serialization of its
  // value - confirm that this is intended.
  CcReal* epsilon = (CcReal*) args[2].addr;
  CcReal* delta = (CcReal*) args[3].addr;
  CcInt* attrIndexPointer = (CcInt*) args[4].addr;
  CcBool* attrIsNumeric = (CcBool*) args[5].addr;
  int attrIndex = attrIndexPointer->GetIntval();
  bool attrNumeric = attrIsNumeric->GetValue();

  //Get the Resultstorage provided by the Query Processor
  result = qp -> ResultStorage(s);
  amsSketch* ams = (amsSketch*) result.addr;

  //initialize the sketch with the values provided by the operator
  ams->initialize(epsilon->GetValue(),delta->GetValue());

  cout << "After init() AMS Values are: " << endl;
  cout << endl;
  //The value has to be streamed with <<; adding it to the string literal
  //would only move the literal's start pointer
  cout << "Defined: " << ams->getDefined() << endl;
  cout << "Epsilon: " << + ams->getEpsilon() << endl;
  cout << "Delta: " << + ams->getDelta() << endl;
  cout << "Width: " << endl;
  cout << ams->getWidth();
  cout << endl;
  cout << "Depth: " << + ams->getDepth() << endl;
  cout << "Total Count " << + ams->getTotalCount() << endl;
  cout << "VV enthält: " << ams->getMatrix().size() << endl;
  for (size_t i = 0; i < ams->getDepth(); i++) {
    cout << "V" << i << " length is: " << ams->getMatrix()[i].size() << endl;
  }

  Stream<Tuple> stream(args[0]);
  stream.open();
  Tuple* streamTuple = (Tuple*) stream.request();

  if (!attrNumeric) {
    //Text-like attribute: use the first 64 bit of its MurmurHash3 value
    cout << "Identified Attribute Type as String" << endl;
    Attribute* streamElement;
    uint64_t mHash[2];
    while ((streamTuple != 0)) {
      streamElement = (Attribute*) streamTuple->GetAttribute(attrIndex);
      MurmurHash3_x64_128(streamElement, sizeof(*streamElement), 0, mHash);
      long h1 = mHash[0];
      ams->changeWeight(h1);
      streamTuple->DeleteIfAllowed();
      streamTuple = stream.request();
    }
  } else {
    //Numeric attribute: the integer value itself is used
    while ((streamTuple != 0)) {
      CcInt* intElement = (CcInt*) streamTuple->GetAttribute(attrIndex);
      ams->changeWeight(intElement -> GetValue());
      streamTuple->DeleteIfAllowed();
      streamTuple = stream.request();
    }
  }

  for (size_t i = 0; i < ams->getDepth(); i++) {
    cout << "Vector " << i << " looks like: " << endl;
    for (size_t j = 0; j < ams->getWidth(); j++) {
      cout << ams->getMatrix()[i][j];
    }
    cout << endl;
  }
  cout << endl;
  cout << "Total Elements processed: " << ams->getTotalCount();
  cout << "----------------------------------------------" << endl;
  cout << endl;

  stream.close();
  result.setAddr(ams);
  return 0;
}
/*
2.3.6 Operator ~amsestimate~
*/
int amsestimateVM(Word* args, Word& result,
           int message, Word& local, Supplier s){
  // Value mapping of ~amsestimate~: asks the AMS sketch in args[0] for its
  // inner product estimate and stores it as a defined CcReal in the result
  // storage. Always returns 0.
  result = qp -> ResultStorage(s);
  CcReal* res = static_cast<CcReal*>(result.addr);

  amsSketch* ams = static_cast<amsSketch*>(args[0].addr);
  float estimate = ams -> estimateInnerProduct();

  res->Set(true, estimate);
  return 0;
}
/*
2.3.7 Operator ~createlossycounter~
*/
template<class T, class S>
class createlossycounterInfo{
public:
createlossycounterInfo(Word inputStream, float epsilon, int index,
string type):
stream(inputStream), error(epsilon), attrIndex(index), attrType(type),
lastOut(-1), counter(epsilon){
stream.open();
typeList = nl->TwoElemList(
nl->SymbolAtom(Tuple::BasicType()),
nl->FiveElemList(
nl->TwoElemList(
nl->SymbolAtom("Value"),
nl->SymbolAtom(attrType)
),
nl->TwoElemList(
nl->SymbolAtom("Frequency"),
nl->SymbolAtom(CcInt::BasicType())
),
nl->TwoElemList(
nl->SymbolAtom("Delta"),
nl->SymbolAtom(CcInt::BasicType())
),
nl->TwoElemList(
nl->SymbolAtom("Error"),
nl->SymbolAtom(CcReal::BasicType())
),
nl->TwoElemList(
nl->SymbolAtom("Element Count"),
nl->SymbolAtom(CcInt::BasicType())
)
)
);
SecondoCatalog* sc = SecondoSystem::GetCatalog();
numTypeList = sc->NumericType(typeList);
tupleType = new TupleType(numTypeList);
init();
}
~createlossycounterInfo() {
for(size_t index = lastOut+1; index < frequencyList.size(); index++) {
frequencyList[index]->DeleteIfAllowed();
}
tupleType -> DeleteIfAllowed();
stream.close();
}
Tuple* next() {
lastOut++;
if (lastOut >= (int) frequencyList.size()) {
return 0;
}
Tuple* frequentItem = frequencyList[lastOut];
frequencyList[lastOut] = 0;
return frequentItem;
}
private:
Stream<Tuple> stream;
float error;
int attrIndex;
string attrType;
int lastOut;
TupleType* tupleType;
ListExpr typeList;
ListExpr numTypeList;
lossyCounter<S> counter;
vector<Tuple*> frequencyList;
void init() {
Tuple* oldTuple;
while ((oldTuple = stream.request()) != nullptr) {
T* attrValue = (T*) oldTuple->GetAttribute(attrIndex);
S value = attrValue ->GetValue();
counter.addElement(value);
oldTuple -> DeleteIfAllowed();
}
output();
}
void output() {
cout << endl;
cout << "Created Counter holds the values: " << endl;
for (auto elements : counter.getFrequencyList()) {
counterPair<S> elem = elements.second;
Tuple* newTuple = new Tuple(tupleType);
S value = elem.getItem();
T* attrValue = new T(true, value);
int frequency = elem.getFrequency();
CcInt* attrFrequency = new CcInt(true, frequency);
int delta = elem.getMaxError();
CcInt* attrDelta = new CcInt(true, delta);
CcReal* attrError = new CcReal(true, error);
int eleCount = counter.getEleCounter();
CcInt* attrCount = new CcInt(true, eleCount);
cout << "Value: " << value << " Frequency: " << frequency
<< " Delta: " << delta << " Error: " << error
<< " Element Count: " << eleCount << endl;
//Generate the output tuple
newTuple->PutAttribute(0, attrValue);
newTuple->PutAttribute(1, attrFrequency);
newTuple->PutAttribute(2, attrDelta);
newTuple->PutAttribute(3, attrError);
newTuple->PutAttribute(4, attrCount);
frequencyList.push_back(newTuple);
}
}
};
template<class T, class S>
int createlossycounterVMT(Word* args, Word& result,
           int message, Word& local, Supplier s){
  // Stream value mapping of ~createlossycounter~.
  // OPEN    : drops a leftover local info; if epsilon (args[2]) is defined
  //           and lies strictly between 0 and 1, a new local info is built
  //           from the stream args[0], the attribute index args[3] and the
  //           attribute type name args[4].
  // REQUEST : hands out the next counter tuple, CANCEL when there is none.
  // CLOSE   : frees the local info.
  // Any other message yields -1.
  typedef createlossycounterInfo<T,S> LocalInfo;
  LocalInfo* lc = static_cast<LocalInfo*>(local.addr);

  if (message == OPEN) {
    if (lc) {
      delete lc;
      local.addr = 0;
    }
    CcReal* epsilon = static_cast<CcReal*>(args[2].addr);
    CcInt* attrIndex = static_cast<CcInt*>(args[3].addr);
    CcString* attrType = static_cast<CcString*>(args[4].addr);
    if (!epsilon->IsDefined()) {
      return 0;
    }
    float error = epsilon->GetValue();
    int index = attrIndex -> GetValue();
    string type = attrType ->GetValue();
    cout << endl;
    if ((error > 0) && (error < 1)) {
      local.addr = new LocalInfo(args[0], error, index, type);
    }
    return 0;
  }

  if (message == REQUEST) {
    result.addr = lc ? lc->next() : 0;
    return result.addr ? YIELD : CANCEL;
  }

  if (message == CLOSE) {
    if (lc) {
      delete lc;
      local.addr = 0;
    }
    return 0;
  }

  return -1;
}
//Value mapping array of ~createlossycounter~. The position of an entry is
//the index returned by createlossycounterSelect:
//0 = int, 1 = real, 2 = string, 3 = bool attribute.
ValueMapping createlossycounterVM[] = {
createlossycounterVMT<CcInt,int>,
createlossycounterVMT<CcReal,float>,
createlossycounterVMT<CcString,string>,
createlossycounterVMT<CcBool,bool>
};
// Selection function of ~createlossycounter~: looks up the type of the
// attribute named by the second argument in the tuple stream's attribute
// list and returns 0 (int), 1 (real), 2 (string), 3 (bool) or -1 for any
// other type. The decision is logged to cout.
int createlossycounterSelect(ListExpr args){
  NList type(args);
  NList attrList = type.first().second().second();
  string attrName = type.second().str();
  ListExpr attrType;
  listutils::findAttribute(attrList.listExpr(),
                           attrName, attrType);
  const string typeName = nl->ToString(attrType);

  cout << endl;
  cout << "LossyCounterSelection identified AttrType as: "
       << typeName << endl;
  cout << endl;

  int selected = -1;
  const char* logLine = 0;
  if (typeName == CcInt::BasicType()) {
    selected = 0;
    logLine = "Returned Int";
  } else if (typeName == CcReal::BasicType()) {
    selected = 1;
    logLine = "Returned Real";
  } else if (typeName == CcString::BasicType()) {
    selected = 2;
    logLine = "Returned String";
  } else if (typeName == CcBool::BasicType()) {
    selected = 3;
    logLine = "Returned bool";
  }

  if (selected < 0) {
    //Unsupported attribute type: nothing is logged
    return -1;
  }
  cout << logLine << endl;
  cout << endl;
  return selected;
}
/*
2.3.8 Operator ~lcfrequent~
*/
template<class T>
class lcFrequentInfo{
public:
lcFrequentInfo(Word inputStream, float minSup, string type):
stream(inputStream), minSupport(minSup), attrType (type),
lastOut(-1){
stream.open();
typeList = nl->TwoElemList(
nl->SymbolAtom(Tuple::BasicType()),
nl->ThreeElemList(
nl->TwoElemList(
nl->SymbolAtom("Value"),
nl->SymbolAtom(attrType)
),
nl->TwoElemList(
nl->SymbolAtom("Frequency"),
nl->SymbolAtom(CcInt::BasicType())
),
nl->TwoElemList(
nl->SymbolAtom("Delta"),
nl->SymbolAtom(CcInt::BasicType())
)
)
);
cout << endl;
cout << "TupleType generated in lcfrequentinfo() looks like: " << endl;
cout << nl->ToString(typeList) << endl;
cout << endl;
cout << "AttrType in lcfrequentinfo() is: " << attrType << endl;
SecondoCatalog* sc = SecondoSystem::GetCatalog();
numTypeList = sc->NumericType(typeList);
tupleType = new TupleType(numTypeList);
init();
}
~lcFrequentInfo() {
for(size_t index = lastOut+1; index < aboveMinSup.size(); index++) {
aboveMinSup[index]->DeleteIfAllowed();
}
tupleType -> DeleteIfAllowed();
stream.close();
}
Tuple* getNext() {
lastOut++;
if(lastOut >= (int)aboveMinSup.size()) {
return 0;
}
Tuple* frequentElement = aboveMinSup[lastOut];
aboveMinSup[lastOut] = 0;
return frequentElement;
}
private:
Stream<Tuple> stream;
float minSupport;
string attrType;
int lastOut;
float epsilon;
int nbrElements;
vector<Tuple*> aboveMinSup;
TupleType* tupleType;
ListExpr typeList;
ListExpr numTypeList;
void init() {
Tuple* oldTuple = stream.request();
epsilon = ((CcReal*) oldTuple->GetAttribute(3))->GetValue();
nbrElements = ((CcInt*) oldTuple->GetAttribute(4))->GetValue();
int frequency = ((CcInt*) oldTuple->GetAttribute(1))->GetValue();
if (isFrequent(frequency)) {
Tuple* newTuple = new Tuple(tupleType);
T* attrValue = (T*) oldTuple->GetAttribute(0);
//Since we are going to delete the old tuple we have to make sure
//the reference count of the Attributes we want to carry over into
//the new tuple does not go to 0.
attrValue -> Copy();
CcInt* attrFrequency = (CcInt*)oldTuple->GetAttribute(1);
attrFrequency -> Copy();
CcInt* attrDelta = (CcInt*) oldTuple->GetAttribute(2);
attrDelta -> Copy();
newTuple->PutAttribute(0, attrValue);
newTuple->PutAttribute(1, attrFrequency);
newTuple->PutAttribute(2, attrDelta);
aboveMinSup.push_back(newTuple);
oldTuple->DeleteIfAllowed();
} else {
oldTuple->DeleteIfAllowed();
}
while ((oldTuple = stream.request()) != nullptr) {
int frequency = ((CcInt*) oldTuple->GetAttribute(1))->GetValue();
if (isFrequent(frequency)) {
Tuple* newTuple = new Tuple(tupleType);
T* attrValue = (T*) oldTuple->GetAttribute(0);
attrValue -> Copy();
CcInt* attrFrequency = (CcInt*) oldTuple->GetAttribute(1);
attrFrequency -> Copy();
CcInt* attrDelta = (CcInt*) oldTuple->GetAttribute(2);
attrDelta -> Copy();
newTuple->PutAttribute(0, attrValue);
newTuple->PutAttribute(1, attrFrequency);
newTuple->PutAttribute(2, attrDelta);
aboveMinSup.push_back(newTuple);
oldTuple->DeleteIfAllowed();
} else {
oldTuple->DeleteIfAllowed();
}
}
}
bool isFrequent(int frequency) {
return (((minSupport-epsilon)*nbrElements) <= frequency);
}
};
template<class T>
int lcfrequentVMT(Word* args, Word& result,
           int message, Word& local, Supplier s){
  // Stream value mapping of ~lcfrequent~.
  // OPEN    : drops a leftover local info; if the minimum support
  //           (args[1]) is defined and lies in (0, 1], a new local info is
  //           built from the stream in args[0] and the attribute type
  //           name in args[2].
  // REQUEST : hands out the next frequent element, CANCEL when none is left.
  // CLOSE   : frees the local info.
  // Any other message yields -1.
  lcFrequentInfo<T>* freqInf = static_cast<lcFrequentInfo<T>*>(local.addr);

  if (message == OPEN) {
    if (freqInf) {
      delete freqInf;
      local.addr = 0;
    }
    CcReal* minSupport = static_cast<CcReal*>(args[1].addr);
    CcString* attrType = static_cast<CcString*>(args[2].addr);
    if (!minSupport->IsDefined()) {
      return 0;
    }
    float minSup = minSupport->GetValue();
    string type = attrType->GetValue();
    cout << "LcFrequentVMT AttrType: " << type << endl;
    if ((minSup > 0) && (minSup <= 1)) {
      local.addr = new lcFrequentInfo<T>(args[0].addr, minSup, type);
    }
    return 0;
  }

  if (message == REQUEST) {
    result.addr = freqInf ? freqInf->getNext() : 0;
    return result.addr ? YIELD : CANCEL;
  }

  if (message == CLOSE) {
    if (freqInf) {
      delete freqInf;
      local.addr = 0;
    }
    return 0;
  }

  return -1;
}
//Value mapping array of ~lcfrequent~. The position of an entry is the index
//returned by lcfrequentSelect for the type of the "Value" attribute:
//0 = int, 1 = real, 2 = string, 3 = bool.
ValueMapping lcfrequentVM[] = {
lcfrequentVMT<CcInt>,
lcfrequentVMT<CcReal>,
lcfrequentVMT<CcString>,
lcfrequentVMT<CcBool>
};
// Selection function of ~lcfrequent~: looks up the type of the attribute
// "Value" in the tuple stream's attribute list and returns 0 (int),
// 1 (real), 2 (string), 3 (bool) or -1 for any other type. The decision is
// logged to cout.
int lcfrequentSelect(ListExpr args){
  NList type(args);
  NList attrList = type.first().second().second();
  string attrName = "Value";
  ListExpr attrType;
  listutils::findAttribute(attrList.listExpr(),
                           attrName, attrType);
  const string typeName = nl->ToString(attrType);

  cout << endl;
  cout << "lcfrequent identified AttrType as: "
       << typeName << endl;
  cout << endl;

  int selected = -1;
  const char* logLine = 0;
  if (typeName == CcInt::BasicType()) {
    selected = 0;
    logLine = "Returned Int";
  } else if (typeName == CcReal::BasicType()) {
    selected = 1;
    logLine = "Returned Real";
  } else if (typeName == CcString::BasicType()) {
    selected = 2;
    logLine = "Returned String";
  } else if (typeName == CcBool::BasicType()) {
    selected = 3;
    logLine = "Returned bool";
  }

  if (selected < 0) {
    //Unsupported attribute type: nothing is logged
    return -1;
  }
  cout << logLine << endl;
  cout << endl;
  return selected;
}
/*
2.3.9 Operator ~outlier~
*/
// Local info of ~outlier~. T is the attribute class of the inspected
// attribute, S the plain value type obtained from T::GetValue().
// The constructor consumes the whole input stream, maintains running mean
// and variance and keeps a (Value, StreamIndex) tuple for every element
// whose z-score exceeds the threshold; next() hands these tuples out.
template<class T, class S>
class outlierInfo{
  public:
    // inputStream: tuple stream to inspect, zScoreInput: z-score threshold,
    // index: position of the inspected attribute, type: name of its
    // attribute type (used for the "Value" column of the result)
    outlierInfo(Word inputStream, int zScoreInput, int index, string type):
      stream(inputStream), zThreshhold(zScoreInput), attrIndex(index),
      attrType(type), mean(0), variance(0), lastOut(-1), counter(0){
      stream.open();
      //We appended the attribute Type in the TM and use it to create our
      //output tuple
      typeList = nl->TwoElemList(
                   nl->SymbolAtom(Tuple::BasicType()),
                   nl->TwoElemList(
                     nl->TwoElemList(
                       nl->SymbolAtom("Value"),
                       nl->SymbolAtom(attrType)
                     ),
                     nl->TwoElemList(
                       nl->SymbolAtom("StreamIndex"),
                       nl->SymbolAtom(CcInt::BasicType())
                     )
                   )
                 );
      SecondoCatalog* sc = SecondoSystem::GetCatalog();
      numTypeList = sc->NumericType(typeList);
      tupleType = new TupleType(numTypeList);
      init();
    }

    // Releases the result tuples that were not handed out, the tuple type
    // and the input stream
    ~outlierInfo() {
      for(size_t index = lastOut+1; index < outlierHistory.size(); index++) {
        outlierHistory[index]->DeleteIfAllowed();
      }
      //The reference taken by "new TupleType" is released exactly once,
      //independent of how many result tuples are still pending
      tupleType -> DeleteIfAllowed();
      stream.close();
    }

    // Returns the next outlier tuple, or 0 once all of them were handed
    // out. The returned tuple is no longer referenced by this object.
    Tuple* next() {
      lastOut++;
      if (lastOut >= (int) outlierHistory.size()) {
        return 0;
      }
      Tuple* outlier = outlierHistory[lastOut];
      outlierHistory[lastOut] = 0;
      return outlier;
    }

  private:
    Stream<Tuple> stream;
    int zThreshhold;
    int attrIndex;
    string attrType;
    double mean;
    double variance;
    int lastOut;
    size_t counter;
    vector<Tuple*> outlierHistory;
    TupleType* tupleType;
    ListExpr typeList;
    ListExpr numTypeList;

    // Consumes the input stream. The first element is always reported,
    // since nothing is known about the stream at that point; every later
    // element is reported if checkData() flags it. An empty stream yields
    // an empty result.
    void init() {
      bool firstTuple = true;
      Tuple* oldTuple;
      while ((oldTuple = stream.request()) != nullptr) {
        T* attrValue = (T*) oldTuple->GetAttribute(attrIndex);
        //We use the second template type to be able to handle the values
        S value = (S) attrValue->GetValue();
        if (firstTuple || checkData(value)) {
          report(attrValue);
        }
        firstTuple = false;
        //Every input tuple is released here; reported values hold their
        //own reference (see report)
        oldTuple -> DeleteIfAllowed();
        updateData(value);
      }
    }

    // Appends a (Value, StreamIndex) tuple for ~attrValue~; the stream
    // index is the number of elements processed before this one
    void report(T* attrValue) {
      Tuple* newTuple = new Tuple(tupleType);
      //Copy() provides a reference of its own for the new tuple, so the
      //input tuple can be released independently
      newTuple -> PutAttribute(0, attrValue->Copy());
      newTuple -> PutAttribute(1, new CcInt(true, counter));
      outlierHistory.push_back(newTuple);
    }

    /*Check whether the currently handled Streamelements
    z-Score surpases our treshholds
    */
    bool checkData(S data) {
      double zscore = 0;
      //NOTE(review): a value of exactly 0 gets the z-score 0 and is thus
      //only reported for a negative threshold - kept from the original
      //implementation, confirm that this is intended
      if (data != 0) {
        if (!(variance > 0)) {
          //No measurable spread so far: the z-score is not computable.
          //Any deviation from the mean is treated as an outlier.
          return data != mean;
        }
        //The z-score is truncated towards zero as before, but stays a
        //double so that very large scores cannot overflow
        zscore = trunc((data - mean)/sqrt(variance));
      }
      return (zscore < (-1*zThreshhold)) || (zscore > zThreshhold);
    }

    //Update mean, variance and the counter
    void updateData(S data) {
      //Calculate the first part of the variance for the i+1th element
      variance = ((variance + pow(mean, 2)) * counter +
                  pow(data, 2))/(counter+1);
      //Calculate the mean for the i+1th element
      mean = ((mean * counter) + data)/(counter+1);
      //finish the calculation of the variance
      variance = variance - pow(mean,2);
      counter++;
    }
};
template<class T, class S> int
outlierVMT(Word* args, Word& result,
           int message, Word& local, Supplier s){
  // Stream value mapping of ~outlier~.
  // OPEN    : drops a leftover local info; if the threshold (args[2]) is
  //           defined and greater than 1, a new local info is built from
  //           the stream args[0], the attribute index args[3] and the
  //           attribute type name args[4].
  // REQUEST : hands out the next outlier tuple, CANCEL when none is left.
  // CLOSE   : frees the local info.
  // Any other message yields -1.
  typedef outlierInfo<T,S> LocalInfo;
  LocalInfo* outlier = static_cast<LocalInfo*>(local.addr);

  if (message == OPEN) {
    if (outlier) {
      delete outlier;
      local.addr = 0;
    }
    CcInt* threshold = static_cast<CcInt*>(args[2].addr);
    CcInt* attrIndex = static_cast<CcInt*>(args[3].addr);
    CcString* attrType = static_cast<CcString*>(args[4].addr);
    if (!threshold->IsDefined()) {
      return 0;
    }
    int zThreshold = threshold->GetValue();
    int index = attrIndex -> GetValue();
    string type = attrType ->GetValue();
    cout << endl;
    if (zThreshold > 1) {
      local.addr = new LocalInfo(args[0], zThreshold, index, type);
    }
    return 0;
  }

  if (message == REQUEST) {
    result.addr = outlier ? outlier->next() : 0;
    return result.addr ? YIELD : CANCEL;
  }

  if (message == CLOSE) {
    if (outlier) {
      delete outlier;
      local.addr = 0;
    }
    return 0;
  }

  return -1;
}
//Value mapping array of ~outlier~. The position of an entry is the index
//returned by outlierSelect: 0 = int attribute, 1 = real attribute.
ValueMapping outlierVM[] = {
outlierVMT<CcInt, int>,
outlierVMT<CcReal, float>
};
// Selection function of ~outlier~: looks up the type of the attribute
// named by the second argument in the tuple stream's attribute list and
// returns 0 for int, 1 for real and -1 for any other type.
int outlierSelect(ListExpr args){
  NList type(args);
  NList attrList = type.first().second().second();
  string attrName = type.second().str();
  ListExpr attrType;
  listutils::findAttribute(attrList.listExpr(),
                           attrName, attrType);
  const string typeName = nl->ToString(attrType);
  if (typeName == CcInt::BasicType()) {
    return 0;
  }
  return (typeName == CcReal::BasicType()) ? 1 : -1;
}
/*
2.4 Description of Operators
*/
//Specification of ~reservoir~; the four strings are, in this order,
//signature, syntax, meaning and example query
OperatorSpec reservoirSpec(
"stream(T) x int -> stream(T), T = TUPLE or T = DATA",
"_ reservoir [_] ",
"Creates a reservoir sample of a supplied stream of a given size ",
"query intstream(1,1000) reservoir[10] count"
);
//Specification of ~createbloomfilter~ (signature, syntax, meaning,
//example). The two meaning literals are adjacent on purpose so that they
//form ONE string; a comma between them would shift the example into the
//remark slot. The operator takes an attribute name and the false positive
//probability in brackets.
OperatorSpec createbloomfilterSpec(
  "stream(tuple(X)) x ATTR x real -> bloomfilter",
  "_ createbloomfilter [_,_]",
  "Creates a Bloomfilter of a supplied stream with the given "
  "False Positive rate for the expected number of inserts",
  "query Kinos feed createbloomfilter[Name,0.01] bloomcontains[\"Astor\"]"
);
//Specification of ~bloomcontains~ (signature, syntax, meaning, example).
//The example builds the filter with the two bracket arguments that
//~createbloomfilter~ evaluates (attribute name, false positive rate).
OperatorSpec bloomcontainsSpec(
  "scalablebloomfilter x T -> bool, T = TUPLE or T = DATA",
  "_ bloomcontains [_]",
  "Checks for the presence of Element T in a supplied Bloomfilter",
  "query Kinos feed createbloomfilter[Name,0.01] bloomcontains[\"Astor\"]"
);
//Specification of ~createcountmin~ (signature, syntax, meaning, example).
//Both numeric parameters (epsilon, delta) are read as real values by the
//value mapping.
OperatorSpec createcountminSpec(
  "stream(tuple(X)) x ATTR x real x real -> countminsketch",
  "_ createcountmin [_,_,_]",
  "Creates a Count Min Sketch for the supplied stream",
  "query Kinos feed createcountmin[Name,0.01,0.9] cmscount[\"Astor\"]"
);
//Specification of ~cmscount~ (signature, syntax, meaning, example).
//The two meaning literals are adjacent on purpose so that they form ONE
//string; the example is a separate argument. The value mapping stores the
//estimate in an int result.
OperatorSpec cmscountSpec(
  "countminsketch x T -> int, T = TUPLE or T = DATA",
  "_ cmscount [_]",
  "Gives an estimate of how often an Element appeared in the Stream the "
  "Count Min Sketch was created for",
  "query Kinos feed createcountmin[Name,0.01,0.9] cmscount[\"Astor\"]"
);
//Specification of ~createams~ (signature, syntax, meaning, example).
//Both numeric parameters (epsilon, delta) are read as real values by the
//value mapping.
OperatorSpec createamsSpec(
  "stream(tuple(X)) x ATTR x real x real -> amssketch",
  "_ createams [_,_,_]",
  "Creates an AMS Sketch for the supplied stream",
  "query Kinos feed createams[Name,0.01,0.9] amsestimate"
);
//Specification of ~amsestimate~ (signature, syntax, meaning, example).
//The two meaning literals are adjacent on purpose so that they form ONE
//string and the example slot holds a real query.
OperatorSpec amsestimateSpec(
  "amssketch -> real",
  "_ amsestimate ",
  "Creates an estimate of the F_2 Moment of the "
  "given AMS-Sketch",
  "query Kinos feed createams[Name,0.01,0.9] amsestimate"
);
//Specification of ~createlossycounter~ (signature, syntax, meaning,
//example). The value mapping is a stream operator: it returns one tuple
//per counter entry.
OperatorSpec createlossycounterSpec(
  "stream(tuple(X)) x ATTR x real -> stream(tuple(Y))",
  "_ createlossycounter [_,_]",
  "Creates a lossy Counter for the supplied stream and returns its "
  "entries (Value, Frequency, Delta, Error, Element Count)",
  "query Kinos feed createlossycounter[Name,0.01] lcfrequent[0.1] consume"
);
//Specification of ~lcfrequent~ (signature, syntax, meaning, example).
//The operator consumes the tuple stream produced by ~createlossycounter~
//and takes the minimum support as its bracket argument.
OperatorSpec lcfrequentSpec(
  "stream(tuple(Y)) x real -> stream(tuple(Z))",
  "_ lcfrequent [_]",
  "Displays the items determined to be frequent by the lossy Counter",
  "query Kinos feed createlossycounter[Name,0.01] lcfrequent[0.1] consume"
);
//Specification of ~outlier~ (signature, syntax, meaning, example).
//The two meaning literals are adjacent on purpose so that they form ONE
//string; a comma between them would shift the example into the remark
//slot. The value mapping expects a tuple stream, an attribute name and an
//int threshold greater than 1.
OperatorSpec outlierSpec(
  "stream(tuple(X)) x ATTR x int -> "
  "stream(tuple((Value T)(StreamIndex int))), T = int or T = real",
  "_ outlier [_,_]",
  "Determines outliers for an int/real attribute according to the "
  "passed Z-Score Threshold.",
  "query intstream(0,100) transformstream outlier[Elem,2] consume"
);
/*
2.5 Operator Instances
*/
//Overloaded operators are constructed from: name, specification, number of
//value mappings, value mapping array, selection function, type mapping.
//Operators with a single value mapping are constructed from: name,
//specification, value mapping, Operator::SimpleSelect, type mapping.

//~reservoir~: two value mappings (tuple stream / attribute stream)
Operator reservoirOp(
"reservoir",
reservoirSpec.getStr(),
2,
reservoirVM,
reservoirSelect,
reservoirTM
);
//~createbloomfilter~: single value mapping
Operator createbloomfilterOp(
"createbloomfilter",
createbloomfilterSpec.getStr(),
createbloomfilterVM,
Operator::SimpleSelect,
createbloomfilterTM
);
//~bloomcontains~: single value mapping
Operator bloomcontainsOp(
"bloomcontains",
bloomcontainsSpec.getStr(),
bloomcontainsVM,
Operator::SimpleSelect,
bloomcontainsTM
);
//~createcountmin~: single value mapping
Operator createcountminOp(
"createcountmin",
createcountminSpec.getStr(),
createcountminVM,
Operator::SimpleSelect,
createcountminTM
);
//~cmscount~: single value mapping
Operator cmscountOp(
"cmscount",
cmscountSpec.getStr(),
cmscountVM,
Operator::SimpleSelect,
cmscountTM
);
//~createams~: single value mapping
Operator createamsOp(
"createams",
createamsSpec.getStr(),
createamsVM,
Operator::SimpleSelect,
createamsTM
);
//~amsestimate~: single value mapping
Operator amsestimateOp(
"amsestimate",
amsestimateSpec.getStr(),
amsestimateVM,
Operator::SimpleSelect,
amsestimateTM
);
//~createlossycounter~: four value mappings (int, real, string, bool)
Operator createlossycounterOp(
"createlossycounter",
createlossycounterSpec.getStr(),
4,
createlossycounterVM,
createlossycounterSelect,
createlossycounterTM
);
//~lcfrequent~: four value mappings (int, real, string, bool)
Operator lcfrequentOp(
"lcfrequent",
lcfrequentSpec.getStr(),
4,
lcfrequentVM,
lcfrequentSelect,
lcfrequentTM
);
//~outlier~: two value mappings (int, real)
Operator outlierOp(
"outlier",
outlierSpec.getStr(),
2,
outlierVM,
outlierSelect,
outlierTM
);
/*
2.6 The algebra class
*/
class StreamMiningAlgebra : public Algebra
{
public:
StreamMiningAlgebra() : Algebra()
{
//Reigstration of Types
AddTypeConstructor(&scalableBloomFilterTC);
AddTypeConstructor(&countMinSketchTC);
AddTypeConstructor(&amsSketchTC);
//Usage possibilities of the Types
scalableBloomFilterTC.AssociateKind(Kind::SIMPLE());
countMinSketchTC.AssociateKind(Kind::SIMPLE());
amsSketchTC.AssociateKind(Kind::SIMPLE());
//Registration of Operators
AddOperator(&reservoirOp);
AddOperator(&createbloomfilterOp);
AddOperator(&bloomcontainsOp);
AddOperator(&createcountminOp);
AddOperator(&cmscountOp);
AddOperator(&createamsOp);
AddOperator(&amsestimateOp);
AddOperator(&createlossycounterOp);
AddOperator(&lcfrequentOp);
AddOperator(&outlierOp);
}
~StreamMiningAlgebra() {};
};
} // end of namespace eschbach
/*
3 Initialization
*/
extern "C"
Algebra*
InitializeStreamMiningAlgebra( NestedList* nlRef, QueryProcessor* qpRef )
{
  // Entry point with C linkage: stores the nested list and query processor
  // references in the globals ~nl~ and ~qp~ and returns a newly allocated
  // algebra object.
  nl = nlRef;
  qp = qpRef;
  Algebra* algebra = new eschbach::StreamMiningAlgebra();
  return algebra;
}