Files
secondo/Algebras/AssociationAnalysis/GenTransactions.cpp

240 lines
8.5 KiB
C++
Raw Normal View History

2026-01-23 17:03:45 +08:00
/*
----
This file is part of SECONDO.
Copyright (C) 2021, University in Hagen, Department of Computer Science,
Database Systems for New Applications.
SECONDO is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
SECONDO is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with SECONDO; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
----
//paragraph [1] Title: [{\Large \bf \begin {center}] [\end {center}}]
[1] Association Analysis Algebra Implementation
January 2021 - April 2021, P. Fedorow for bachelor thesis.
*/
#include "GenTransactions.h"
#include "Algebras/Collection/IntSet.h"
#include "Algebras/Relation-C++/RelationAlgebra.h" // rel, trel, tuple
#include "Common.h"
#include "NList.h"
#include "NestedList.h"
#include "StandardTypes.h"
namespace AssociationAnalysis {
// Returns the list representation of the tuple type that is used for the
// generated transactions: tuple(Id: int, Itemset: intset)
ListExpr genTransactionsTupleType() {
NList attrs = NList(
NList(NList().symbolAtom("Id"), NList().symbolAtom(CcInt::BasicType())),
NList(NList().symbolAtom("Itemset"),
NList().symbolAtom(collection::IntSet::BasicType())));
ListExpr type = NList().tupleOf(attrs).listExpr();
return type;
}
// Type mapping for the genTransactions operator.
ListExpr genTransactionsTM(ListExpr args) {
NList type(args);
if (type.length() == 5) {
for (int i = 1; i <= 5; i += 1) {
const NList &arg = type.elem(i);
if (!arg.first().isSymbol(CcInt::BasicType()) ||
arg.second().intval() < 1) {
return NList::typeError("Argument number " + std::to_string(i) +
" must be of type int and >= 1.");
}
}
} else {
return NList::typeError("5 arguments expected but " +
std::to_string(type.length()) + " received.");
}
NList tupleType = NList(genTransactionsTupleType());
return NList().streamOf(tupleType).listExpr();
}
// Value mapping for the genTransactions operator.
int genTransactionsVM(Word *args, Word &result, int message, Word &local,
Supplier s) {
auto *li = (getTransactionsLI *)local.addr;
switch (message) {
case OPEN: {
delete li;
local.addr = new getTransactionsLI(
((CcInt *)args[0].addr)->GetIntval(), // numOfTransactions
((CcInt *)args[1].addr)->GetIntval(), // transactionSizeMean
((CcInt *)args[2].addr)->GetIntval(), // frequentItemsetSizeMean
((CcInt *)args[3].addr)->GetIntval(), // numOfFrequentItemsets
((CcInt *)args[4].addr)->GetIntval() // numOfItems
);
return 0;
}
case REQUEST:
result.addr = li ? li->getNext() : nullptr;
return result.addr ? YIELD : CANCEL;
case CLOSE:
delete li;
local.addr = nullptr;
return 0;
default:
return 0;
}
}
// Prepares the set of potentially frequent itemsets and everything else that is
// needed for the generation of transactions.
getTransactionsLI::getTransactionsLI(size_t numOfTransactions,
size_t transactionSizeMean,
size_t frequentItemsetSizeMean,
size_t numOfFrequentItemsets,
size_t numOfItems)
: numOfTransactions(numOfTransactions), t(0),
genTransactionSize((int)transactionSizeMean - 1),
allowOversizedTransaction(false) {
// Prepare the random number distributions.
std::poisson_distribution<int> genFrequentItemsetSize(
(int)frequentItemsetSizeMean - 1);
std::uniform_int_distribution<int> genItem(1, numOfItems);
std::exponential_distribution<float> genReuseFraction(2);
std::exponential_distribution<double> genWeight(1);
this->potentialFrequentItemsets.reserve(numOfFrequentItemsets);
// Generate the first frequent itemset.
{
size_t size = genFrequentItemsetSize(this->gen) + 1;
std::vector<int> itemset;
itemset.reserve(size);
for (size_t i = 0; i < size; i += 1) {
itemset.push_back(genItem(this->gen));
}
this->potentialFrequentItemsets.push_back(itemset);
}
// Generate the rest of the frequent itemsets.
for (size_t i = 0; i < numOfFrequentItemsets; i += 1) {
size_t size = genFrequentItemsetSize(this->gen) + 1;
std::vector<int> itemset;
// Reuse items from the previously generated itemset.
std::vector<int> &prevItemset = this->potentialFrequentItemsets[i];
shuffle(prevItemset.begin(), prevItemset.end(), this->gen);
size_t reuseNum = std::round(std::min(genReuseFraction(this->gen), 1.0f) *
prevItemset.size());
for (size_t j = 0; j < reuseNum; j += 1) {
itemset.push_back(prevItemset[j]);
}
// Fill the itemset with random items until the desired size is
// reached.
while (itemset.size() < size) {
itemset.push_back(genItem(this->gen));
}
this->potentialFrequentItemsets.push_back(itemset);
}
// Setup random number distribution for the itemset selection.
std::vector<double> itemsetWeights;
itemsetWeights.reserve(numOfFrequentItemsets);
for (size_t i = 0; i < numOfFrequentItemsets; i += 1) {
itemsetWeights.push_back(genWeight(this->gen));
}
this->genPotentialFrequentItemset.~discrete_distribution();
new (&this->genPotentialFrequentItemset)
std::discrete_distribution(itemsetWeights.begin(), itemsetWeights.end());
// Setup corruption levels.
std::normal_distribution randCorruptionLevel(0.5, (double)std::sqrt(0.1L));
this->corruptionLevels.reserve(numOfFrequentItemsets);
for (size_t i = 0; i < numOfFrequentItemsets; i += 1) {
this->corruptionLevels.push_back(
std::clamp(randCorruptionLevel(this->gen), 0.0, 1.0));
}
// Setup resulting tuple type.
this->tupleType = new TupleType(
SecondoSystem::GetCatalog()->NumericType(transactionsTupleType()));
}
// Returns the next generated transaction as a tuple.
Tuple *getTransactionsLI::getNext() {
std::uniform_real_distribution<double> randCorruptionLevel(0, 1);
if (this->t < this->numOfTransactions) {
size_t transactionSize = this->genTransactionSize(this->gen) + 1;
collection::IntSet transaction(true);
// Fill transaction with items from the potential frequent itemsets
// till the desired transaction size is reached.
size_t count = 0; // counter to ensure termination
while (transaction.getSize() < transactionSize &&
count < this->potentialFrequentItemsets.size()) {
count += 1;
int index = this->genPotentialFrequentItemset(this->gen);
std::vector<int> &itemset = this->potentialFrequentItemsets[index];
// Create a corrupted itemset.
shuffle(itemset.begin(), itemset.end(), this->gen);
auto end = itemset.cend();
while (end != itemset.cbegin() &&
(randCorruptionLevel(this->gen) < this->corruptionLevels[index])) {
end -= 1;
}
collection::IntSet corruptedItemset(std::set<int>(itemset.cbegin(), end));
if (transaction.add(corruptedItemset).getSize() > transactionSize) {
if (transaction.getSize() == 0 &&
transactionSize < corruptedItemset.getSize() &&
!this->allowOversizedTransaction) {
// We would end up with an empty transaction here,
// because the generated transaction is still empty and
// the itemset that we want to add is larger than the
// desired transaction size. To prevent this we continue
// with the next itemset.
continue;
} else {
// Allow an oversized transaction in half the cases.
if (this->allowOversizedTransaction) {
transaction = transaction.add(corruptedItemset);
}
this->allowOversizedTransaction = !this->allowOversizedTransaction;
}
break;
} else {
transaction = transaction.add(corruptedItemset);
}
}
auto tuple = new Tuple(this->tupleType);
tuple->PutAttribute(0, new CcInt((int)this->t));
tuple->PutAttribute(1, new collection::IntSet(transaction));
this->t += 1;
return tuple;
} else {
return nullptr;
}
}
} // namespace AssociationAnalysis