609 lines
22 KiB
C++
609 lines
22 KiB
C++
/*
----
This file is part of SECONDO.

Copyright (C) 2021, University in Hagen, Department of Computer Science,
Database Systems for New Applications.

SECONDO is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

SECONDO is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with SECONDO; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
----

//paragraph [1] Title: [{\Large \bf \begin {center}] [\end {center}}]

[1] Association Analysis Algebra Implementation

January 2021 - April 2021, P. Fedorow for bachelor thesis.

*/
|
|
|
|
#include "Apriori.h"

#include "Common.h"

#include "StandardTypes.h"

#include <algorithm>
#include <iterator>
#include <memory>
#include <type_traits>
#include <utility>
|
|
|
|
namespace AssociationAnalysis {
|
|
|
|
// Bit flags that switch individual optimizations of the apriori
// implementation off. The flags are combined with bitwise or and tested with
// bitwise and against the integer "deoptimize" argument of the operator.
enum Deoptimize {
  // Look items up directly in the transaction instead of in a bitmap.
  NoTransactionBitmap = 0x1,
  // Keep the candidates in a plain vector instead of a hash-tree.
  NoHashTree = 0x2,
  // Skip the prune step of the candidate generation.
  NoPruning = 0x4,
  // Do not count the 2-itemsets with a triangular matrix.
  NoTriangularMatrix = 0x8,
};
|
|
|
|
// An itemset represented as a growable bitmap: every item owns exactly one
// bit, and the bit is set while the item is a member of the itemset.
class ItemsetBitmap {
public:
  // Adds the given item to the itemset, growing the bitmap on demand.
  void insert(unsigned long item) {
    const unsigned long word = wordOf(item);
    if (bitmap.size() <= word) {
      bitmap.resize(word + 1);
    }
    bitmap[word] |= bitOf(item);
  }

  // Returns true if the itemset contains the given item. Items that lie
  // beyond the currently allocated bitmap are reported as absent.
  [[nodiscard]] bool contains(unsigned long item) const {
    const unsigned long word = wordOf(item);
    return word < bitmap.size() && (bitmap[word] & bitOf(item)) != 0;
  }

  // Removes all items but keeps the allocated memory so the bitmap can be
  // reused efficiently.
  void reset() { bitmap.assign(bitmap.size(), 0ul); }

private:
  // Number of items that fit into one element of the bitmap vector.
  static constexpr unsigned long BITS_PER_WORD = sizeof(unsigned long) * 8;

  // Item k is stored in the k-th bit of the bitmap (counting up from the
  // least significant bit of the first vector element).
  std::vector<unsigned long> bitmap;

  // Position of the vector element that holds the bit of the given item.
  static unsigned long wordOf(unsigned long item) {
    return item / BITS_PER_WORD;
  }

  // Mask that selects the bit of the given item inside its vector element.
  static unsigned long bitOf(unsigned long item) {
    return 1ul << (item % BITS_PER_WORD);
  }
};
|
|
|
|
// Inserts the itemset into the hash tree. All inserted itemsets have to be of
|
|
// the same size.
|
|
void ItemsetHashTree::insert(const std::vector<int> &itemset) {
|
|
this->insert(itemset, 0, 0);
|
|
}
|
|
|
|
// Returns all itemsets that potentially can be part of the given transaction.
|
|
std::vector<std::reference_wrapper<const std::vector<int>>>
|
|
ItemsetHashTree::subset(const collection::IntSet *transaction) const {
|
|
std::vector<std::reference_wrapper<const std::vector<int>>> itemsets;
|
|
this->subset(transaction, 0, 0, itemsets);
|
|
return itemsets;
|
|
}
|
|
|
|
// Returns true if the hash-tree is empty.
|
|
bool ItemsetHashTree::empty() const {
|
|
// The hash-tree starts with an empty leaf node.
|
|
return this->nodes[0].isLeaf && this->nodes[0].itemsets.empty();
|
|
}
|
|
|
|
// Inserts the given itemset into the given node.
|
|
//
|
|
// If the node is not a leaf the function will recurse down the tree until a
|
|
// leaf is reached and the insertion can happen. When recursion occurs the depth
|
|
// parameter is incremented, so the depth parameter keeps track of the depth
|
|
// at which the node is located in the hash-tree.
|
|
void ItemsetHashTree::insert(const std::vector<int> &itemset, size_t node,
|
|
size_t depth) {
|
|
// The hash-tree can only grow to a depth of itemset size + 1.
|
|
assert(depth <= itemset.size());
|
|
|
|
if (!this->nodes[node].isLeaf) {
|
|
// The current node is not a leaf so we can't insert the itemset here. We
|
|
// have to insert the itemset into one of its children. At depth k of the
|
|
// hash-tree we hash the itemsets k-th item to determine the child.
|
|
size_t child = itemset[depth] % CHILDREN_NUM;
|
|
this->insert(itemset, this->nodes[node].children[child], depth + 1);
|
|
} else {
|
|
// The current node is a leaf so we can insert the itemset here.
|
|
this->nodes[node].itemsets.push_back(itemset);
|
|
|
|
// If the itemset capacity of the node (specified by ITEMSET_THRESHOLD) is
|
|
// reached we break the node up by moving the contained itemsets into new
|
|
// children-nodes.
|
|
//
|
|
// If the hash-tree reached its maximal depth (itemset size + 1) at the
|
|
// current node no further break-up can occur.
|
|
if (this->nodes[node].itemsets.size() == ITEMSET_THRESHOLD &&
|
|
depth < itemset.size()) {
|
|
|
|
// Create new nodes and mark them as children of the current node.
|
|
this->nodes[node].children.resize(CHILDREN_NUM);
|
|
this->nodes.resize(this->nodes.size() + CHILDREN_NUM);
|
|
for (size_t i = 0; i < CHILDREN_NUM; i += 1) {
|
|
this->nodes[node].children[i] = this->nodes.size() - CHILDREN_NUM + i;
|
|
}
|
|
|
|
// Before we insert the itemsets into their new nodes, we must safe the
|
|
// itemsets vector. This is necessary to prevent invalidation problems
|
|
// (the nodes vector can grow while insertion and trigger a reallocation)
|
|
// while we iterate over the itemsets vector. We safe the itemsets vector
|
|
// by moving it into a local variable.
|
|
std::vector<std::vector<int>> itemsets(move(this->nodes[node].itemsets));
|
|
|
|
// Insert the itemsets into their new homes.
|
|
for (const std::vector<int> &itemset : itemsets) {
|
|
// As we are depth k of the hash-tree we hash the itemsets k-th item
|
|
// to determine the child-node for the itemset.
|
|
size_t child = itemset[depth] % CHILDREN_NUM;
|
|
this->insert(itemset, this->nodes[node].children[child], depth + 1);
|
|
}
|
|
|
|
// This node is no longer a leaf now.
|
|
this->nodes[node].isLeaf = false;
|
|
}
|
|
}
|
|
}
|
|
|
|
// Collects all itemsets that potentially can be part of the given transaction.
|
|
//
|
|
// To do this the hash-tree is traversed top-down and each child that might
|
|
// contain an itemset that is part of the given transaction is visited.
|
|
void ItemsetHashTree::subset(
|
|
const collection::IntSet *transaction, size_t node, size_t depth,
|
|
std::vector<std::reference_wrapper<const std::vector<int>>> &itemsets)
|
|
const {
|
|
if (this->nodes[node].isLeaf) {
|
|
// The current node is a leaf collect all contained itemsets.
|
|
for (const std::vector<int> &itemset : this->nodes[node].itemsets) {
|
|
itemsets.emplace_back(itemset);
|
|
}
|
|
} else {
|
|
// The current node is not a leaf so we have to visit all children that
|
|
// might contain an itemset that is part of the given transaction.
|
|
|
|
// The depth at which we are in the hash-tree determines how many items of
|
|
// the transaction we already handled and therefore can ignore. If we are
|
|
// the depth k we can ignore the first k items of the transaction and have
|
|
// to visit all children that could contain an itemset with one of the
|
|
// transactions remaining items. So we hash the remaining items to figure
|
|
// out which children to visit specifically.
|
|
std::vector<bool> visit(CHILDREN_NUM, false);
|
|
for (size_t i = depth; i < transaction->getSize(); i += 1) {
|
|
visit[transaction->get(i) % CHILDREN_NUM] = true;
|
|
}
|
|
|
|
// Visit the children to collect itemsets that potentially can be part of
|
|
// the given transaction.
|
|
for (size_t child = 0; child < CHILDREN_NUM; child += 1) {
|
|
if (visit[child]) {
|
|
this->subset(transaction, this->nodes[node].children[child], depth + 1,
|
|
itemsets);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Generates candidates of size k+1 given frequent itemsets of size k.
|
|
template <class Candidates>
|
|
Candidates genCandidates(const std::set<std::vector<int>> &prevFrequentItemsets,
|
|
bool noPruning) {
|
|
if (noPruning) {
|
|
Candidates candidates;
|
|
for (const std::vector<int> &itemset1 : prevFrequentItemsets) {
|
|
for (const std::vector<int> &itemset2 : prevFrequentItemsets) {
|
|
if (&itemset1 == &itemset2) {
|
|
continue;
|
|
}
|
|
auto last1 = --itemset1.cend();
|
|
auto last2 = --itemset2.cend();
|
|
if (*last1 < *last2) {
|
|
auto it1 = itemset1.cbegin();
|
|
auto it2 = itemset2.cbegin();
|
|
bool match = true;
|
|
while (it1 != last1 && it2 != last2) {
|
|
if (*it1 != *it2) {
|
|
match = false;
|
|
break;
|
|
}
|
|
it1++;
|
|
it2++;
|
|
}
|
|
if (match) {
|
|
std::vector<int> itemset;
|
|
itemset.reserve(itemset1.size() + 1);
|
|
auto it = itemset1.cbegin();
|
|
while (it != itemset1.cend() && *it < *last2) {
|
|
it++;
|
|
}
|
|
itemset.insert(itemset.cend(), itemset1.cbegin(), it);
|
|
itemset.push_back(*last2);
|
|
itemset.insert(itemset.cend(), it, itemset1.cend());
|
|
if constexpr (std::is_same<Candidates, ItemsetHashTree>::value) {
|
|
candidates.insert(itemset);
|
|
} else {
|
|
candidates.push_back(itemset);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
return candidates;
|
|
} else {
|
|
std::vector<std::vector<int>> potentialCandidates;
|
|
// join step
|
|
for (const std::vector<int> &itemset1 : prevFrequentItemsets) {
|
|
for (const std::vector<int> &itemset2 : prevFrequentItemsets) {
|
|
if (&itemset1 == &itemset2) {
|
|
continue;
|
|
}
|
|
auto last1 = --itemset1.cend();
|
|
auto last2 = --itemset2.cend();
|
|
if (*last1 < *last2) {
|
|
auto it1 = itemset1.cbegin();
|
|
auto it2 = itemset2.cbegin();
|
|
bool match = true;
|
|
while (it1 != last1 && it2 != last2) {
|
|
if (*it1 != *it2) {
|
|
match = false;
|
|
break;
|
|
}
|
|
it1++;
|
|
it2++;
|
|
}
|
|
if (match) {
|
|
std::vector<int> itemset;
|
|
itemset.reserve(itemset1.size() + 1);
|
|
auto it = itemset1.cbegin();
|
|
while (it != itemset1.cend() && *it < *last2) {
|
|
it++;
|
|
}
|
|
itemset.insert(itemset.cend(), itemset1.cbegin(), it);
|
|
itemset.push_back(*last2);
|
|
itemset.insert(itemset.cend(), it, itemset1.cend());
|
|
potentialCandidates.push_back(itemset);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
// prune step
|
|
Candidates candidates;
|
|
for (const std::vector<int> &candidate : potentialCandidates) {
|
|
bool prune = false;
|
|
for (size_t i = 0; i < candidate.size(); i += 1) {
|
|
std::vector<int> subset(candidate.size() - 1);
|
|
for (size_t j = 0; j < subset.size(); j += 1) {
|
|
if (j < i) {
|
|
subset[j] = candidate[j];
|
|
} else {
|
|
subset[j] = candidate[j + 1];
|
|
}
|
|
}
|
|
if (prevFrequentItemsets.count(subset) == 0) {
|
|
prune = true;
|
|
break;
|
|
}
|
|
}
|
|
if (!prune) {
|
|
if constexpr (std::is_same<Candidates, ItemsetHashTree>::value) {
|
|
candidates.insert(candidate);
|
|
} else {
|
|
candidates.push_back(candidate);
|
|
}
|
|
}
|
|
}
|
|
return candidates;
|
|
}
|
|
}
|
|
|
|
// Finds all frequent itemsets that satisfy the support given by minSupport.
|
|
// The itemset of a transaction is extracted from each tuple of the relation
|
|
// by an index given by itemsetAttr.
|
|
aprioriLI::aprioriLI(GenericRelation *relation, int minSupport, int itemsetAttr,
|
|
int deoptimize) {
|
|
int transactionCount = relation->GetNoTuples();
|
|
|
|
int k = 0;
|
|
std::vector<std::set<std::vector<int>>> prevFrequentItemsets;
|
|
prevFrequentItemsets.resize(2);
|
|
|
|
if (deoptimize & Deoptimize::NoTriangularMatrix) {
|
|
// Generate the frequent 1-Itemsets.
|
|
// Count how many transactions contain any given item.
|
|
std::unique_ptr<GenericRelationIterator> rit(relation->MakeScan());
|
|
Tuple *t;
|
|
std::unordered_map<int, int> counts;
|
|
while ((t = rit->GetNextTuple()) != nullptr) {
|
|
auto transaction = (collection::IntSet *)t->GetAttribute(itemsetAttr);
|
|
for (int i = 0; i < (int)transaction->getSize(); i += 1) {
|
|
counts[transaction->get(i)] += 1;
|
|
}
|
|
t->DeleteIfAllowed();
|
|
}
|
|
// Add any item as an 1-itemset to the frequent itemsets if it satisfies
|
|
// the minimum support.
|
|
for (auto const &[item, count] : counts) {
|
|
if (count >= minSupport) {
|
|
std::vector<int> itemset = {item};
|
|
prevFrequentItemsets[1].insert(itemset);
|
|
double support = (double)count / (double)transactionCount;
|
|
this->frequentItemsets.emplace_back(itemset, support);
|
|
}
|
|
}
|
|
k = 2;
|
|
} else {
|
|
// Generate the frequent 1-Itemsets and 2-Itemsets.
|
|
// Count how many transactions contain any given item and any given item
|
|
// pair.
|
|
TriangularMatrix triangularMatrix;
|
|
std::unique_ptr<GenericRelationIterator> rit(relation->MakeScan());
|
|
Tuple *t;
|
|
std::unordered_map<int, int> counts;
|
|
while ((t = rit->GetNextTuple()) != nullptr) {
|
|
auto transaction = (collection::IntSet *)t->GetAttribute(itemsetAttr);
|
|
for (int i = 0; i < (int)transaction->getSize(); i += 1) {
|
|
counts[transaction->get(i)] += 1;
|
|
// Insert all 2-itemsets that are part of the transaction into the
|
|
// triangular matrix.
|
|
for (size_t j = i + 1; j < transaction->getSize(); j += 1) {
|
|
triangularMatrix.insert(transaction->get(i), transaction->get(j));
|
|
}
|
|
}
|
|
t->DeleteIfAllowed();
|
|
}
|
|
// Add any item as an 1-itemset to the frequent itemsets if it satisfies
|
|
// the minimum support.
|
|
for (auto const &[item, count] : counts) {
|
|
if (count >= minSupport) {
|
|
std::vector<int> itemset = {item};
|
|
prevFrequentItemsets[1].insert(itemset);
|
|
double support = (double)count / (double)transactionCount;
|
|
this->frequentItemsets.emplace_back(itemset, support);
|
|
}
|
|
}
|
|
// Find the frequent 2-itemsets by pairing up frequent items and checking
|
|
// their support by consulting the triangular matrix.
|
|
prevFrequentItemsets.resize(3);
|
|
auto cend = prevFrequentItemsets[1].cend();
|
|
for (auto it1 = prevFrequentItemsets[1].cbegin(); it1 != cend; it1++) {
|
|
for (auto it2 = std::next(it1); it2 != cend; it2++) {
|
|
int item1 = (*it1)[0];
|
|
int item2 = (*it2)[0];
|
|
if (triangularMatrix.count(item1, item2) >= minSupport) {
|
|
double support = (double)triangularMatrix.count(item1, item2) /
|
|
(double)transactionCount;
|
|
std::vector<int> itemset(
|
|
{std::min(item1, item2), std::max(item1, item2)});
|
|
prevFrequentItemsets[2].insert(itemset);
|
|
this->frequentItemsets.emplace_back(itemset, support);
|
|
}
|
|
}
|
|
}
|
|
k = 3;
|
|
}
|
|
|
|
if ((deoptimize & Deoptimize::NoHashTree) &&
|
|
(deoptimize & Deoptimize::NoTransactionBitmap)) {
|
|
for (int size = k; !prevFrequentItemsets[size - 1].empty(); size += 1) {
|
|
auto candidates = genCandidates<std::vector<std::vector<int>>>(
|
|
prevFrequentItemsets[size - 1], deoptimize & Deoptimize::NoPruning);
|
|
if (candidates.empty()) {
|
|
break;
|
|
}
|
|
// Count how many transactions contain any given candidate.
|
|
std::unique_ptr<GenericRelationIterator> rit(relation->MakeScan());
|
|
Tuple *t;
|
|
std::map<std::vector<int>, int> counts;
|
|
while ((t = rit->GetNextTuple()) != nullptr) {
|
|
auto transaction = (collection::IntSet *)t->GetAttribute(itemsetAttr);
|
|
for (const std::vector<int> &candidate : candidates) {
|
|
bool containsCandidate = true;
|
|
for (int item : candidate) {
|
|
// Check if the item is contained by binary search.
|
|
if (!transaction->contains(item)) {
|
|
containsCandidate = false;
|
|
break;
|
|
}
|
|
}
|
|
if (containsCandidate) {
|
|
counts[candidate] += 1;
|
|
}
|
|
}
|
|
t->DeleteIfAllowed();
|
|
}
|
|
// Add any candidate to the frequent itemsets if it satisfies the
|
|
// minimum support.
|
|
prevFrequentItemsets.resize(size + 1);
|
|
for (auto const &[itemset, count] : counts) {
|
|
if (count >= minSupport) {
|
|
prevFrequentItemsets[size].insert(itemset);
|
|
double support = (double)count / (double)transactionCount;
|
|
this->frequentItemsets.emplace_back(itemset, support);
|
|
}
|
|
}
|
|
}
|
|
} else if ((deoptimize & Deoptimize::NoHashTree) &&
|
|
!(deoptimize & Deoptimize::NoTransactionBitmap)) {
|
|
ItemsetBitmap transactionBitmap;
|
|
for (int size = k; !prevFrequentItemsets[size - 1].empty(); size += 1) {
|
|
auto candidates = genCandidates<std::vector<std::vector<int>>>(
|
|
prevFrequentItemsets[size - 1], deoptimize & Deoptimize::NoPruning);
|
|
if (candidates.empty()) {
|
|
break;
|
|
}
|
|
// Count how many transactions contain any given candidate.
|
|
std::unique_ptr<GenericRelationIterator> rit(relation->MakeScan());
|
|
Tuple *t;
|
|
std::map<std::vector<int>, int> counts;
|
|
while ((t = rit->GetNextTuple()) != nullptr) {
|
|
auto transaction = (collection::IntSet *)t->GetAttribute(itemsetAttr);
|
|
transactionBitmap.reset();
|
|
for (size_t i = 0; i < transaction->getSize(); i += 1) {
|
|
transactionBitmap.insert(transaction->get(i));
|
|
}
|
|
for (const std::vector<int> &candidate : candidates) {
|
|
bool containsCandidate = true;
|
|
for (int item : candidate) {
|
|
// Check if the item is contained in the transaction by a bitmap
|
|
// lookup.
|
|
if (!transactionBitmap.contains(item)) {
|
|
containsCandidate = false;
|
|
break;
|
|
}
|
|
}
|
|
if (containsCandidate) {
|
|
counts[candidate] += 1;
|
|
}
|
|
}
|
|
t->DeleteIfAllowed();
|
|
}
|
|
// Add any candidate to the frequent itemsets if it satisfies the
|
|
// minimum support.
|
|
prevFrequentItemsets.resize(size + 1);
|
|
for (auto const &[itemset, count] : counts) {
|
|
if (count >= minSupport) {
|
|
prevFrequentItemsets[size].insert(itemset);
|
|
double support = (double)count / (double)transactionCount;
|
|
this->frequentItemsets.emplace_back(itemset, support);
|
|
}
|
|
}
|
|
}
|
|
} else if (!(deoptimize & Deoptimize::NoHashTree) &&
|
|
(deoptimize & Deoptimize::NoTransactionBitmap)) {
|
|
for (int size = k; !prevFrequentItemsets[size - 1].empty(); size += 1) {
|
|
auto candidates = genCandidates<ItemsetHashTree>(
|
|
prevFrequentItemsets[size - 1], deoptimize & Deoptimize::NoPruning);
|
|
if (candidates.empty()) {
|
|
break;
|
|
}
|
|
// Count how many transactions contain any given candidate.
|
|
std::unique_ptr<GenericRelationIterator> rit(relation->MakeScan());
|
|
Tuple *t;
|
|
std::map<std::vector<int>, int> counts;
|
|
while ((t = rit->GetNextTuple()) != nullptr) {
|
|
auto transaction = (collection::IntSet *)t->GetAttribute(itemsetAttr);
|
|
for (const std::vector<int> &candidate :
|
|
candidates.subset(transaction)) {
|
|
bool containsCandidate = true;
|
|
for (int item : candidate) {
|
|
// Check if the item is contained by binary search.
|
|
if (!transaction->contains(item)) {
|
|
containsCandidate = false;
|
|
break;
|
|
}
|
|
}
|
|
if (containsCandidate) {
|
|
counts[candidate] += 1;
|
|
}
|
|
}
|
|
t->DeleteIfAllowed();
|
|
}
|
|
// Add any candidate to the frequent itemsets if it satisfies the
|
|
// minimum support.
|
|
prevFrequentItemsets.resize(size + 1);
|
|
for (auto const &[itemset, count] : counts) {
|
|
if (count >= minSupport) {
|
|
prevFrequentItemsets[size].insert(itemset);
|
|
double support = (double)count / (double)transactionCount;
|
|
this->frequentItemsets.emplace_back(itemset, support);
|
|
}
|
|
}
|
|
}
|
|
} else {
|
|
ItemsetBitmap transactionBitmap;
|
|
for (int size = k; !prevFrequentItemsets[size - 1].empty(); size += 1) {
|
|
auto candidates = genCandidates<ItemsetHashTree>(
|
|
prevFrequentItemsets[size - 1], deoptimize & Deoptimize::NoPruning);
|
|
if (candidates.empty()) {
|
|
break;
|
|
}
|
|
// Count how many transactions contain any given candidate.
|
|
std::unique_ptr<GenericRelationIterator> rit(relation->MakeScan());
|
|
Tuple *t;
|
|
std::map<std::vector<int>, int> counts;
|
|
while ((t = rit->GetNextTuple()) != nullptr) {
|
|
auto transaction = (collection::IntSet *)t->GetAttribute(itemsetAttr);
|
|
if (!(deoptimize & Deoptimize::NoTransactionBitmap)) {
|
|
transactionBitmap.reset();
|
|
for (size_t i = 0; i < transaction->getSize(); i += 1) {
|
|
transactionBitmap.insert(transaction->get(i));
|
|
}
|
|
}
|
|
for (const std::vector<int> &candidate :
|
|
candidates.subset(transaction)) {
|
|
bool containsCandidate = true;
|
|
for (int item : candidate) {
|
|
// Check if the item is contained in the transaction by a bitmap
|
|
// lookup.
|
|
if (!transactionBitmap.contains(item)) {
|
|
containsCandidate = false;
|
|
break;
|
|
}
|
|
}
|
|
if (containsCandidate) {
|
|
counts[candidate] += 1;
|
|
}
|
|
}
|
|
t->DeleteIfAllowed();
|
|
}
|
|
// Add any candidate to the frequent itemsets if it satisfies the
|
|
// minimum support.
|
|
prevFrequentItemsets.resize(size + 1);
|
|
for (auto const &[itemset, count] : counts) {
|
|
if (count >= minSupport) {
|
|
prevFrequentItemsets[size].insert(itemset);
|
|
double support = (double)count / (double)transactionCount;
|
|
this->frequentItemsets.emplace_back(itemset, support);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
this->it = this->frequentItemsets.cbegin();
|
|
|
|
// Setup resulting tuple type.
|
|
this->tupleType = new TupleType(
|
|
SecondoSystem::GetCatalog()->NumericType(frequentItemsetTupleType()));
|
|
}
|
|
|
|
// Returns the next frequent itemset as a tuple.
|
|
Tuple *aprioriLI::getNext() {
|
|
if (this->it != this->frequentItemsets.cend()) {
|
|
auto &[itemset, support] = *this->it;
|
|
auto tuple = new Tuple(this->tupleType);
|
|
tuple->PutAttribute(0, new collection::IntSet(std::set<int>(
|
|
itemset.cbegin(), itemset.cend())));
|
|
tuple->PutAttribute(1, new CcReal(support));
|
|
this->it++;
|
|
return tuple;
|
|
} else {
|
|
return nullptr;
|
|
}
|
|
}
|
|
} // namespace AssociationAnalysis
|