1063 lines
34 KiB
C++
1063 lines
34 KiB
C++
/*
|
|
----
|
|
This file is part of SECONDO.
|
|
|
|
Copyright (C) 2021, University in Hagen, Department of Computer Science,
|
|
Database Systems for New Applications.
|
|
|
|
SECONDO is free software; you can redistribute it and/or modify
|
|
it under the terms of the GNU General Public License as published by
|
|
the Free Software Foundation; either version 2 of the License, or
|
|
(at your option) any later version.
|
|
|
|
SECONDO is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
GNU General Public License for more details.
|
|
|
|
You should have received a copy of the GNU General Public License
|
|
along with SECONDO; if not, write to the Free Software
|
|
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
----
|
|
|
|
//paragraph [1] Title: [{\Large \bf \begin {center}] [\end {center}}]
|
|
|
|
[1] Association Analysis Algebra Implementation
|
|
|
|
January 2021 - April 2021, P. Fedorow for bachelor thesis.
|
|
|
|
*/
|
|
|
|
#include "FPGrowth.h"
|
|
|
|
#include "Common.h"
|
|
|
|
#include "StandardTypes.h"
|
|
|
|
#include <algorithm>
|
|
#include <cmath>
|
|
#include <optional>
|
|
#include <unordered_map>
|
|
#include <vector>
|
|
|
|
namespace AssociationAnalysis {
|
|
// Implementation of an FP-Tree. It is used to efficiently mine frequent
|
|
// itemsets.
|
|
class FPTreeInMemory : public FPTreeImpl<FPTreeInMemory, std::size_t> {
|
|
public:
|
|
explicit FPTreeInMemory(int transactionCount)
|
|
: _transactionCount(transactionCount), nodes(1) {}
|
|
|
|
int transactionCount() { return _transactionCount; }
|
|
|
|
private:
|
|
// Represents a node of the FP-Tree.
|
|
struct Node {
|
|
// The item that the node is holding.
|
|
int item;
|
|
|
|
// The support count of the itemset that is built of the path from to this
|
|
// node.
|
|
int count;
|
|
|
|
// Children indexes.
|
|
std::vector<std::size_t> children;
|
|
|
|
// Parent index.
|
|
std::size_t parent;
|
|
|
|
// Index to the next node that holds the same item.
|
|
std::size_t link;
|
|
};
|
|
|
|
// Represents a row in the header table.
|
|
struct Header {
|
|
int item;
|
|
|
|
// Index to the first node that holds the item.
|
|
std::size_t link;
|
|
};
|
|
|
|
// The transaction count with which this FP-Tree was created. This number to
|
|
// determine the support of a given itemset.
|
|
int _transactionCount;
|
|
|
|
// Nodes are stored in a vector and point to each other by using indexes into
|
|
// the same vector.
|
|
std::vector<Node> nodes;
|
|
|
|
// A header table that is used to find nodes that hold a specific item.
|
|
std::vector<Header> headerTable;
|
|
|
|
// Returns the header that contains the given item. If a header with the given
|
|
// item was not found a new header for this item is created and returned.
|
|
Header &header(int item) {
|
|
for (auto &header : this->headerTable) {
|
|
if (header.item == item) {
|
|
return header;
|
|
}
|
|
}
|
|
this->headerTable.push_back({.item = item});
|
|
return this->headerTable[this->headerTable.size() - 1];
|
|
}
|
|
|
|
// Returns handle of the root node.
|
|
std::size_t root() { return 0; }
|
|
|
|
// Returns handle of the child node with the given item.
|
|
std::optional<std::size_t> findChild(std::size_t node, int item) {
|
|
assert(node < this->nodes.size());
|
|
for (std::size_t child : this->nodes[node].children) {
|
|
if (this->nodes[child].item == item) {
|
|
return child;
|
|
}
|
|
}
|
|
return std::nullopt;
|
|
}
|
|
|
|
// Adds the given count to the given node.
|
|
void addCount(std::size_t node, int count) {
|
|
assert(node < this->nodes.size());
|
|
this->nodes[node].count += count;
|
|
}
|
|
|
|
// Creates a new child with the given item and count and returns its handle.
|
|
std::size_t createChild(std::size_t node, int item, int count) {
|
|
assert(node < this->nodes.size());
|
|
// Create a new node.
|
|
std::size_t child = this->nodes.size();
|
|
Node childNode;
|
|
childNode.item = item;
|
|
childNode.count = count;
|
|
childNode.parent = node;
|
|
this->nodes.push_back(childNode);
|
|
// this->nodes.push_back({.item = item, .count = count, .parent = node});
|
|
|
|
// Update the header table.
|
|
Header &header = this->header(item);
|
|
this->nodes[child].link = header.link;
|
|
header.link = child;
|
|
|
|
// Append the child on the given node.
|
|
this->nodes[node].children.push_back(child);
|
|
|
|
return child;
|
|
}
|
|
|
|
// Returns the number of entries in the header table.
|
|
std::size_t headerTableSize() { return this->headerTable.size(); }
|
|
|
|
// Looks up the link for the given item in the header table.
|
|
std::size_t headerLinkByItem(int item) {
|
|
std::size_t link = this->header(item).link;
|
|
assert(link != 0);
|
|
return link;
|
|
}
|
|
|
|
// Returns the item of the entry in the header table with the given index.
|
|
int headerItem(std::size_t index) {
|
|
assert(index < this->headerTable.size());
|
|
return this->headerTable[index].item;
|
|
}
|
|
|
|
// Returns the link handle of the entry in the header table with the given
|
|
// index.
|
|
std::size_t headerLink(std::size_t index) {
|
|
assert(index < this->headerTable.size());
|
|
return this->headerTable[index].link;
|
|
}
|
|
|
|
// Returns the item of the given node.
|
|
int nodeItem(std::size_t node) {
|
|
assert(node < this->nodes.size());
|
|
return this->nodes[node].item;
|
|
}
|
|
|
|
// Returns the count of the given node.
|
|
int nodeCount(std::size_t node) {
|
|
assert(node < this->nodes.size());
|
|
return this->nodes[node].count;
|
|
}
|
|
|
|
// Returns the link handle of the given node.
|
|
std::optional<std::size_t> nodeLink(std::size_t node) {
|
|
assert(node < this->nodes.size());
|
|
return this->nodes[node].link == 0
|
|
? std::nullopt
|
|
: std::make_optional(this->nodes[node].link);
|
|
}
|
|
|
|
// Returns the parent handle of the given node.
|
|
std::optional<std::size_t> nodeParent(std::size_t node) {
|
|
assert(node < this->nodes.size());
|
|
if (node == 0) {
|
|
return std::nullopt;
|
|
} else {
|
|
assert(node < this->nodes.size());
|
|
return this->nodes[node].parent == 0
|
|
? std::nullopt
|
|
: std::make_optional(this->nodes[node].parent);
|
|
}
|
|
}
|
|
|
|
// Returns the child handles of the given node.
|
|
std::vector<std::size_t> nodeChildren(std::size_t node) {
|
|
assert(node < this->nodes.size());
|
|
return this->nodes[node].children;
|
|
}
|
|
|
|
// FPTreeImpl needs access to the private FP-Tree access/manipulation methods:
|
|
// root, addCount, createChild, etc.
|
|
friend class FPTreeImpl<FPTreeInMemory, std::size_t>;
|
|
};
|
|
|
|
// Finds all frequent itemsets that satisfy the support given by minSupport.
|
|
// The itemset of a transaction is extracted from each tuple of the relation
|
|
// by an index given by itemsetAttr.
|
|
fpGrowthLI::fpGrowthLI(GenericRelation *relation, int minSupport,
|
|
int itemsetAttr, int deoptimize) {
|
|
int transactionCount = relation->GetNoTuples();
|
|
|
|
std::vector<int> frequentItems;
|
|
|
|
// Scan the database to find all frequent items.
|
|
{
|
|
// Mapping from an item to its count.
|
|
std::unordered_map<int, int> counts;
|
|
|
|
// Database scan.
|
|
std::unique_ptr<GenericRelationIterator> rit(relation->MakeScan());
|
|
Tuple *t;
|
|
while ((t = rit->GetNextTuple()) != nullptr) {
|
|
auto transaction = (collection::IntSet *)t->GetAttribute(itemsetAttr);
|
|
for (std::size_t i = 0; i < transaction->getSize(); i += 1) {
|
|
counts[transaction->get(i)] += 1;
|
|
}
|
|
t->DeleteIfAllowed();
|
|
}
|
|
|
|
// Find the frequent items and sort them descendingly by their support count
|
|
// to reduce the size of the FP-Tree by using the most common prefixes.
|
|
std::vector<std::pair<int, int>> frequentItemSupportPairs;
|
|
for (auto const &[item, count] : counts) {
|
|
if (count >= minSupport) {
|
|
frequentItemSupportPairs.emplace_back(item, count);
|
|
}
|
|
}
|
|
std::sort(frequentItemSupportPairs.begin(), frequentItemSupportPairs.end(),
|
|
[](auto &a, auto &b) -> bool { return a.second > b.second; });
|
|
|
|
frequentItems.reserve(frequentItemSupportPairs.size());
|
|
for (const auto supportPair : frequentItemSupportPairs) {
|
|
frequentItems.push_back(supportPair.first);
|
|
}
|
|
}
|
|
|
|
// Scan database to create the FP-Tree and mine the frequent itemsets.
|
|
{
|
|
FPTreeInMemory fpTree(transactionCount);
|
|
|
|
// Database scan.
|
|
std::unique_ptr<GenericRelationIterator> rit(relation->MakeScan());
|
|
Tuple *t;
|
|
while ((t = rit->GetNextTuple()) != nullptr) {
|
|
// Create an itemset out of the frequent items that
|
|
auto transaction = (collection::IntSet *)t->GetAttribute(itemsetAttr);
|
|
std::vector<int> itemset;
|
|
for (int item : frequentItems) {
|
|
if (transaction->contains(item)) {
|
|
itemset.push_back(item);
|
|
}
|
|
}
|
|
fpTree.insert(itemset);
|
|
t->DeleteIfAllowed();
|
|
}
|
|
|
|
fpTree.mine(this->frequentItemsets, minSupport, deoptimize);
|
|
}
|
|
|
|
// Setup iterator for the result stream.
|
|
this->it = this->frequentItemsets.cbegin();
|
|
|
|
// Setup resulting tuple type.
|
|
this->tupleType = new TupleType(
|
|
SecondoSystem::GetCatalog()->NumericType(frequentItemsetTupleType()));
|
|
}
|
|
|
|
// Returns the next frequent itemset as a tuple.
|
|
Tuple *fpGrowthLI::getNext() {
|
|
if (this->it != this->frequentItemsets.cend()) {
|
|
auto &[itemset, support] = *this->it;
|
|
auto tuple = new Tuple(this->tupleType);
|
|
tuple->PutAttribute(0, new collection::IntSet(std::set<int>(
|
|
itemset.cbegin(), itemset.cend())));
|
|
tuple->PutAttribute(1, new CcReal(support));
|
|
this->it++;
|
|
return tuple;
|
|
} else {
|
|
return nullptr;
|
|
}
|
|
}
|
|
|
|
/*
|
|
FPTree data type implementation
|
|
|
|
*/
|
|
|
|
// Default constructor: creates a new node file and a new header file and
// stores a value-initialized root node as the first node record. The
// transaction count starts at 0.
//
// NOTE(review): sizeof(Node)/sizeof(Header) are passed as the second file
// constructor argument, whereas In() and Clone() pass the boolean `true` to
// SmiHashFile in that position - confirm which constructor is intended.
FPTreeT::FPTreeT()
    : nodeFile(SmiKey::KeyDataType::Integer, sizeof(Node), false),
      nextNodeId(0),
      headerFile(SmiKey::KeyDataType::Integer, sizeof(Header), false),
      nextHeaderId(0),
      _transactionCount(0) {
  nodeFile.Create();
  headerFile.Create();
  Node::create(nodeFile, nextNodeId, Node{});
}
|
|
|
|
// Attaches to an already existing FP-Tree: opens the node and header files
// with the given file ids and takes over the given id counters and
// transaction count. No root node is created.
FPTreeT::FPTreeT(SmiFileId nodeFileId, SmiRecordId nextNodeId,
                 SmiFileId headerFileId, SmiRecordId nextHeaderId,
                 int transactionCount)
    : nodeFile(SmiKey::KeyDataType::Integer, sizeof(Node), false),
      nextNodeId(nextNodeId),
      headerFile(SmiKey::KeyDataType::Integer, sizeof(Header), false),
      nextHeaderId(nextHeaderId),
      _transactionCount(transactionCount) {
  nodeFile.Open(nodeFileId);
  headerFile.Open(headerFileId);
}
|
|
|
|
// Creates an empty FP-Tree for the given transaction count: new node and
// header files are created and a value-initialized root node is stored as the
// first node record. Unlike the other constructors, the third file
// constructor argument is `true` here.
FPTreeT::FPTreeT(int transactionCount)
    : nodeFile(SmiKey::KeyDataType::Integer, sizeof(Node), true),
      nextNodeId(0),
      headerFile(SmiKey::KeyDataType::Integer, sizeof(Header), true),
      nextHeaderId(0),
      _transactionCount(transactionCount) {
  nodeFile.Create();
  headerFile.Create();
  Node::create(nodeFile, nextNodeId, Node{});
}
|
|
|
|
// Name of the fptree type as used in type expressions.
std::string FPTreeT::BasicType() {
  return std::string("fptree");
}
|
|
|
|
// Serializes an FP-Tree into the nested list
//   (transactionCount headers nodes)
// where every header is (item link left right) and every node is
// (item count child left right parent link). Record ids are written as ints.
ListExpr FPTreeT::Out(ListExpr typeInfo, Word w) {
  auto *tree = static_cast<FPTreeT *>(w.addr);

  // Header table, in record id order.
  NList headerList;
  for (SmiRecordId id = 0; id < tree->nextHeaderId; id += 1) {
    const Header row = Header::read(tree->headerFile, id);
    headerList.append(NList(NList().intAtom(row.item),
                            NList().intAtom(static_cast<int>(row.link)),
                            NList().intAtom(static_cast<int>(row.left)),
                            NList().intAtom(static_cast<int>(row.right))));
  }

  // Nodes, in record id order.
  NList nodeList;
  for (SmiRecordId id = 0; id < tree->nextNodeId; id += 1) {
    const Node current = Node::read(tree->nodeFile, id);
    NList fields;
    fields.append(NList().intAtom(current.item));
    fields.append(NList().intAtom(current.count));
    fields.append(NList().intAtom(static_cast<int>(current.child)));
    fields.append(NList().intAtom(static_cast<int>(current.left)));
    fields.append(NList().intAtom(static_cast<int>(current.right)));
    fields.append(NList().intAtom(static_cast<int>(current.parent)));
    fields.append(NList().intAtom(static_cast<int>(current.link)));
    nodeList.append(fields);
  }

  NList serialized(NList().intAtom(tree->_transactionCount), headerList,
                   nodeList);
  return serialized.listExpr();
}
|
|
|
|
// Builds an FP-Tree from its nested list representation
//   (transactionCount headers nodes)
// where every header is a list of 4 ints (item link left right) and every
// node is a list of 7 ints (item count child left right parent link).
//
// `correct` is set to true only when the whole list was accepted; otherwise
// nullptr is returned and every file created along the way is removed again.
// (Previously the header file was left behind when the node list was
// rejected.)
//
// NOTE(review): an empty header list is rejected, as in the original code;
// confirm whether a tree that consists of the root only must be readable.
Word FPTreeT::In(const ListExpr typeInfo, const ListExpr instance,
                 const int errorPos, ListExpr &errorInfo, bool &correct) {
  correct = false;

  NList in(instance);
  if (!in.isList() || in.length() != 3 || !in.first().isInt()) {
    return nullptr;
  }

  // Unserialize transactionCount.
  int transactionCount = in.first().intval();

  // Unserialize headers.
  SmiHashFile headerFile(SmiKey::KeyDataType::Integer, true, false);
  SmiRecordId nextHeaderId = 0;
  headerFile.Create();
  bool headersOk = in.second().isList() && !in.second().isEmpty();
  if (headersOk) {
    NList headers = in.second();
    for (std::size_t i = 1; headersOk && i <= headers.length(); i += 1) {
      NList row = headers.elem(i);
      headersOk = row.isList() && row.length() == 4 && row.first().isInt() &&
                  row.second().isInt() && row.third().isInt() &&
                  row.fourth().isInt();
      if (headersOk) {
        Header::create(headerFile, nextHeaderId,
                       Header{.item = row.first().intval(),
                              .link = (SmiRecordId)row.second().intval(),
                              .left = (SmiRecordId)row.third().intval(),
                              .right = (SmiRecordId)row.fourth().intval()});
      }
    }
  }
  headerFile.Close();
  if (!headersOk) {
    headerFile.Remove();
    return nullptr;
  }

  // Unserialize nodes.
  SmiHashFile nodeFile(SmiKey::KeyDataType::Integer, true, false);
  SmiRecordId nextNodeId = 0;
  nodeFile.Create();
  bool nodesOk = in.third().isList() && !in.third().isEmpty();
  if (nodesOk) {
    NList nodes = in.third();
    for (std::size_t i = 1; nodesOk && i <= nodes.length(); i += 1) {
      NList row = nodes.elem(i);
      nodesOk = row.isList() && row.length() == 7 && row.first().isInt() &&
                row.second().isInt() && row.third().isInt() &&
                row.fourth().isInt() && row.fifth().isInt() &&
                row.sixth().isInt() && row.seventh().isInt();
      if (nodesOk) {
        Node::create(nodeFile, nextNodeId,
                     Node{.item = row.first().intval(),
                          .count = row.second().intval(),
                          .child = (SmiRecordId)row.third().intval(),
                          .left = (SmiRecordId)row.fourth().intval(),
                          .right = (SmiRecordId)row.fifth().intval(),
                          .parent = (SmiRecordId)row.sixth().intval(),
                          .link = (SmiRecordId)row.seventh().intval()});
      }
    }
  }
  nodeFile.Close();
  if (!nodesOk) {
    nodeFile.Remove();
    // The header file was already created (and closed) above; remove it as
    // well so that a rejected value leaves no file behind.
    headerFile.Remove();
    return nullptr;
  }

  // Create FP-tree on top of the two files.
  correct = true;
  return new FPTreeT(nodeFile.GetFileId(), nextNodeId, headerFile.GetFileId(),
                     nextHeaderId, transactionCount);
}
|
|
|
|
// Creates a default-constructed FP-Tree and hands it over as a Word.
Word FPTreeT::Create(const ListExpr typeInfo) {
  FPTreeT *tree = new FPTreeT();
  return tree;
}
|
|
|
|
void FPTreeT::Delete(const ListExpr typeInfo, Word &w) {
|
|
auto fpTree = (FPTreeT *)w.addr;
|
|
if (fpTree->nodeFile.IsOpen()) {
|
|
fpTree->nodeFile.Close();
|
|
}
|
|
fpTree->nodeFile.Drop();
|
|
if (fpTree->headerFile.IsOpen()) {
|
|
fpTree->headerFile.Close();
|
|
}
|
|
fpTree->headerFile.Drop();
|
|
delete fpTree;
|
|
w.addr = nullptr;
|
|
}
|
|
|
|
// Restores an FP-Tree from valueRecord, starting at `offset`.
//
// The fields are read in the order Save() writes them: nodeFileId,
// nextNodeId, headerFileId, nextHeaderId, transactionCount. `offset` is
// advanced past every field that was read successfully. Returns false as soon
// as a field cannot be read completely; `value` is only set on success.
//
// The id counters are read as SmiRecordId - the type Save() uses to size
// them - instead of int, so both functions agree on the record layout.
bool FPTreeT::Open(SmiRecord &valueRecord, std::size_t &offset,
                   const ListExpr typeInfo, Word &value) {
  // Reads one fixed-size field at the current offset and steps over it.
  auto readField = [&valueRecord, &offset](auto &field) -> bool {
    if (valueRecord.Read(&field, sizeof(field), offset) != sizeof(field)) {
      return false;
    }
    offset += sizeof(field);
    return true;
  };

  SmiFileId nodeFileId{};
  SmiRecordId nextNodeId{};
  SmiFileId headerFileId{};
  SmiRecordId nextHeaderId{};
  int transactionCount = 0;

  if (!readField(nodeFileId) || !readField(nextNodeId) ||
      !readField(headerFileId) || !readField(nextHeaderId) ||
      !readField(transactionCount)) {
    return false;
  }

  value.addr = new FPTreeT(nodeFileId, nextNodeId, headerFileId, nextHeaderId,
                           transactionCount);
  return true;
}
|
|
|
|
// Persists the FP-Tree's bookkeeping data into valueRecord, starting at
// `offset`.
//
// The fields are written in this order: nodeFileId, nextNodeId, headerFileId,
// nextHeaderId, transactionCount. `offset` is advanced past every field that
// was written successfully. Returns false as soon as a field cannot be
// written completely.
//
// The incoming offset is honoured (it used to be reset to 0 here), so that
// Save() writes to the same position Open() reads from.
bool FPTreeT::Save(SmiRecord &valueRecord, std::size_t &offset,
                   const ListExpr typeInfo, Word &w) {
  auto *fpTree = static_cast<FPTreeT *>(w.addr);

  // Writes one fixed-size field at the current offset and steps over it.
  auto writeField = [&valueRecord, &offset](const auto &field) -> bool {
    if (valueRecord.Write(&field, sizeof(field), offset) != sizeof(field)) {
      return false;
    }
    offset += sizeof(field);
    return true;
  };

  const SmiFileId nodeFileId = fpTree->nodeFile.GetFileId();
  const SmiFileId headerFileId = fpTree->headerFile.GetFileId();

  return writeField(nodeFileId) && writeField(fpTree->nextNodeId) &&
         writeField(headerFileId) && writeField(fpTree->nextHeaderId) &&
         writeField(fpTree->_transactionCount);
}
|
|
|
|
void FPTreeT::Close(const ListExpr typeInfo, Word &w) {
|
|
delete (FPTreeT *)w.addr;
|
|
w.addr = nullptr;
|
|
}
|
|
|
|
// Creates a copy of the given FP-Tree: every node and header record is read
// from the source files (in record id order) and written into newly created
// files, on top of which the clone is constructed. The transaction count is
// taken over from the source.
Word FPTreeT::Clone(const ListExpr typeInfo, const Word &w) {
  auto *source = static_cast<FPTreeT *>(w.addr);

  // Copy nodes.
  SmiHashFile nodeFile(SmiKey::KeyDataType::Integer, true, false);
  nodeFile.Create();
  SmiRecordId nextNodeId = 0;
  for (SmiRecordId id = 0; id < source->nextNodeId; id += 1) {
    Node::create(nodeFile, nextNodeId, Node::read(source->nodeFile, id));
  }
  nodeFile.Close();

  // Copy headers.
  SmiHashFile headerFile(SmiKey::KeyDataType::Integer, true, false);
  headerFile.Create();
  SmiRecordId nextHeaderId = 0;
  for (SmiRecordId id = 0; id < source->nextHeaderId; id += 1) {
    Header::create(headerFile, nextHeaderId,
                   Header::read(source->headerFile, id));
  }
  headerFile.Close();

  // Create the clone on top of the copied files.
  return new FPTreeT(nodeFile.GetFileId(), nextNodeId, headerFile.GetFileId(),
                     nextHeaderId, source->_transactionCount);
}
|
|
|
|
// Constructs an FPTreeT in place at `addr` using the default constructor.
// NOTE(review): the default constructor defined in this file creates new
// node/header files - confirm that this side effect is intended for a cast.
void *FPTreeT::Cast(void *addr) {
  return new (addr) FPTreeT;
}
|
|
|
|
// Size of an FPTreeT object in bytes.
int FPTreeT::SizeOf() {
  return sizeof(FPTreeT);
}
|
|
|
|
// Accepts exactly the type expression that is the symbol BasicType().
bool FPTreeT::KindCheck(ListExpr type, ListExpr &errorInfo) {
  const std::string expected = BasicType();
  return listutils::isSymbol(type, expected);
}
|
|
|
|
void FPTreeT::reset(int transactionCount) {
|
|
if (this->nodeFile.IsOpen()) {
|
|
this->nodeFile.Truncate();
|
|
} else {
|
|
this->nodeFile.Create();
|
|
}
|
|
this->nextNodeId = 0;
|
|
if (this->headerFile.IsOpen()) {
|
|
this->headerFile.Truncate();
|
|
} else {
|
|
this->headerFile.Create();
|
|
}
|
|
this->nextHeaderId = 0;
|
|
this->_transactionCount = transactionCount;
|
|
Node::create(this->nodeFile, this->nextNodeId, {});
|
|
}
|
|
|
|
// Writes this header into the record as the consecutive fields
// item, link, left, right, starting at offset 0.
void FPTreeT::Header::write(SmiRecord &record) const {
  size_t cursor = 0;

  // Writes one field at the cursor and moves the cursor behind it.
  auto put = [&record, &cursor](const auto &field) {
    record.Write(&field, sizeof(field), cursor);
    cursor += sizeof(field);
  };

  put(item);
  put(link);
  put(left);
  put(right);
}
|
|
|
|
// Reads the header stored under `id` from the file. The fields item, link,
// left, right are read consecutively starting at offset 0 into a
// value-initialized Header; the results of the select/read calls are not
// checked.
FPTreeT::Header FPTreeT::Header::read(SmiHashFile &file, SmiRecordId id) {
  SmiRecord record;
  file.SelectRecord(id, record);

  Header result{};
  size_t cursor = 0;

  // Reads one field at the cursor and moves the cursor behind it.
  auto pull = [&record, &cursor](auto &field) {
    record.Read(&field, sizeof(field), cursor);
    cursor += sizeof(field);
  };

  pull(result.item);
  pull(result.link);
  pull(result.left);
  pull(result.right);

  record.Finish();
  return result;
}
|
|
|
|
// Overwrites the header stored under `id`: the record is selected for
// update, filled from `header` and finished.
void FPTreeT::Header::write(SmiHashFile &file, SmiRecordId id,
                            const Header &header) {
  SmiRecord target;
  file.SelectRecord(id, target, SmiFile::Update);
  header.write(target);
  target.Finish();
}
|
|
|
|
// Stores `header` as a new record. The record gets the current value of
// `nextId` as its id; `nextId` is incremented. Returns the id that was used.
SmiRecordId FPTreeT::Header::create(SmiHashFile &file, SmiRecordId &nextId,
                                    const Header &header) {
  const SmiRecordId assignedId = nextId;
  nextId += 1;

  SmiRecord fresh;
  file.InsertRecord(assignedId, fresh);
  header.write(fresh);
  fresh.Finish();

  return assignedId;
}
|
|
|
|
// Writes this node into the record as the consecutive fields
// item, count, child, left, right, parent, link, starting at offset 0.
void FPTreeT::Node::write(SmiRecord &record) const {
  size_t cursor = 0;

  // Writes one field at the cursor and moves the cursor behind it.
  auto put = [&record, &cursor](const auto &field) {
    record.Write(&field, sizeof(field), cursor);
    cursor += sizeof(field);
  };

  put(item);
  put(count);
  put(child);
  put(left);
  put(right);
  put(parent);
  put(link);
}
|
|
|
|
// Reads the node stored under `id` from the file. The fields item, count,
// child, left, right, parent, link are read consecutively starting at
// offset 0 into a value-initialized Node; the results of the select/read
// calls are not checked.
FPTreeT::Node FPTreeT::Node::read(SmiHashFile &file, SmiRecordId id) {
  SmiRecord record;
  file.SelectRecord(id, record);

  Node result{};
  size_t cursor = 0;

  // Reads one field at the cursor and moves the cursor behind it.
  auto pull = [&record, &cursor](auto &field) {
    record.Read(&field, sizeof(field), cursor);
    cursor += sizeof(field);
  };

  pull(result.item);
  pull(result.count);
  pull(result.child);
  pull(result.left);
  pull(result.right);
  pull(result.parent);
  pull(result.link);

  record.Finish();
  return result;
}
|
|
|
|
// Overwrites the node stored under `id`: the record is selected for update,
// filled from `node` and finished.
void FPTreeT::Node::write(SmiHashFile &file, SmiRecordId id, const Node &node) {
  SmiRecord target;
  file.SelectRecord(id, target, SmiFile::Update);
  node.write(target);
  target.Finish();
}
|
|
|
|
// Stores `node` as a new record. The record gets the current value of
// `nextId` as its id; `nextId` is incremented. Returns the id that was used.
SmiRecordId FPTreeT::Node::create(SmiHashFile &file, SmiRecordId &nextId,
                                  const Node &node) {
  const SmiRecordId assignedId = nextId;
  nextId += 1;

  SmiRecord fresh;
  file.InsertRecord(assignedId, fresh);
  node.write(fresh);
  fresh.Finish();

  return assignedId;
}
|
|
|
|
// Returns handle of the root node.
|
|
SmiRecordId FPTreeT::root() { return 0; }
|
|
|
|
// Returns handle of the root header.
|
|
SmiRecordId FPTreeT::headerRoot() { return 0; }
|
|
|
|
// Returns handle of the child node with the given item.
|
|
std::optional<SmiRecordId> FPTreeT::findChild(SmiRecordId nodeId, int item) {
|
|
Node node = Node::read(this->nodeFile, nodeId);
|
|
if (node.child == 0) {
|
|
return std::nullopt;
|
|
} else {
|
|
SmiRecordId ignore;
|
|
return binaryFind<Node>(this->nodeFile, node.child, item, ignore);
|
|
}
|
|
}
|
|
|
|
// Adds the given count to the given node.
|
|
void FPTreeT::addCount(SmiRecordId nodeId, int count) {
|
|
Node node = Node::read(this->nodeFile, nodeId);
|
|
node.count += count;
|
|
Node::write(this->nodeFile, nodeId, node);
|
|
}
|
|
|
|
// Creates a new child with the given item and count and returns its handle.
|
|
SmiRecordId FPTreeT::createChild(SmiRecordId nodeId, int item, int count) {
|
|
// Create child node.
|
|
Node child;
|
|
child.item = item;
|
|
child.count = count;
|
|
child.parent = nodeId;
|
|
|
|
// Node child{.item = item, .count = count, .parent = nodeId};
|
|
SmiRecordId childId = Node::create(this->nodeFile, this->nextNodeId, child);
|
|
|
|
// Find the entry for the given item in the header table.
|
|
if (this->nextHeaderId == 0) {
|
|
Header::create(this->headerFile, this->nextHeaderId,
|
|
{.item = item, .link = childId});
|
|
} else {
|
|
SmiRecordId lastVisitedHeaderId = 0;
|
|
std::optional<SmiRecordId> headerId = binaryFind<Header>(
|
|
this->headerFile, this->headerRoot(), item, lastVisitedHeaderId);
|
|
if (headerId) {
|
|
// Header entry for the given item already exists -> update the link to
|
|
// the new child node.
|
|
Header header = Header::read(this->headerFile, *headerId);
|
|
child.link = header.link;
|
|
Node::write(this->nodeFile, childId, child);
|
|
header.link = childId;
|
|
Header::write(this->headerFile, *headerId, header);
|
|
} else {
|
|
// Header entry for the given item does not exist yet -> create a new
|
|
// entry.
|
|
binaryInsert<Header>(this->headerFile, lastVisitedHeaderId, item,
|
|
Header::create(this->headerFile, this->nextHeaderId,
|
|
{.item = item, .link = childId}));
|
|
}
|
|
}
|
|
|
|
// Append the child node on the given node.
|
|
Node node = Node::read(this->nodeFile, nodeId);
|
|
if (node.child == 0) {
|
|
node.child = childId;
|
|
Node::write(this->nodeFile, nodeId, node);
|
|
} else {
|
|
binaryInsert<Node>(this->nodeFile, node.child, item, childId);
|
|
}
|
|
|
|
return childId;
|
|
}
|
|
|
|
// Returns the number of entries in the header table.
|
|
std::size_t FPTreeT::headerTableSize() {
|
|
return (std::size_t)this->nextHeaderId;
|
|
}
|
|
|
|
// Looks up the link for the given item in the header table.
|
|
SmiRecordId FPTreeT::headerLinkByItem(int item) {
|
|
SmiRecordId ignore = 0;
|
|
std::optional<SmiRecordId> headerId =
|
|
binaryFind<Header>(this->headerFile, this->headerRoot(), item, ignore);
|
|
if (headerId) {
|
|
return Header::read(this->headerFile, *headerId).link;
|
|
}
|
|
assert(false);
|
|
}
|
|
|
|
// Returns the item of the entry in the header table with the given handle.
|
|
int FPTreeT::headerItem(SmiRecordId id) {
|
|
return Header::read(this->headerFile, id).item;
|
|
}
|
|
|
|
// Returns the link handle of the entry in the header table with the given
|
|
// index.
|
|
SmiRecordId FPTreeT::headerLink(SmiRecordId id) {
|
|
return Header::read(this->headerFile, id).link;
|
|
}
|
|
|
|
// Returns the item of the given node.
|
|
int FPTreeT::nodeItem(SmiRecordId id) {
|
|
return Node::read(this->nodeFile, id).item;
|
|
}
|
|
|
|
// Returns the count of the given node.
|
|
int FPTreeT::nodeCount(SmiRecordId id) {
|
|
return Node::read(this->nodeFile, id).count;
|
|
}
|
|
|
|
// Returns the link handle of the given node.
|
|
std::optional<SmiRecordId> FPTreeT::nodeLink(SmiRecordId id) {
|
|
Node node = Node::read(this->nodeFile, id);
|
|
return node.link == 0 ? std::nullopt : std::make_optional(node.link);
|
|
}
|
|
|
|
// Returns the parent handle of the given node.
|
|
std::optional<SmiRecordId> FPTreeT::nodeParent(SmiRecordId id) {
|
|
Node node = Node::read(this->nodeFile, id);
|
|
if (node.parent == this->root()) {
|
|
return std::nullopt;
|
|
} else {
|
|
return std::make_optional(node.parent);
|
|
}
|
|
}
|
|
|
|
// Returns the child handles of the given node.
|
|
std::vector<SmiRecordId> FPTreeT::nodeChildren(SmiRecordId nodeId) {
|
|
std::vector<SmiRecordId> ids;
|
|
Node node = Node::read(this->nodeFile, nodeId);
|
|
if (node.child != 0) {
|
|
std::vector<SmiRecordId> visit = {node.child};
|
|
while (!visit.empty()) {
|
|
SmiRecordId id = visit.back();
|
|
visit.pop_back();
|
|
ids.push_back(id);
|
|
Node child = Node::read(this->nodeFile, id);
|
|
if (child.left != 0) {
|
|
visit.push_back(child.left);
|
|
}
|
|
if (child.right != 0) {
|
|
visit.push_back(child.right);
|
|
}
|
|
}
|
|
}
|
|
return ids;
|
|
}
|
|
|
|
struct fptreeInfo : ConstructorInfo {
|
|
fptreeInfo() : ConstructorInfo() {
|
|
this->name = FPTreeT::BasicType();
|
|
this->signature = "-> " + Kind::SIMPLE();
|
|
this->typeExample = FPTreeT::BasicType();
|
|
this->listRep = "(((<item> <link>)*) ((<item> <count> <child> "
|
|
"<nextChild> <parent> <link>)*))";
|
|
this->valueExample =
|
|
"(((1 1) (2 2)) ((0 0 1 0 0 0) (1 3 2 0 0 0) (2 3 0 0 0 0)))";
|
|
this->remarks =
|
|
"The first list represents the header table of the FP-Tree. The second "
|
|
"list are the nodes of the FP-Tree. All values are integers.";
|
|
}
|
|
};
|
|
|
|
// Wires the static FPTreeT functions into the function table of the fptree
// type constructor.
struct fptreeFunctions : ConstructorFunctions<FPTreeT> {
  fptreeFunctions() : ConstructorFunctions<FPTreeT>() {
    // Nested list conversion.
    this->in = FPTreeT::In;
    this->out = FPTreeT::Out;

    // Object lifecycle.
    this->create = FPTreeT::Create;
    this->deletion = FPTreeT::Delete;
    this->close = FPTreeT::Close;
    this->clone = FPTreeT::Clone;

    // Persistence.
    this->open = FPTreeT::Open;
    this->save = FPTreeT::Save;

    // Type support.
    this->cast = FPTreeT::Cast;
    this->sizeOf = FPTreeT::SizeOf;
    this->kindCheck = FPTreeT::KindCheck;
  }
};
|
|
|
|
// Type constructor instance of the fptree type, built from the info and
// function table structs above.
TypeConstructor fptreeTC{fptreeInfo(), fptreeFunctions()};
|
|
|
|
// Type mapping of createFpTree: delegates to mineTM with fptree as the
// result type.
ListExpr createFpTreeTM(ListExpr args) {
  const ListExpr resultType =
      NList().symbolAtom(FPTreeT::BasicType()).listExpr();
  return mineTM(args, resultType);
}
|
|
|
|
// Value mapping of createFpTree: builds an FP-Tree from the frequent items of
// the given relation and stores it in the operator's result storage.
//
// Arguments read here: args[0] relation, args[2] minimum support (int count
// or real fraction), args[4] index of the itemset attribute, args[5] flag
// telling whether the support is relative.
//
// NOTE(review): the previous code read args[4] both as the CcBool flag and as
// the CcInt attribute index, which cannot both be right. The flag is now read
// from args[5], assuming mineTM appends the values in the order
// (deoptimize default, attribute index, relativeSupport) - analogous to
// mineFpTreeTM, which appends relativeSupport last. Confirm against mineTM.
int createFpTreeVM(Word *args, Word &result, int message, Word &local,
                   Supplier s) {
  auto *relation = static_cast<GenericRelation *>(args[0].addr);
  const int transactionCount = relation->GetNoTuples();
  const int itemsetAttr = static_cast<CcInt *>(args[4].addr)->GetIntval();
  const bool relativeSupport =
      static_cast<CcBool *>(args[5].addr)->GetBoolval();

  // A relative support is converted into a support count (rounded up).
  int minSupport = 0;
  if (relativeSupport) {
    const double support = static_cast<CcReal *>(args[2].addr)->GetRealval();
    minSupport =
        static_cast<int>(std::ceil(support * (double)transactionCount));
  } else {
    minSupport = static_cast<CcInt *>(args[2].addr)->GetIntval();
  }

  // The tree lives in the result storage; start from an empty tree.
  result = qp->ResultStorage(s);
  auto *fpTree = static_cast<FPTreeT *>(result.addr);
  fpTree->reset(transactionCount);

  // Pass 1: tally every item of every transaction.
  std::unordered_map<int, int> occurrences;
  {
    std::unique_ptr<GenericRelationIterator> scan(relation->MakeScan());
    for (Tuple *tuple = scan->GetNextTuple(); tuple != nullptr;
         tuple = scan->GetNextTuple()) {
      auto *transaction =
          static_cast<collection::IntSet *>(tuple->GetAttribute(itemsetAttr));
      for (std::size_t pos = 0; pos < transaction->getSize(); pos += 1) {
        occurrences[transaction->get(pos)] += 1;
      }
      tuple->DeleteIfAllowed();
    }
  }

  // Keep the frequent items only and order them by descending support count,
  // so that the most common items form shared prefixes in the FP-Tree.
  std::vector<std::pair<int, int>> ranked;
  for (const auto &entry : occurrences) {
    if (entry.second >= minSupport) {
      ranked.emplace_back(entry.first, entry.second);
    }
  }
  std::sort(ranked.begin(), ranked.end(),
            [](const std::pair<int, int> &lhs,
               const std::pair<int, int> &rhs) {
              return lhs.second > rhs.second;
            });

  std::vector<int> frequentItems;
  frequentItems.reserve(ranked.size());
  for (const auto &entry : ranked) {
    frequentItems.push_back(entry.first);
  }

  // Pass 2: insert the frequent part of every transaction into the FP-Tree.
  {
    std::unique_ptr<GenericRelationIterator> scan(relation->MakeScan());
    for (Tuple *tuple = scan->GetNextTuple(); tuple != nullptr;
         tuple = scan->GetNextTuple()) {
      auto *transaction =
          static_cast<collection::IntSet *>(tuple->GetAttribute(itemsetAttr));

      // Collect the frequent items that occur in this transaction, in the
      // global support order established above.
      std::vector<int> itemset;
      for (int item : frequentItems) {
        if (transaction->contains(item)) {
          itemset.push_back(item);
        }
      }
      fpTree->insert(itemset);
      tuple->DeleteIfAllowed();
    }
  }

  return 0;
}
|
|
|
|
// Type mapping of mineFpTree.
//
// Accepted arguments (each given as a (type value) pair):
//   1. an fptree,
//   2. the minimum support: an int > 0 or a real in the interval (0, 1),
//   3. optional: an int.
//
// Via the append mechanism the value mapping additionally receives a default
// of 0 for the third argument when it was omitted, followed by a bool that
// tells whether the support was given as a real (relative support).
// The result type is a stream of frequent itemset tuples.
ListExpr mineFpTreeTM(ListExpr args) {
  const std::string supportError =
      "Argument number 2 must be of type int and > 0 "
      "or of type real and in the interval (0, 1).";

  NList type(args);
  if (type.length() != 2 && type.length() != 3) {
    return NList::typeError("2 or 3 arguments expected but " +
                            std::to_string(type.length()) + " received.");
  }

  // Argument 1: the FP-Tree.
  if (!type.elem(1).first().isSymbol(FPTreeT::BasicType())) {
    return NList::typeError("Argument number 1 must be of type fptree.");
  }

  // Argument 2: absolute (int) or relative (real) minimum support.
  bool relativeSupport = false;
  if (type.elem(2).first().isSymbol(CcInt::BasicType())) {
    if (type.elem(2).second().intval() <= 0) {
      return NList::typeError(supportError);
    }
  } else if (type.elem(2).first().isSymbol(CcReal::BasicType())) {
    if (type.elem(2).second().realval() <= 0.0 ||
        type.elem(2).second().realval() >= 1.0) {
      return NList::typeError(supportError);
    }
    relativeSupport = true;
  } else {
    return NList::typeError(supportError);
  }

  // Argument 3: optional int; a default is appended when it is missing.
  NList appendList;
  if (type.length() == 3) {
    if (!type.elem(3).first().isSymbol(CcInt::BasicType())) {
      return NList::typeError(
          "The optional argument number 3 must be of type int.");
    }
  } else {
    // Add default value via the append-functionality.
    appendList.append(NList().intAtom(0));
  }

  appendList.append(NList().boolAtom(relativeSupport));

  NList tupleType = NList(frequentItemsetTupleType());
  return NList(Symbols::APPEND(), appendList,
               NList().streamOf(tupleType).listExpr())
      .listExpr();
}
|
|
|
|
// Value mapping of mineFpTree (stream operator).
//
// OPEN    - mines the FP-Tree (args[0]) with the minimum support from args[1]
//           (int count, or real fraction when the flag in args[3] is set) and
//           the int from args[2]; the result is kept in the local info.
// REQUEST - yields the next frequent itemset tuple, or cancels the stream.
// CLOSE   - releases the local info.
int mineFpTreeVM(Word *args, Word &result, int message, Word &local,
                 Supplier s) {
  auto *li = static_cast<frequentItemsetStreamLI *>(local.addr);

  if (message == OPEN) {
    // Discard the state of a previous run, if any.
    delete li;

    auto *fpTree = static_cast<FPTreeT *>(args[0].addr);
    const int deoptimize = static_cast<CcInt *>(args[2].addr)->GetIntval();
    const bool relativeSupport =
        static_cast<CcBool *>(args[3].addr)->GetBoolval();

    // A relative support is converted into a support count (rounded up).
    int minSupport = 0;
    if (relativeSupport) {
      const double support =
          static_cast<CcReal *>(args[1].addr)->GetRealval();
      minSupport = static_cast<int>(
          std::ceil(support * (double)fpTree->transactionCount()));
    } else {
      minSupport = static_cast<CcInt *>(args[1].addr)->GetIntval();
    }

    std::vector<std::pair<std::vector<int>, double>> mined;
    fpTree->mine(mined, minSupport, deoptimize);
    local.addr = new frequentItemsetStreamLI(std::move(mined));
    return 0;
  }

  if (message == REQUEST) {
    result.addr = li ? li->getNext() : nullptr;
    return result.addr ? YIELD : CANCEL;
  }

  if (message == CLOSE) {
    delete li;
    local.addr = nullptr;
    return 0;
  }

  return 0;
}
|
|
} // namespace AssociationAnalysis
|