Files
secondo/Algebras/AssociationAnalysis/ImportExport.cpp
2026-01-23 17:03:45 +08:00

367 lines
12 KiB
C++

/*
----
This file is part of SECONDO.
Copyright (C) 2021, University in Hagen, Department of Computer Science,
Database Systems for New Applications.
SECONDO is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
SECONDO is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with SECONDO; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
----
//paragraph [1] Title: [{\Large \bf \begin {center}] [\end {center}}]
[1] Association Analysis Algebra Implementation
January 2021 - April 2021, P. Fedorow for bachelor thesis.
*/
#include "ImportExport.h"
#include "Common.h"
#include "Algebras/Collection/CollectionAlgebra.h"
#include "Algebras/Collection/IntSet.h"
#include "Algebras/FText/FTextAlgebra.h"
#include "StandardTypes.h"
#include <fstream>
#include <iostream>
#include <utility>
namespace AssociationAnalysis {
// Reads transactions from the text file at the given path. Every line of the
// file is one transaction whose items are integers separated by single
// spaces. Tokens that std::stoi cannot convert are skipped, and lines that
// yield no item at all do not produce a transaction. Transaction ids are
// assigned consecutively in file order, starting at 0. If the file cannot be
// opened the list of transactions stays empty.
loadTransactionsLI::loadTransactionsLI(std::string path) {
  std::fstream input(path, std::ios::in);
  if (input.is_open()) {
    int nextId = 0;
    for (std::string row; std::getline(input, row);) {
      std::istringstream tokens(row);
      Transaction current;
      for (std::string token; std::getline(tokens, token, ' ');) {
        try {
          current.itemset.insert(std::stoi(token));
        } catch (const std::exception &) {
          // The token is not convertible to int: ignore it and go on with
          // the next token of this line.
        }
      }
      if (current.itemset.empty()) {
        continue;
      }
      current.id = nextId++;
      this->transactions.push_back(current);
    }
  }
  // Iteration in getNext starts at the first loaded transaction.
  this->it = this->transactions.cbegin();
  // Setup resulting tuple type.
  this->tupleType = new TupleType(
      SecondoSystem::GetCatalog()->NumericType(transactionsTupleType()));
}
// Builds the tuple (id, itemset) for the transaction the internal iterator
// points to and advances the iterator. Returns nullptr once every loaded
// transaction has been handed out.
Tuple *loadTransactionsLI::getNext() {
  if (this->it == this->transactions.cend()) {
    return nullptr;
  }
  const auto &current = *this->it;
  Tuple *result = new Tuple(this->tupleType);
  result->PutAttribute(0, new CcInt(current.id));
  result->PutAttribute(1, new collection::IntSet(current.itemset));
  ++this->it;
  return result;
}
// Type mapping for the loadTransactions operator.
//
// Accepts exactly one argument whose first element is the symbol text (the
// path) and maps it to a stream of tuples of the type returned by
// transactionsTupleType().
ListExpr loadTransactionsTM(ListExpr args) {
  NList type(args);
  if (type.length() != 1) {
    return NList::typeError("1 argument expected but " +
                            std::to_string(type.length()) + " received.");
  }
  if (!type.elem(1).first().isSymbol(FText::BasicType())) {
    return NList::typeError("The path argument must be of type text.");
  }
  NList resultTuple(transactionsTupleType());
  return NList().streamOf(resultTuple).listExpr();
}
// Value mapping for the loadTransactions operator.
//
// OPEN    discards a previous local info (if any) and creates a new one from
//         the text argument holding the path.
// REQUEST hands out the next tuple of the local info; answers YIELD while a
//         tuple is available and CANCEL otherwise (also when there is no
//         local info).
// CLOSE   deletes the local info and resets local.addr.
// Any other message is ignored.
int loadTransactionsVM(Word *args, Word &result, int message, Word &local,
                       Supplier s) {
  auto *state = (loadTransactionsLI *)local.addr;
  if (message == OPEN) {
    delete state;
    std::string path = ((FText *)args[0].addr)->GetValue();
    local.addr = new loadTransactionsLI(path);
    return 0;
  }
  if (message == REQUEST) {
    if (state == nullptr) {
      result.addr = nullptr;
      return CANCEL;
    }
    result.addr = state->getNext();
    return result.addr ? YIELD : CANCEL;
  }
  if (message == CLOSE) {
    delete state;
    local.addr = nullptr;
    return 0;
  }
  return 0;
}
// Type mapping for the saveTransactions operator.
//
// Expects three arguments: a tuple stream, the name of an attribute of that
// stream and a text (read as the target path by the value mapping). The named
// attribute has to be of type intset, because the value mapping casts the
// attribute to collection::IntSet without any further check. On success the
// zero-based index of the attribute is appended to the arguments (the value
// mapping reads it as its fourth argument) and the result type is bool.
ListExpr saveTransactionsTM(ListExpr args) {
  NList type(args);
  NList attrs;
  if (type.length() != 3) {
    return NList::typeError("3 arguments expected but " +
                            std::to_string(type.length()) + " received.");
  }
  if (!type.elem(1).first().checkStreamTuple(attrs)) {
    return NList::typeError(
        "Argument number 1 must be of type stream(tuple(...)).");
  }
  if (!type.elem(2).isSymbol(1)) {
    return NList::typeError("Argument number 2 must name an attribute in the "
                            "tuple stream given as the first argument.");
  }
  if (!type.elem(3).first().isSymbol(FText::BasicType())) {
    return NList::typeError("Argument number 3 must be of type text.");
  }

  // Locate the named attribute (1-based position) and remember whether its
  // type is intset.
  std::string itemsetAttrName = type.elem(2).first().str();
  int itemsetAttr = -1;
  bool itemsetAttrIsIntSet = false;
  for (int i = 1; i <= (int)attrs.length(); i += 1) {
    NList attr = attrs.elem(i);
    if (attr.elem(1).isSymbol(itemsetAttrName)) {
      itemsetAttr = i;
      itemsetAttrIsIntSet =
          attr.elem(2).isSymbol(collection::IntSet::BasicType());
    }
  }
  if (itemsetAttr == -1) {
    return NList::typeError("Argument number 2 must name an attribute in the "
                            "tuple stream given as the first argument.");
  }
  // Reject attributes of any other type here; otherwise the value mapping
  // would reinterpret an arbitrary attribute as an intset.
  if (!itemsetAttrIsIntSet) {
    return NList::typeError("Attribute " + itemsetAttrName +
                            " is not an intset in the given tuple stream.");
  }
  return NList(Symbols::APPEND(), NList().intAtom(itemsetAttr - 1).enclose(),
               NList().symbolAtom(CcBool::BasicType()))
      .listExpr();
}
// Value mapping for the saveTransactions operator.
//
// Writes the itemset attribute of every tuple of the input stream to the file
// named by the text argument: one line per tuple, items separated by a single
// space. args[3] holds the zero-based index of the itemset attribute that the
// type mapping appended. The bool result is true if the file could be opened
// and no stream error was recorded while writing and closing it, and false
// otherwise. The input stream is only opened when the file could be opened.
int saveTransactionsVM(Word *args, Word &result, int message, Word &local,
                       Supplier s) {
  result = qp->ResultStorage(s);
  auto success = (CcBool *)result.addr;
  // The stream wrapper lives on the stack so it is released on every return
  // path (it used to be allocated with new and never deleted).
  Stream<Tuple> transactions(args[0]);
  std::string path = ((FText *)args[2].addr)->GetValue();
  int itemsetAttr = ((CcInt *)args[3].addr)->GetIntval();
  std::fstream file;
  file.open(path, std::ios::out);
  if (!file.is_open()) {
    success->Set(true, false);
    return 0;
  }
  transactions.open();
  Tuple *t;
  while ((t = transactions.request())) {
    auto itemset = (collection::IntSet *)t->GetAttribute(itemsetAttr);
    const std::size_t size = itemset->getSize();
    for (std::size_t i = 0; i < size; i += 1) {
      file << itemset->get(i);
      if (i + 1 != size) {
        file << ' ';
      }
    }
    file << '\n';
    // Release the reference obtained from the stream; without this every
    // processed tuple would stay allocated.
    t->DeleteIfAllowed();
  }
  transactions.close();
  // Closing flushes the buffered output; a failed write or close leaves the
  // stream in a failed state, which is reported as an unsuccessful save.
  file.close();
  success->Set(true, !file.fail());
  return 0;
}
// Sets up the local info of the extendItemNames operator: stores the input
// stream and the (itemset attribute, names attribute) index pairs, creates
// the result tuple type, opens the input stream and finally loads the
// item-to-name mapping from the file at the given path.
//
// Each line of that file is split at the first comma: the part in front of
// it is converted with std::stoi and used as the item, the rest of the line
// is the name. Lines whose item part cannot be converted are skipped. If the
// file cannot be opened the mapping stays empty.
extendItemNamesLI::extendItemNamesLI(
    Stream<Tuple> *stream, const std::string &path,
    std::vector<std::pair<int, int>> attrMapping, ListExpr tupleType)
    : stream(stream), attrMapping(std::move(attrMapping)) {
  this->tupleType =
      new TupleType(SecondoSystem::GetCatalog()->NumericType(tupleType));
  this->stream->open();

  std::fstream input(path, std::ios::in);
  if (!input.is_open()) {
    return;
  }
  for (std::string row; std::getline(input, row);) {
    std::istringstream fields(row);
    std::string idText;
    std::string label;
    std::getline(fields, idText, ',');
    std::getline(fields, label, '\n');
    try {
      this->nameMapping[std::stoi(idText)] = label;
    } catch (const std::exception &) {
      // No usable item id in front of the comma: ignore this line.
    }
  }
}
// Returns the next tuple of the input stream extended by one set of texts per
// configured (itemset attribute, names attribute) pair, or nullptr when the
// input stream is exhausted.
//
// All attributes of the input tuple are copied to the same positions of the
// result tuple. Every names attribute receives the names of those items of
// the corresponding itemset that have an entry in the name mapping; items
// without an entry are left out.
Tuple *extendItemNamesLI::getNext() {
  Tuple *t = this->stream->request();
  if (t == nullptr) {
    return nullptr;
  }
  auto *nt = new Tuple(this->tupleType);
  for (int i = 0; i < t->GetNoAttributes(); i += 1) {
    nt->CopyAttribute(i, t, i);
  }
  ListExpr textSetType = SecondoSystem::GetCatalog()->NumericType(
      NList(NList().symbolAtom(Set::BasicType()),
            NList().symbolAtom(FText::BasicType()))
          .listExpr());
  for (const auto &[itemsetAttr, namesAttr] : this->attrMapping) {
    auto names = new collection::Collection(collection::CollectionType::set,
                                            textSetType, 10);
    names->SetDefined(true);
    auto itemset = (collection::IntSet *)t->GetAttribute(itemsetAttr);
    for (std::size_t i = 0; i < itemset->getSize(); i += 1) {
      // Single lookup instead of count() followed by operator[].
      auto entry = this->nameMapping.find(itemset->get(i));
      if (entry != this->nameMapping.end()) {
        // The collection keeps its own copy of an inserted element, so the
        // temporary text has to be released again after the insertion.
        auto *name = new FText(true, entry->second);
        names->Insert(name, 1);
        name->DeleteIfAllowed();
      }
    }
    nt->PutAttribute(namesAttr, names);
  }
  // The input tuple is no longer needed once its attributes are copied and
  // its itemsets are read; release the reference obtained from the stream.
  t->DeleteIfAllowed();
  return nt;
}
// Type mapping for the extendItemNames operator.
//
// Expects three arguments: a tuple stream, a text and a list of attribute
// pairs (new names attribute, existing itemset attribute). For every pair the
// second entry must name an intset attribute of the stream and the first
// entry must not be the name of an attribute of the stream. The result is
// the input tuple stream extended by one set-of-text attribute per pair, in
// the order of the pairs.
ListExpr extendItemNamesTM(ListExpr args) {
  NList type(args);
  NList attrs;
  NList newAttrs;
  if (type.length() != 3) {
    return NList::typeError("3 arguments expected but " +
                            std::to_string(type.length()) + " received.");
  }
  if (!type.elem(1).first().checkStreamTuple(attrs)) {
    return NList::typeError(
        "Argument number 1 must be of type stream(tuple(...)).");
  }
  newAttrs = attrs;
  if (!type.elem(2).first().isSymbol(FText::BasicType())) {
    return NList::typeError("Argument number 2 must be of type text.");
  }

  NList attrPairs = type.elem(3).first();
  for (std::size_t p = 1; p <= attrPairs.length(); p += 1) {
    NList pair = attrPairs.elem(p);
    if (pair.length() != 2) {
      return NList::typeError(
          "Argument number 3 must be a list of attribute pairs.");
    }
    std::string sourceName = pair.second().str();

    // One pass over the input attributes collects everything needed to
    // validate this pair.
    bool sourceFound = false;
    bool sourceIsIntSet = false;
    bool targetTaken = false;
    for (std::size_t a = 1; a <= attrs.length(); a += 1) {
      NList attr = attrs.elem(a);
      if (attr.first().isSymbol(sourceName)) {
        sourceFound = true;
        sourceIsIntSet =
            attr.second().isSymbol(collection::IntSet::BasicType());
      }
      if (attr.first().isSymbol(pair.first().str())) {
        targetTaken = true;
      }
    }

    if (!sourceFound) {
      return NList::typeError("Attribute " + sourceName +
                              " not found in the given tuple stream.");
    }
    if (!sourceIsIntSet) {
      return NList::typeError("Attribute " + sourceName +
                              " is not an intset in the given tuple stream.");
    }
    if (targetTaken) {
      return NList::typeError("Attribute " + pair.first().str() +
                              " already exists in the given tuple stream.");
    }

    newAttrs.append(NList(pair.first(),
                          NList(NList().symbolAtom(Set::BasicType()),
                                NList().symbolAtom(FText::BasicType()))));
  }
  return NList().streamOf(NList().tupleOf(newAttrs)).listExpr();
}
// Value mapping for the extendItemNames operator.
//
// OPEN    discards a previous local info (if any), resolves every attribute
//         pair of the third argument to zero-based positions within the
//         result tuple type and creates a new local info from the input
//         stream, the path and these positions.
// REQUEST hands out the next extended tuple; answers YIELD while a tuple is
//         available and CANCEL otherwise (also when there is no local info).
// CLOSE   deletes the local info and resets local.addr.
// Any other message is ignored.
int extendItemNamesVM(Word *args, Word &result, int message, Word &local,
                      Supplier s) {
  auto *state = (extendItemNamesLI *)local.addr;
  if (message == OPEN) {
    delete state;
    auto *input = new Stream<Tuple>(args[0]);
    std::string path = ((FText *)args[1].addr)->GetValue();
    // The second element of the operator's result type is the tuple type,
    // whose second element in turn is the attribute list.
    NList tupleType = NList(qp->GetType(s)).second();
    NList attrList = tupleType.second();
    const int attrCount = (int)attrList.length();

    std::vector<std::pair<int, int>> attrMapping;
    NList attrPairs(qp->GetType(args[2].addr));
    for (std::size_t p = 1; p <= attrPairs.length(); p += 1) {
      NList pair = attrPairs.elem(p);
      std::string namesAttrName = pair.first().str();
      std::string itemsetAttrName = pair.second().str();
      // 1-based positions within the attribute list; -1 means "not found".
      int namesPos = -1;
      int itemsetPos = -1;
      for (int a = 1; a <= attrCount; a += 1) {
        if (attrList.elem(a).first() == namesAttrName) {
          namesPos = a;
        }
        if (attrList.elem(a).first() == itemsetAttrName) {
          itemsetPos = a;
        }
      }
      assert(namesPos != -1 && itemsetPos != -1);
      attrMapping.emplace_back(itemsetPos - 1, namesPos - 1);
    }
    local.addr =
        new extendItemNamesLI(input, path, attrMapping, tupleType.listExpr());
    return 0;
  }
  if (message == REQUEST) {
    if (state == nullptr) {
      result.addr = nullptr;
      return CANCEL;
    }
    result.addr = state->getNext();
    return result.addr ? YIELD : CANCEL;
  }
  if (message == CLOSE) {
    delete state;
    local.addr = nullptr;
    return 0;
  }
  return 0;
}
} // namespace AssociationAnalysis