Files
secondo/Algebras/Distributed5/partitionFS.cpp
2026-01-23 17:03:45 +08:00

476 lines
14 KiB
C++

/*
----
This file is part of SECONDO.
Copyright (C) 2015,
Faculty of Mathematics and Computer Science,
Database Systems for New Applications.
SECONDO is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
SECONDO is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with SECONDO; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
----
//[$][\$]
*/
#include "partitionFS.h"
#include "DInputConsumer.h"
using namespace std;
using namespace distributed2;
/*
0 Functions from distributed2Algebras
*/
namespace distributed2
{
// Algebra instance
extern Distributed2Algebra *algInstance;
} // namespace distributed2
namespace distributed5
{
/*
1 partitionFS Operator
Creates a stream of tasks.
Input Parameter can be
* a stream of tasks
* D[F]Array
1.1 Type Mapping
*/
ListExpr partitionFSTM(ListExpr args)
{
string err =
"{{d[f]array(rel(X)) / stream(task(d[f]array(rel(X))))} x string x "
"(fsrel(X) -> stream(tuple(Y))) x "
"(tuple(Y) -> int) x "
"int} expected";
//ensure that exactly 5 argument comes into partitionFS
if (!nl->HasLength(args, 5))
{
return listutils::typeError(err + " (wrong number of args)");
}
ListExpr argInput = nl->First(args);
// check for internal correctness
if (!nl->HasLength(argInput, 2))
{
return listutils::typeError("internal error");
}
ListExpr argName = nl->Second(args);
ListExpr argFunMap = nl->Third(args);
ListExpr argFunPartition = nl->Fourth(args);
ListExpr argVSlots = nl->Fifth(args);
// check for internal correctness (uses Args in type mapping)
if (!nl->HasLength(argInput, 2) ||
!nl->HasLength(argName, 2) ||
!nl->HasLength(argFunMap, 2) ||
!nl->HasLength(argFunPartition, 2) ||
!nl->HasLength(argVSlots, 2))
{
return listutils::typeError("internal error");
}
ListExpr argInputType = nl->First(argInput);
// i.e. argInputType =
// (darray int) or
// (dfarray int) or
// (stream (task (darray int))) or
// (stream (task (dfarray int)))
bool inputIsDArray = DArray::checkType(argInputType);
bool inputIsDFArray = DFArray::checkType(argInputType);
bool inputIsDTaskStream = Stream<Task>::checkType(argInputType) &&
DArray::checkType(
Task::innerType(
nl->Second(argInputType)));
bool inputIsDFTaskStream = Stream<Task>::checkType(argInputType) &&
DFArray::checkType(
Task::innerType(
nl->Second(argInputType)));
bool inputIsStream = inputIsDTaskStream || inputIsDFTaskStream;
bool inputOk = inputIsDArray ||
inputIsDFArray ||
inputIsDTaskStream ||
inputIsDFTaskStream;
if (!inputOk)
{
return listutils::typeError(err + " (input invalid)");
}
ListExpr argNameType = nl->First(argName);
ListExpr argFunMapType = nl->First(argFunMap);
ListExpr argFunPartitionType = nl->First(argFunPartition);
ListExpr argVSlotsType = nl->First(argVSlots);
if (!CcString::checkType(argNameType))
{
return listutils::typeError(err + " (name type invalid)");
}
if (
!listutils::isMap<2>(argFunMapType) ||
!listutils::isMap<2>(argFunPartitionType))
{
return listutils::typeError(err + " (fun type invalid)");
}
if (!CcInt::checkType(argVSlotsType))
{
return listutils::typeError(err + " (vslots type invalid)");
}
ListExpr darrayType = Stream<Task>::checkType(argInputType)
? Task::innerType(nl->Second(argInputType))
: argInputType;
//Function argument type
ListExpr funMapArg1 = nl->Second(argFunMapType);
ListExpr funMapArg2 = nl->Third(argFunMapType);
//expected Function Argument type
ListExpr expFunMapArg = nl->TwoElemList(
listutils::basicSymbol<fsrel>(),
nl->Second(nl->Second(darrayType)));
if (!nl->Equal(expFunMapArg, funMapArg1))
{
stringstream ss;
ss << "type mismatch between map function argument"
<< " and subtype of d[f]array" << endl
<< "subtype is " << nl->ToString(expFunMapArg) << endl
<< "funarg is " << nl->ToString(funMapArg1) << endl;
return listutils::typeError(ss.str());
}
if (!nl->Equal(expFunMapArg, funMapArg2))
{
stringstream ss;
ss << "type mismatch between map function argument"
<< " and subtype of d[f]array" << endl
<< "subtype is " << nl->ToString(expFunMapArg) << endl
<< "funarg is " << nl->ToString(funMapArg2) << endl;
return listutils::typeError(ss.str());
}
//Function return type
ListExpr funMapRes = nl->Fourth(argFunMapType);
if (!Stream<Tuple>::checkType(funMapRes))
{
return listutils::typeError(err +
" (map fun must return a tuple stream)");
}
// the function definition
ListExpr funMap = nl->Second(argFunMap);
// we have to replace the given function arguments
// by the real function arguments because the
// given function argument may be a TypeMapOperator
ListExpr rfunMap = nl->FourElemList(
nl->First(funMap),
nl->TwoElemList(nl->First(nl->Second(funMap)), expFunMapArg),
nl->TwoElemList(nl->First(nl->Third(funMap)), expFunMapArg),
nl->Fourth(funMap));
// compute the subtype of the resulting array
ListExpr tupleType = nl->Second(funMapRes);
ListExpr relType = nl->TwoElemList(
listutils::basicSymbol<Relation>(), tupleType);
ListExpr funPartitionArg1 = nl->Second(argFunPartitionType);
ListExpr funPartitionArg2 = nl->Third(argFunPartitionType);
if (!nl->Equal(tupleType, funPartitionArg1))
{
stringstream ss;
ss << "type mismatch between partition function argument"
<< " and result of map function" << endl
<< "result is " << nl->ToString(tupleType) << endl
<< "funarg is " << nl->ToString(funPartitionArg1) << endl;
return listutils::typeError(ss.str());
}
if (!nl->Equal(tupleType, funPartitionArg2))
{
stringstream ss;
ss << "type mismatch between partition function argument"
<< " and result of map function" << endl
<< "result is " << nl->ToString(tupleType) << endl
<< "funarg is " << nl->ToString(funPartitionArg2) << endl;
return listutils::typeError(ss.str());
}
ListExpr funPartitionRes = nl->Fourth(argFunPartitionType);
if (!CcInt::checkType(funPartitionRes))
{
return listutils::typeError(err +
" (partition fun must return int)");
}
// the function definition
ListExpr funPartition = nl->Second(argFunPartition);
// we have to replace the given function arguments
// by the real function arguments because the
// given function argument may be a TypeMapOperator
ListExpr rfunPartition = nl->FourElemList(
nl->First(funPartition),
nl->TwoElemList(nl->First(nl->Second(funPartition)), tupleType),
nl->TwoElemList(nl->First(nl->Third(funPartition)), tupleType),
nl->Fourth(funPartition));
// create a new function with a single argument which calls the
// real function, passing the argument to both arguments
// this allows the user to choice to use . (correct) or .. (deprecated)
ListExpr newArgument = nl->SymbolAtom(
nl->SymbolValue(nl->First(nl->Third(funPartition))) +
"_");
ListExpr rfunPartitionSingleArg = nl->ThreeElemList(
nl->First(funPartition),
nl->TwoElemList(newArgument, tupleType),
nl->ThreeElemList(
rfunPartition,
newArgument,
newArgument));
// i.e. (stream (task (dfmatrix (rel ...))))
ListExpr resType = nl->TwoElemList(
listutils::basicSymbol<Stream<Task>>(),
nl->TwoElemList(
listutils::basicSymbol<Task>(),
nl->TwoElemList(
listutils::basicSymbol<DFMatrix>(),
relType)));
ListExpr appendValues = nl->ThreeElemList(
nl->BoolAtom(inputIsStream),
nl->TextAtom(nl->ToString(rfunMap)),
nl->TextAtom(nl->ToString(rfunPartitionSingleArg)));
return nl->ThreeElemList(
nl->SymbolAtom(Symbols::APPEND()),
appendValues,
resType);
}
/*
1.2 Local Information Class for the partitionFS Operator
*/
class partitionFSLI
{
public:
partitionFSLI(DInputConsumer &&input,
string mapFunction, string partitionFunction,
string remoteName, int vslots, ListExpr contentType)
: input(std::move(input)),
mapFunction(mapFunction),
partitionFunction(partitionFunction),
remoteName(remoteName),
vslots(vslots),
contentType(contentType) {}
//destructor of partitionFSLI
~partitionFSLI()
{
}
//returns the next task for the successor operator
Task *getNext()
{
if (!inputConsumed)
{
Task *inputTask = input.request();
if (inputTask != 0)
{
if (inputTask->hasFlag(Output))
{
inputTask->clearFlag(Output);
collectedTasks[inputTask->getPreferredLocation()]
.push_back(inputTask);
}
return inputTask;
}
if (vslots == 0)
{
for (auto &pair : collectedTasks)
vslots += pair.second.size();
}
inputConsumed = true;
}
if (collectedTasks.size() == 0)
return 0;
auto pair = collectedTasks.begin();
auto *partitionTask = new PartitionFunctionTask(
pair->first,
mapFunction, partitionFunction,
remoteName, vslots, contentType);
partitionTask->setFlag(Output);
for (auto task : pair->second)
{
partitionTask->addPredecessorTask(task);
}
collectedTasks.erase(pair);
return partitionTask;
}
private:
bool inputConsumed = false;
map<WorkerLocation, vector<Task *>> collectedTasks;
DInputConsumer input;
string mapFunction;
string partitionFunction;
string remoteName;
int vslots;
ListExpr contentType;
};
/*
1.3 Value Mapping for partitionFS
*/
int partitionFSVM(Word *args,
Word &result,
int message,
Word &local,
Supplier s)
{
partitionFSLI *li = (partitionFSLI *)local.addr;
switch (message)
{
case OPEN:
{
if (li)
{
delete li;
}
// Arguments are:
// input, remoteName, fnMap*, fnPartition*, vslots,
// isStream, fnMapText, fnPartitionText
CcString *incomingRemoteName = (CcString *)args[1].addr;
CcInt *incomingVSlots = (CcInt *)args[4].addr;
bool isStream = ((CcBool *)args[5].addr)->GetValue();
string mapFunction = ((FText *)args[6].addr)->GetValue();
string partitionFunction = ((FText *)args[7].addr)->GetValue();
// create a new name for the result matrix
std::string remoteName;
if (!incomingRemoteName->IsDefined() ||
incomingRemoteName->GetValue().length() == 0)
{
remoteName = algInstance->getTempName();
}
else
{
remoteName = incomingRemoteName->GetValue();
}
// check whether the name is valid
if (!stringutils::isIdent(remoteName))
{
return 0;
}
int vslots = 0;
if (incomingVSlots->IsDefined())
{
vslots = incomingVSlots->GetValue();
}
//check for all previous tasks
DInputConsumer input(
isStream
? DInputConsumer(args[0])
: DInputConsumer(
(DArrayBase *)args[0].addr,
DInputConsumer::getContentType(
qp->GetType(qp->GetSon(s, 0)))));
local.addr = li =
new partitionFSLI(std::move(input),
mapFunction,
partitionFunction,
remoteName,
vslots,
Task::resultType(nl->Second(qp->GetType(s))));
return 0;
}
case REQUEST:
result.addr = li ? li->getNext() : 0;
return result.addr ? YIELD : CANCEL;
case CLOSE:
if (li)
{
delete li;
local.addr = 0;
}
return 0;
}
return 0;
}
/*
1.4 Specification for partitionFS
*/
OperatorSpec partitionFSSpec(
"d[f]array(X)/tasks(d[f]array(X)) x string x fun x fun x int "
"-> tasks(dfmatrix(Y))",
"_ partitionFS[_,_]",
"Partitions distributed data across a vertical partitioning schema",
"");
/*
1.5 Operator partitionFS
*/
Operator partitionFSOp(
"partitionFS",
partitionFSSpec.getStr(),
partitionFSVM,
Operator::SimpleSelect,
partitionFSTM);
} // namespace distributed5