Files
secondo/Optimizer/Distributed/distributed.pl

2033 lines
62 KiB
Perl
Raw Normal View History

2026-01-23 17:03:45 +08:00
/*
//paragraph [10] title: [{\Large \bf ] [}]
//characters [1] formula: [$] [$]
//[ae] [\"{a}]
//[oe] [\"{o}]
//[ue] [\"{u}]
//[ss] [{\ss}]
//[Ae] [\"{A}]
//[Oe] [\"{O}]
//[Ue] [\"{U}]
//[**] [$**$]
//[toc] [\tableofcontents]
//[=>] [\verb+=>+]
//[:Section Translation] [\label{sec:translation}]
//[Section Translation] [Section~\ref{sec:translation}]
//[:Section 4.1.1] [\label{sec:4.1.1}]
//[Section 4.1.1] [Section~\ref{sec:4.1.1}]
//[Figure pog1] [Figure~\ref{fig:pog1.eps}]
//[Figure pog2] [Figure~\ref{fig:pog2.eps}]
//[newpage] [\newpage]
[10] Query Optimization for Distributed Query Processing
Fapra group 2015/16 and Ralf Hartmut G[ue]ting, June 2016.
[toc]
[newpage]
1 Introduction
This file contains additions to ~optimizerNewProperties~ to support query optimization for distributed processing, using the ~Distributed2Algebra~.
2 The Target Language
In the target language, we use the following additional operators:
----
dloop darray(X) x string x (X->Y) -> darray(Y)
Performs a function on each element of a darray instance. The
string argument specifies the name of the result. If the
name is undefined or an empty string, a name is generated
automatically.
dloop2 darray(X) x darray(Y) x string x (fun : X x Y -> Z) -> darray(Z)
Performs a function on the elements of two darray instances.
The string argument specifies the name of the resulting
darray. If the string is undefined or empty, a name is
generated automatically.
dmap d[f]array x string x fun -> d[f]array
Performs a function on a distributed file array. If the
string argument is empty or undefined, a name for the result
is chosen automatically. If not, the string specifies the
name. The result is of type dfarray if the function produces
a tuple stream or a relation; otherwise the result is a
darray.
dmap2 d[f]array x d[f]array x string x fun -> d[f]array
Joins the slots of two distributed arrays.
partition d[f]array(rel(tuple)) x string x (tuple->int) x int-> dfmatrix
Redistributes the contents of a dfarray value. The new slot
contents are kept on the worker where the values were stored
before redistributing them. The last argument (int)
determines the number of slots of the redistribution. If
this value is smaller or equal to zero, the number of slots
is overtaken from the array argument.
partitionF d[f]array(rel(X)) x string x ([fs]rel(X)->stream(Y)) x (Y ->
int) x int -> dfmatrix(rel(Y))
Repartitions a distributed [file] array. Before repartition,
a function is applied to the slots.
collect2 dfmatrix x string x int -> dfarray
Collects the slots of a matrix into a dfarray. The string
is the name of the resulting array, the int value specifies
a port for file transfer. The port value can be any port
usable on all workers. A corresponding file transfer server
is started automatically.
areduce dfmatrix(rel(t)) x string x (fsrel(t)->Y) x int -> d[f]array(Y)
Performs a function on the distributed slots of an array.
The task distribution is dynamic, meaning that a fast
worker will handle more slots than a slower one. The result
type depends on the result of the function. For a relation
or a tuple stream, a dfarray will be created. For other non-
stream results, a darray is the resulting type.
dsummarize darray(DATA) -> stream(DATA) , d[f]array(rel(X)) -> stream(X)
Produces a stream of the darray elements.
getValue {darray(T),dfarray(T)} -> array(T)
Converts a distributed array into a normal one.
tie ((array t) (map t t t)) -> t
Calculates the "value" of an array evaluating the elements
of the array with a given function from left to right.
----
3 Replication
To consider distributed queries with predicates containing non-relation
objects, it is necessary to replicate the objects to the
involved workers.
For now we assume that every found object is contained in the distributed
part of the query (function of dmap or dmap2).
*/
:- dynamic(replicatedObject/1).   % objects referenced by the current plan
:- dynamic(shared/1).             % objects already shared in this session

% replicateObjects
% Shares every remembered object that has not been shared yet. The work is
% done by the failure-driven loop replicateObjects2, whose only clause ends
% in fail; negating it therefore makes this predicate succeed.
replicateObjects :-
  \+ replicateObjects2.

% replicateObjects2
% For each replicatedObject(X) without a shared(X) fact: build the
% share(...) plan term, turn it into a Secondo query, execute it and record
% shared(X). The trailing fail backtracks into the next object.
replicateObjects2 :-
  replicatedObject(Obj),
  \+ shared(Obj),
  atom_string(Obj, ObjString),
  ShareTerm = share(value_expr(string, ObjString), true,
                    dbotherobject(sec2workers)),
  plan_to_atom(ShareTerm, Query),
  atom_concat('query ', Query, Command),
  secondo(Command),
  assert(shared(Obj)),
  fail.
/*
4 Translation of Plans
*/
% plan_to_atom_string(+Plan, -Result)
% Translates Plan via plan_to_atom/2.
%
% Distributed query: the list of remembered objects is cleared first, the
% plan is translated (which may remember objects through the dbobject rule),
% and finally the remembered objects are shared.
plan_to_atom_string(Plan, Result) :-
  isDistributedQuery,
  retractall(replicatedObject(_)),
  plan_to_atom(Plan, Result),
  replicateObjects,
  !.

% Non-distributed query: plain translation.
plan_to_atom_string(Plan, Result) :-
  \+ isDistributedQuery,
  plan_to_atom(Plan, Result),
  !.
% Preliminary translation of dproduct (before the operator exists).
% Produces the text  <X> <Y> dproduct[<name>, <Plan>, <Server>]  where <name>
% is the translation of the empty string value. The third argument of the
% dproduct term is not used.
plan_to_atomD(dproduct(X, Y, _, Plan, Server), Result) :-
  plan_to_atom(X, XAtom),
  plan_to_atom(Y, YAtom),
  plan_to_atom(value_expr(string, ""), NameAtom),
  plan_to_atom(Plan, PlanAtom),
  plan_to_atom(Server, ServerAtom),
  Parts = [XAtom, ' ', YAtom, ' dproduct[', NameAtom, ', ',
           PlanAtom, ', ', ServerAtom, ']'],
  atomic_list_concat(Parts, '', Result).
% Translation of a database object reference, dbobject(Name).
% In a distributed query the external name is additionally remembered as
% replicatedObject/1 so that it can be shared (replicated) later.
% If the name cannot be mapped to an external name, an error message is
% written and error_Internal(...) is thrown.
plan_to_atomD(dbobject(Name), ExtName) :-
  dcName2externalName(DCname, Name),           % convert to DC-spelling
  ( dcName2externalName(DCname, ExtName)       % Name is known
    -> ( isDistributedQuery
         -> assertOnce(replicatedObject(ExtName))
         ;  true
       )
    ;  write_list(['\nERROR:\tCannot translate \'',dbobject(DCname),'\'.']),
       throw(error_Internal(optimizer_plan_to_atom(dbobject(DCname),
                                                   ExtName)::missingData))
  ),
  !.
% our_attrname(attr(Name, Arg, Case)) / our_a(Name, Arg, Case):
% attribute reference written with a leading '..'. A Var:Attr name loses its
% variable prefix; the remaining name is passed through upper/2.
plan_to_atomD(our_attrname(attr(Name, Arg, Case)), Result) :-
  plan_to_atomD(our_a(Name, Arg, Case), Result).

plan_to_atomD(our_a(_:AttrName, _, _), Result) :-
  upper(AttrName, Upper),
  atom_concat('..', Upper, Result),
  !.

plan_to_atomD(our_a(AttrName, _, _), Result) :-
  upper(AttrName, Upper),
  atom_concat('..', Upper, Result),
  !.

% simple_attrname(attr(Name, Arg, Case)) / simple_a(Name, Arg, Case):
% just the attribute name passed through upper/2, regardless of renaming.
plan_to_atomD(simple_attrname(attr(Name, Arg, Case)), Result) :-
  plan_to_atomD(simple_a(Name, Arg, Case), Result),
  !.

plan_to_atomD(simple_a(_:AttrName, _, _), Upper) :-
  upper(AttrName, Upper),
  !.

plan_to_atomD(simple_a(AttrName, _, _), Upper) :-
  upper(AttrName, Upper),
  !.
%B.Huber
% Additional sum translations for groupby with combined aggregate methods.

% sum(NewAttr, attrname(Attr)):  <NewAttr> sum[<Attr>]
plan_to_atomD(sum(NewAttr, AttrName), Result) :-
  AttrName = attrname(attr(_, _, _)),
  plan_to_atom(AttrName, AttrAtom),
  plan_to_atom(NewAttr, StreamAtom),
  my_concat_atom([StreamAtom, ' sum[', AttrAtom, ']'], '', Result),
  !.

% sum(NewAttr, attrname(A1), attrname(A2)):
%   <NewAttr> sum[<A1>] / <NewAttr> sum[<A2>]
plan_to_atomD(sum(NewAttr, AttrName1, AttrName2), Result) :-
  AttrName1 = attrname(attr(_, _, _)),
  AttrName2 = attrname(attr(_, _, _)),
  plan_to_atom(AttrName1, AttrAtom1),
  plan_to_atom(AttrName2, AttrAtom2),
  plan_to_atom(NewAttr, StreamAtom),
  my_concat_atom([StreamAtom, ' sum[', AttrAtom1, '] / ',
                  StreamAtom,' sum[',AttrAtom2, ']'], '', Result),
  !.
%B.Huber end
/*
5 Translation Rules
5.1 Translation of Arguments
Treat translation into distributed arguments. The properties we use are...
* ~distribution~ (~DistributionType~, ~DistributionAttribute~, ~DistributionParameter~):
~DistributionType~ is ~share~, ~spatial~, ~modulo~, ~function~ or ~random~.
~DistributionAttribute~ is the attribute of the relation used to determine
on which partition(s) to put a given tuple (in theory this could also be a list).
~DistributionParameter~ is the parameter used for the distribution (like grid or
function object / operator).
* ~distributedobjecttype~(~Type~) (~Type~ is ~darray~, ~dfarray~ or ~dfmatrix~).
* ~disjointpartitioning~ signals that, if we treat a partition as the multi set
of the tuples it contains, the union of all partitions is the original relation
(put differently, in as far as duplicates exist, they have been present in the
original relation).
Since some Secondo plans eliminate duplicates anyways, they can do without their
arguments having this property (e.g. spatial join).
*/
% distributedarg(N): translate argument N into the distributed object
% registered for its relation in distributedRels/7.
% The property list always carries the distribution and the distributed
% object type; disjointpartitioning is appended unless the distribution type
% is spatial. The distribution attribute is reported in lower case.
distributedarg(N) translatesD [Object, Props] :-
  BaseProps = [distribution(DistType, DCDistAttr, DistParam),
               distributedobjecttype(DistObjType)],
  argument(N, rel(DCName, _)),
  distributedRels(rel(DCName, _), Object, DistObjType, _,
                  DistType, DistAttr, DistParam),
  ( DistType = spatial
    -> Props = BaseProps
    ;  append(BaseProps, [disjointpartitioning], Props)
  ),
  downcase_atom(DistAttr, DCDistAttr).
/*
5.2 Translation of Selections that Concern Distributed Relations
5.2.1 Selection Without Index
*/
% Generic selection: filter each slot of the translated argument with dmap.
% If the argument's partitions are not known to be disjoint, the condition
% is extended by the (possibly renamed) attribute `original` so duplicates
% are removed, and the result is marked as disjointly partitioned.
distributedselect(Arg, pr(Cond, rel(_, Var))) translatesD
    [dmap(ArgA, value_expr(string, ""), filter(Stream, SlotCond)), OutProps] :-
  Arg => [ArgA, InProps],
  ( memberchk(disjointpartitioning, InProps)
    -> SlotCond = Cond,
       OutProps = InProps
    ;  SlotCond = and(Cond, Original),
       append(InProps, [disjointpartitioning], OutProps)
  ),
  % feed the slot relation, renaming it if needed
  feedRenameRelation2(Arg, dot, Var, Stream),
  renamedRelAttr(attr(original, 1, u), Var, Original).
/*
5.2.2 Using a Standard Index (B-Tree)
*/
% Selection `Attr starts Val` through a distributed btree index.
% The plan is a dmap2 over the index object and the relation object that
% performs a range search from Val to increment(Val). For a spatially
% distributed relation the range result is additionally filtered by the
% (possibly renamed) attribute `original` to remove duplicates.
distributedselect(arg(N), pr(Attr starts Val, rel(_, Var)))
    translatesD [dmap2(IndexObj, RelObj, value_expr(string, ""),
                       SlotPlan, 1238),
                 [distribution(DistType, DCDistAttr, DistParam),
                  distributedobjecttype(dfarray), disjointpartitioning]] :-
  argument(N, rel(DCName, _)),
  distributedRels(rel(DCName, _), RelObj, _, _,
                  DistType, DistAttr, DistParam),
  ( DistType = spatial
    -> SlotPlan = filter(RangeStream, Original)      % remove duplicates
    ;  SlotPlan = RangeStream
  ),
  downcase_atom(DistAttr, DCDistAttr),
  attrnameDCAtom(Attr, DCAttr),
  write('we got here'), nl, nl, nl,
  % look up a btree index for the relation + attribute
  distributedIndex(RelObj, DCAttr, btree, IndexObj),
  renameStream(range(dot, dotdot, Val, increment(Val)),
               Var, RangeStream),
  renamedRelAttr(attr(original, 1, u), Var, Original).
/*
5.2.3 Using a Spatial Index
*/
% Selection `Attr intersects Val` through a distributed rtree index.
% The plan is a dmap2 over the index object and the relation object that
% runs windowintersects and filters the candidates with the predicate. For a
% spatially distributed relation the filter is conjoined with the (possibly
% renamed) attribute `original` to remove duplicates.
distributedselect(arg(N), pr(Attr intersects Val, rel(_, Var)))
    translatesD [dmap2(IndexObj, RelObj, value_expr(string, ""),
                       filter(Candidates, FilterPred), 1238),
                 [distribution(DistType, DCDistAttr, DistParam),
                  distributedobjecttype(dfarray), disjointpartitioning]] :-
  argument(N, rel(DCName, _)),
  % a materialized argument relation is needed to use the index
  distributedRels(rel(DCName, _), RelObj, _, _,
                  DistType, DistAttr, DistParam),
  ( DistType = spatial
    -> FilterPred = and(Attr intersects Val, Original)   % remove duplicates
    ;  FilterPred = (Attr intersects Val)
  ),
  downcase_atom(DistAttr, DCDistAttr),
  % look up an rtree index for the relation + attribute
  attrnameDCAtom(Attr, DCAttr),
  distributedIndex(RelObj, DCAttr, rtree, IndexObj),
  renameStream(windowintersects(dot, dotdot, Val),
               Var, Candidates),
  renamedRelAttr(attr(original, 1, u), Var, Original).
/*
5.3 Distributed Join
5.3.1 Distributed Generic Join
*/
% Generic join via dproduct + symmjoin. The operation is asymmetric, so both
% argument orders are offered (they have different costs).

% Order 1: dproduct(Arg1, Arg2). Arg1's slot is reached through dot, Arg2
% through dotdot.
distributedjoin(Arg1, Arg2, pr(Pred, rel(_, Var1), rel(_, Var2)))
    translatesD [dproduct(Arg1A, Arg2A, e,
                          symmjoin(Stream1, Stream2, Pred), 1238),
                 [distribution(random, *, *),
                  distributedobjecttype(dfarray)]] :-
  Arg1 => [Arg1A, _],
  Arg2 => [Arg2A, _],
  feedRenameRelation2(Arg1, dot, Var1, Stream1),
  feedRenameRelation2(Arg2, dotdot, Var2, Stream2).

% Order 2: dproduct(Arg2, Arg1). Now Arg2's slot is reached through dot and
% Arg1 through dotdot; symmjoin still receives Arg1's stream first.
distributedjoin(Arg1, Arg2, pr(Pred, rel(_, Var1), rel(_, Var2)))
    translatesD [dproduct(Arg2A, Arg1A, e,
                          symmjoin(Stream1, Stream2, Pred), 1238),
                 [distribution(random, *, *),
                  distributedobjecttype(dfarray)]] :-
  Arg1 => [Arg1A, _],
  Arg2 => [Arg2A, _],
  feedRenameRelation2(Arg2, dot, Var2, Stream2),
  feedRenameRelation2(Arg1, dotdot, Var1, Stream1).
/*
5.3.2 Equijoin
*/
% Equijoin where NEITHER argument is distributed on its join attribute:
% repartition both arguments by the hash value of the join attribute and
% join matching partitions with areduce2 / hashjoin.
%
% Attr1 and Attr2 are the sides of the predicate X = Y that belong to the
% first and the second argument (isOfFirst/isOfSecond). Each argument must be
% partitioned on ITS OWN join attribute: hashing both arguments on X would
% address an attribute of the other relation whenever the two attribute
% names differ, or whenever the predicate is written as second = first.
distributedjoin(Arg1, Arg2, pr(X = Y, rel(_, Var1), rel(_, Var2)))
    translatesD [Plan, P] :-
  Arg1 => [Arg1A, P1],
  Arg2 => [Arg2A, P2],
  X = attr(_, _, _),
  Y = attr(_, _, _),
  isOfFirst(Attr1, X, Y),
  isOfSecond(Attr2, X, Y),
  attrnameDCAtom(Attr1, DCAttr1),
  attrnameDCAtom(Attr2, DCAttr2),
  % this rule applies only if both arguments need repartitioning
  not(member(distribution(_, DCAttr1, _), P1)),
  not(member(distribution(_, DCAttr2, _), P2)),
  write('we get here. '), nl, nl,
  feedRenameRelation2(Arg1, dot, Var1, Arg1S),
  feedRenameRelation2(Arg2, dotdot, Var2, Arg2S),
  InnerPlan = hashjoin(Arg1S, Arg2S, attrname(Attr1), attrname(Attr2), 999997),
  Plan = areduce2(
           partitionF(Arg1A, value_expr(string, ""), feed(dot),
                      hashvalue(our_attrname(Attr1), 999997), 0),
           partitionF(Arg2A, value_expr(string, ""), feed(dot),
                      hashvalue(our_attrname(Attr2), 999997), 0),
           value_expr(string, ""),
           InnerPlan, 1238),
  write('The plan is (areduce2): '), write(Plan), nl,
  P = [distribution(function, DCAttr1, hash),
       distribution(function, DCAttr2, hash),
       distributedobjecttype(dfarray)].
% Equijoin executed slot-wise with dmap2 / hashjoin. An argument that is
% already distributed on its join attribute is used as it is; otherwise it is
% repartitioned by hash value (partitionF) and collected (collect2).
%
% Attr1 and Attr2 are the sides of the predicate X = Y that belong to the
% first and the second argument (isOfFirst/isOfSecond). Each argument is
% repartitioned on ITS OWN join attribute: hashing on X for both arguments
% would address an attribute of the other relation whenever the two
% attribute names differ, or whenever the predicate is written as
% second = first.
distributedjoin(Arg1, Arg2, pr(X = Y, rel(_, Var1), rel(_, Var2)))
    translatesD [Plan, P] :-
  Arg1 => [Arg1A, P1],
  Arg2 => [Arg2A, P2],
  X = attr(_, _, _),
  Y = attr(_, _, _),
  isOfFirst(Attr1, X, Y),
  isOfSecond(Attr2, X, Y),
  attrnameDCAtom(Attr1, DCAttr1),
  attrnameDCAtom(Attr2, DCAttr2),
  % repartition each argument if necessary
  ( member(distribution(_, DCAttr1, _), P1)
    -> Arg1B = Arg1A
    ;  Arg1B = collect2(
                 partitionF(Arg1A, value_expr(string, ""), feed(dot),
                            hashvalue(our_attrname(Attr1), 999997), 0),
                 value_expr(string, ""), 1238)
  ),
  ( member(distribution(_, DCAttr2, _), P2)
    -> Arg2B = Arg2A
    ;  Arg2B = collect2(
                 partitionF(Arg2A, value_expr(string, ""), feed(dot),
                            hashvalue(our_attrname(Attr2), 999997), 0),
                 value_expr(string, ""), 1238)
  ),
  InnerPlan = hashjoin(Arg1S, Arg2S, attrname(Attr1), attrname(Attr2), 999997),
  Plan = dmap2(Arg1B, Arg2B, value_expr(string, ""), InnerPlan, 1238),
  P = [distribution(function, DCAttr1, hash),
       distribution(function, DCAttr2, hash),
       distributedobjecttype(dfarray)],
  feedRenameRelation2(Arg1, dot, Var1, Arg1S),       % Arg1S = Arg1Stream
  feedRenameRelation2(Arg2, dotdot, Var2, Arg2S).
/*
5.3.3 Spatial Join
*/
% Spatial join (predicate X intersects Y) executed slot-wise with dmap2.
% Distribute arguments for spatial join as needed.
% Result properties: spatial distribution on both join attributes with
% parameter `grid`, object type dfarray. disjointpartitioning is NOT part of
% the result properties.
distributedjoin(Arg1, Arg2, pr(X intersects Y, rel(_, Var1), rel(_, Var2)))
translatesD [Plan, P] :-
% translate both arguments; P1/P2 are their property lists
Arg1 => [Arg1A, P1],
Arg2 => [Arg2A, P2],
% both sides of the predicate must be attributes
X = attr(_, _, _),
Y = attr(_, _, _),
% Attr1 belongs to the first argument, Attr2 to the second
isOfFirst(Attr1, X, Y),
isOfSecond(Attr2, X, Y),
attrnameDCAtom(Attr1, DCAttr1),
attrnameDCAtom(Attr2, DCAttr2),
% presumably the attribute terms without the renaming variable, usable
% before the slot relation is renamed - TODO confirm against unrenamedAttr/2
unrenamedAttr(Attr1, Attr1u),
unrenamedAttr(Attr2, Attr2u),
% repartition each argument if necessary
% An argument whose properties already contain a distribution on its join
% attribute is used unchanged. Otherwise each tuple is extended by an
% attribute `cell` computed as cellnumber(bbox(Attr), grid), the stream is
% partitioned by CellDistAttr (bound further below) and collected.
( member(distribution(_, DCAttr1, _), P1) -> Arg1B = Arg1A
;
Arg1B = collect2(
partitionF(Arg1A, value_expr(string, ""),
extendstream(feed(dot), field(attr(cell, 1, u),
cellnumber(bbox(Attr1u), grid))),
CellDistAttr1, 0),
value_expr(string, ""), 1238)
),
( member(distribution(_, DCAttr2, _), P2) -> Arg2B = Arg2A
;
Arg2B = collect2(
partitionF(Arg2A, value_expr(string, ""),
extendstream(feed(dot), field(attr(cell, 2, u),
cellnumber(bbox(Attr2u), grid))),
CellDistAttr2, 0),
value_expr(string, ""), 1238)
),
% rename the cell attribute if needed
% CellAttr1/2 are used in the slot-level filter, CellDistAttr1/2 as the
% partitioning function inside partitionF above.
renamedRelAttr(attr(cell, 1, u), Var1, CellAttr1),
renamedRelAttr(attr(cell, 2, u), Var2, CellAttr2),
renamedRelAttr2(Arg1, attr2(cell, 1, u), Var1, CellDistAttr1),
renamedRelAttr2(Arg2, attr2(cell, 2, u), Var2, CellDistAttr2),
% Slot-level plan: itSpatialJoin on the join attributes, then keep pairs
% with equal cell attributes that really intersect and pass gridintersects
% (presumably reporting each pair in one cell only - TODO confirm).
InnerPlan =
filter(
itSpatialJoin(Arg1S, Arg2S, attrname(Attr1), attrname(Attr2)),
((CellAttr1 = CellAttr2) and (X intersects Y)) and
gridintersects(grid, bbox(X), bbox(Y), CellAttr1) ),
Plan = dmap2(Arg1B, Arg2B, value_expr(string, ""), InnerPlan, 1238),
P = [distribution(spatial, DCAttr1, grid),
distribution(spatial, DCAttr2, grid),
distributedobjecttype(dfarray)],
% the slot streams: first argument through dot, second through dotdot
feedRenameRelation2(Arg1, dot, Var1, Arg1S),
feedRenameRelation2(Arg2, dotdot, Var2, Arg2S).
/*
6 Cost Functions
---- costD(+Arg, +Sel, +Pred, -Size, -NSlots, -Cost) :-
----
The cost of argument ~Arg~ for given selectivity ~Sel~ and predicate ~Pred~ is ~Cost~, the resulting dfarray has ~NSlots~ slots, each with a relation of size ~Size~.
Costs are in microseconds, as for standard cost estimation.
6.1 Preliminary Definitions for Cost Estimation
The following functions are used for cost- and costs2014-methods.
*/
%B.Huber
%moveCost(NSlots, Size, X) :-
% X is NSlots * Size * 10.0. % Wild guess, means ten seconds
% for a million tuples.
% To be studied, also tuple size plays a role.
/*
The method moveCost is for the dproduct command of the distributed2Algebra and is used
for cost and costs2014 functions.
In the parameter ~NSlotsX~ is the number of slots for the first df- or darray in dproduct
and ~NSlotsY~ is the second array.
For the cost, only the size of the second array is needed; it is passed in parameter
~SizeY~.
The cost constant is defined in milliseconds and the existing cost functions not
costs2014 work with microseconds. For this reason we need the parameter ~Factor~
to get the right costs for both unity.
The result of costs are returned in parameter ~X~.
*/
% moveCost(+NSlotsX, +NSlotsY, +SizeY, +Factor, -X)
% Cost of moving data for dproduct: the per-tuple constant dproductC/1,
% scaled by Factor, multiplied by NSlotsX * NSlotsY * SizeY.
moveCost(NSlotsX, NSlotsY, SizeY, Factor, X) :-
  dproductC(CostPerTupleMS),
  CostPerTuple is CostPerTupleMS * Factor,
  X is NSlotsX * NSlotsY * SizeY * CostPerTuple.
%B.Huber end
%B.Huber
%Old method without any cost constants
%partitionCost(NSlots, Size, X) :-
% nWorkers(NWorkers),
% NRounds is (NSlots // NWorkers) + 1,
% PerSlot is Size * 2.0,
% X is NRounds * PerSlot. % Guess, means two seconds are needed to
% distribute a million tuples on one worker.
% To be studied.
/*
The method partitionCost is for the partitionF command of the distributed2Algebra and is used
for cost and costs2014 functions.
In the parameter ~NWorkers~ is the number of workers and ~NSlots~ the slots for the
relation with the number of tupels in parameter ~Size~.
The cost constant is defined in milliseconds and the existing cost functions not
costs2014 work with microseconds. For this reason we need the parameter ~Factor~
to get the right costs for both unity.
The result of costs are returned in parameter ~X~.
*/
% partitionCost(+NWorkers, +NSlots, +Size, +Factor, -X)
% Cost of partitionF: the per-tuple constant partitionFC/1, scaled by
% Factor, times Size gives the cost of one slot; this is multiplied by the
% number of rounds, (NSlots // NWorkers) + 1.
partitionCost(NWorkers, NSlots, Size, Factor, X) :-
  partitionFC(CostPerTupleMS),
  CostPerTuple is CostPerTupleMS * Factor,
  Rounds is (NSlots // NWorkers) + 1,
  CostPerSlot is Size * CostPerTuple,
  X is Rounds * CostPerSlot.
%B.Huber end
%B.Huber
%collectCost(NSlots, Size, X) :-
% X is NSlots * Size * 2.0. % Guess, means two seconds are needed to
% read a column with a million tuples.
/*
The method collectCost is for the collect2 command of the distributed2Algebra and is used
for cost and costs2014 functions.
In the parameter ~NWorkers~ is the number of workers and ~NSlots~ the slots for the
relation with the number of tupels in parameter ~Size~.
The cost constant is defined in milliseconds and the existing cost functions not
costs2014 work with microseconds. For this reason we need the parameter ~Factor~
to get the right costs for both unity.
The result of costs are returned in parameter ~X~.
*/
% collectCost(+NWorkers, +NSlots, +Size, +Factor, -X)
% Cost of collect2: the constant collect2C/1, scaled by Factor, multiplied by
% (NSlots * Size) / Rounds, where Rounds is (NSlots // NWorkers) + 1.
collectCost(NWorkers, NSlots, Size, Factor, X) :-
  collect2C(CostPerRoundMS),
  CostPerRound is CostPerRoundMS * Factor,
  Rounds is (NSlots // NWorkers) + 1,
  TuplesPerRound is (NSlots * Size) / Rounds,
  X is TuplesPerRound * CostPerRound.
/*
The method areduce2DMapCost is for the areduce2 command of the distributed2Algebra
and is used for cost and costs2014 functions.
In the parameter ~NWorkers~ is the number of workers and the number of tupels
contains the parameter ~Size~.
The cost constant is defined in milliseconds and the existing cost functions not
costs2014 work with microseconds. For this reason we need the parameter ~Factor~
to get the right costs for both unity.
The result of costs are returned in parameter ~X~.
*/
% areduce2DMapCost(+NWorkers, +Size, +Factor, -X)
% Cost share of areduce2: the constant areduce2C/1, scaled by Factor,
% multiplied by Size / NWorkers.
areduce2DMapCost(NWorkers, Size, Factor, X) :-
  areduce2C(CostPerRoundMS),
  CostPerRound is CostPerRoundMS * Factor,
  TuplesPerWorker is Size / NWorkers,
  X is TuplesPerWorker * CostPerRound.
%B.Huber end
%B.Huber
/*
6.2 Additional functions for the cost methods.
*/
%nWorkers(14). % Preliminary.
%Get the number of workers from table SEC2WORKERS
:- dynamic countDWorkers/1.      % cached number of workers

% nWorkers(-Cnt)
% Number of workers. A cached countDWorkers/1 fact is used when present.
% Otherwise the count is obtained with the query 'query SEC2WORKERS count';
% a count of at least 1 is cached, a smaller count only produces a warning.
nWorkers(Cnt) :-
  countDWorkers(Cnt),
  !.

nWorkers(Cnt) :-
  secondo('query SEC2WORKERS count',[_, Cnt]),
  Cnt2 is Cnt,
  ( Cnt2 >= 1
    -> assert(countDWorkers(Cnt2))
    ;  nl,nl,write('No Entries in table SEC2WORKERS'),nl,nl,
       write('Please configure workers!'),nl,nl
  ),
  !.
% getMemoryOneDistributedWorker(+MemoryMaster, +Workers, -MemoryOneWorker)
% Memory available to one worker, derived from the memory of the master:
% six times the master memory is divided evenly among all workers.
% Example from the test system: 3072 MB on the master and 36 workers give
% 3072 * 6 / 36 = 512 MB per worker.
% The factor 6 is fixed here; it should become a dynamic parameter later.
getMemoryOneDistributedWorker(MemoryMaster, Workers, MemoryOneWorker) :-
  MemoryOneWorker is MemoryMaster * 6 / Workers.
%B.Huber end
/*
6.3 Arguments
*/
% Distributed base object: no cost; the size per slot is the relation size
% reported by cost/5 divided by the number of slots in distributedRels/7.
costD(Obj, _, _, Size, NSlots, 0) :-
  distributedRels(rel(DCName, _), Obj, _, NSlots, _, _, _),
  cost(rel(DCName, _), _, _, TotalSize, _),
  Size is TotalSize / NSlots.

% Intermediate result res(N): no cost; the size per slot is the result size
% reported by cost/5 divided by the slot count recorded in nslots/2.
costD(res(N), _, _, Size, NSlots, 0) :-
  cost(res(N), _, _, TotalSize, _),
  nslots(N, NSlots),
  Size is TotalSize / NSlots.
/*
6.4 dproduct
Product moves for each slot ~i~ of the first argument all slots of the second argument to the worker processing slot ~i~. There it applies the parameter plan to the relation of slot ~i~ and the complete relation of the second argument.
*/
% costD(dproduct(X, Y, _, Plan, _), +Sel, +Pred, -Size, -NSlots, -Cost)
% Cost of dproduct. The two arguments may have different slot counts, so they
% are tracked separately (as the costs2014 variant of this clause does); a
% single shared variable made the clause fail whenever the counts differed.
% The result has one slot per slot of the first argument.
%   SizeX, SizeY   tuples per slot of X and Y
%   SizeB          size of the complete second argument
%   CMove          cost of moving the slots of Y (moveCost/5, factor 1000)
% The inner plan is costed with dot/dotdot replaced by dot(SizeX) and
% dotdot(SizeB) and charged once per round.
costD(dproduct(X, Y, _, Plan, _), Sel, Pred, Size, NSlotsX, Cost) :-
  costD(X, 1, _, SizeX, NSlotsX, CostX),
  costD(Y, 1, _, SizeY, NSlotsY, CostY),
  SizeB is SizeY * NSlotsY,
  substituteSubterm(dot, dot(SizeX), Plan, Plan1),
  substituteSubterm(dotdot, dotdot(SizeB), Plan1, Plan2),
  write('Plan cost product = '), write(Plan2), nl, nl,
  cost(Plan2, Sel, Pred, Size, PlanCost),
  moveCost(NSlotsX, NSlotsY, SizeY, 1000, CMove),
  nWorkers(NWorkers),
  NRounds is (NSlotsX // NWorkers) + 1,
  Cost is CostX + CostY + CMove + NRounds * PlanCost.
/*
6.4 dmap, dmap2
*/
% costD(dmap(X, _, Plan), +Sel, +Pred, -Size, -NSlots, -Cost)
% Cost of dmap: cost of the argument plus the cost of the inner plan (with
% dot replaced by dot(<tuples per slot>)) once per round, where the number
% of rounds is (NSlots // NWorkers) + 1.
costD(dmap(X, _, Plan), Sel, Pred, Size, NSlots, Cost) :-
  costD(X, 1, _, SlotSize, NSlots, ArgCost),
  substituteSubterm(dot, dot(SlotSize), Plan, SizedPlan),
  write('Plan cost dmap = '), write(SizedPlan), nl, nl,
  write('Sel = '), write(Sel), nl,
  write('Pred = '), write(Pred), nl, nl,
  cost(SizedPlan, Sel, Pred, Size, PlanCost),
  nWorkers(NWorkers),
  Rounds is (NSlots // NWorkers) + 1,
  Cost is ArgCost + Rounds * PlanCost.
% Cost of dmap2 used for join operations. Both arguments share one slot
% count. The inner plan is costed with dot/dotdot replaced by the per-slot
% sizes and with the selectivity multiplied by the number of slots; it is
% charged once per round.
costD(dmap2(X, Y, _, Plan, _), Sel, Pred, Size, NSlots, Cost) :-
  costD(X, 1, _, SlotSizeX, NSlots, ArgCostX),
  costD(Y, 1, _, SlotSizeY, NSlots, ArgCostY),
  substituteSubterm(dot, dot(SlotSizeX), Plan, PlanWithDot),
  substituteSubterm(dotdot, dotdot(SlotSizeY), PlanWithDot, SizedPlan),
  write('Plan cost dmap2 = '), write(SizedPlan), nl,
  write('Sel = '), write(Sel), nl,
  write('Pred = '), write(Pred), nl, nl,
  SlotSel is Sel * NSlots,
  cost(SizedPlan, SlotSel, Pred, Size, PlanCost),
  write('The cost of the inner plan is: '), write(PlanCost), nl,
  nWorkers(NWorkers),
  Rounds is (NSlots // NWorkers) + 1,
  Cost is ArgCostX + ArgCostY + Rounds * PlanCost.
% Cost of dmap2 used for index access: the index is the first argument and
% is not costed, the relation is the second. Only dotdot is replaced by the
% per-slot relation size; the selectivity is used unchanged.
costD(dmap2(_, X, _, Plan, _), Sel, Pred, Size, NSlots, Cost) :-
  costD(X, 1, _, SlotSize, NSlots, ArgCost),
  substituteSubterm(dotdot, dotdot(SlotSize), Plan, SizedPlan),
  write('Plan cost dmap2 = '), write(SizedPlan), nl,
  write('Sel = '), write(Sel), nl,
  write('Pred = '), write(Pred), nl, nl,
  SlotSel is Sel,
  cost(SizedPlan, SlotSel, Pred, Size, PlanCost),
  write('The cost of the inner plan is: '), write(PlanCost), nl,
  nWorkers(NWorkers),
  Rounds is (NSlots // NWorkers) + 1,
  Cost is ArgCost + Rounds * PlanCost.
/*
6.5 partitionF, collect2
*/
% Cost of partitionF: cost of the argument plus partitionCost/5 (factor
% 1000). Size per slot and slot count are taken over from the argument.
costD(partitionF(X, _, _, _, _), _, _, SizeX, NSlots, Cost) :-
  costD(X, 1, _, SizeX, NSlots, ArgCost),
  nWorkers(NWorkers),                                       %B.Huber
  partitionCost(NWorkers, NSlots, SizeX, 1000, OpCost),     %B.Huber
  Cost is ArgCost + OpCost.

% Cost of collect2: cost of the argument plus collectCost/5 (factor 1000).
% Size per slot and slot count are taken over from the argument.
costD(collect2(X, _, _), _, _, SizeX, NSlots, Cost) :-
  costD(X, 1, _, SizeX, NSlots, ArgCost),
  nWorkers(NWorkers),
  collectCost(NWorkers, NSlots, SizeX, 1000, OpCost),       %B.Huber
  Cost is ArgCost + OpCost.                                 %B.Huber
/*
6.6 areduce2
*/
% Cost of areduce2. Both arguments share one slot count. The total is the sum
% of: the argument costs, a collectCost/5 and an areduce2DMapCost/4 share for
% each argument (factor 1000), and the inner plan cost once per round. The
% inner plan is costed with dot/dotdot replaced by the per-slot sizes and
% with the selectivity multiplied by the number of slots.
costD(areduce2(X, Y, _, Plan, _), Sel, Pred, Size, NSlots, Cost) :-
  costD(X, 1, _, SlotSizeX, NSlots, ArgCostX),
  costD(Y, 1, _, SlotSizeY, NSlots, ArgCostY),
  nWorkers(NWorkers),                                           %B.Huber
  collectCost(NWorkers, NSlots, SlotSizeX, 1000, CollectX),     %B.Huber
  collectCost(NWorkers, NSlots, SlotSizeY, 1000, CollectY),     %B.Huber
  areduce2DMapCost(NWorkers, SlotSizeX, 1000, ReduceX),         %B.Huber
  areduce2DMapCost(NWorkers, SlotSizeY, 1000, ReduceY),         %B.Huber
  substituteSubterm(dot, dot(SlotSizeX), Plan, PlanWithDot),
  substituteSubterm(dotdot, dotdot(SlotSizeY), PlanWithDot, SizedPlan),
  write('Plan2 areduce2 = '), write(SizedPlan), nl, nl,
  SlotSel is Sel * NSlots,
  cost(SizedPlan, SlotSel, Pred, Size, PlanCost),
  Rounds is (NSlots // NWorkers) + 1,
  Cost is ArgCostX + ArgCostY + CollectX + CollectY             %B.Huber
          + ReduceX + ReduceY + Rounds * PlanCost.              %B.Huber
%B.Huber
/*
7 Cost 2014 Functions
---- costD(+Arg, +Sel, +Pred, +Result, +Memory, +MemWorker, +Workers,
-Card, -NAttrs, -TupleSize, -NSlots, -Cost) :-
----
The costs2014 functions for executable
The cost of argument ~Arg~ for given selectivity ~Sel~ and predicate ~Pred~ and the result will
belong to the node ~Result~. The param ~Memory~ belong to master and ~MemWorkers~ is the memory of
one worker in the cluster. In the input parameter ~Workers~ contains the total number
of workers based on the table SEC2WORKERS and the number of tuples is ~Card~.
The resulting dfarray or darray was distributed for an relation, this relations has ~NAttrs~
and the specific ~TupleSize~. The dfarray or darray is distributed with ~NSlots~ slots,
and the variable ~Cost~ contains the sum of costs for the arguments which call the
costs2014 functions recursively.
Similar to costs2014 work the following costs functions with milliseconds.
The parameters for costs functions of distributed commands are the same as for
the existing costs2014 functions. For distributed functions the parameters ~MemWorker~,
~Workers~ and ~NSlots~ are additional registered.
7.1 Arguments
*/
% Distributed base object (costs2014): no cost. The size per slot is the
% cardinality of the relation divided by its slot count; tuple size and
% number of attributes are looked up for the relation.
costD(Obj, _, _, _, _, _, _, Size, NAttrs, TSize, NSlots, 0) :-
  distributedRels(rel(Relation, _), Obj, _, NSlots, _, _, _),
  card(Relation, Card),
  Size is Card / NSlots,
  tupleSizeSplit(Relation, TSize),
  getRelAttrList(Relation, AttrList, _),
  length(AttrList, NAttrs).

% Intermediate result res(N) (costs2014): no cost. Cardinality, number of
% attributes and tuple size come from cost/9, the slot count from nslots/2.
costD(res(N), _, _, _, _, _, _, Size, NAttrs, TSize, NSlots, 0) :-
  cost(res(N), _, _, _, _, Card, NAttrs, TSize, _),
  nslots(N, NSlots),
  Size is Card / NSlots.
/*
7.2 dproduct (costs2014)
Product moves for each slot ~i~ of the first argument all slots of the second argument to the worker processing slot ~i~. There it applies the parameter plan to the relation of slot ~i~ and the complete relation of the second argument.
*/
% costD(dproduct(X, Y, _, Plan, _), +Sel, +Pred, +Result, +Memory,
%       +MemWorker, +Workers, -Size, -NAttrs, -TSize, -NSlots, -Cost)
% Cost of dproduct (costs2014).
% NSlots is an output argument; it was left unbound here, which hands an
% unbound slot count to every caller that does arithmetic on it. The result
% of dproduct is computed per slot of the first argument, so its slot count
% NSlotsX is returned.
%   SizeX, SizeY   tuples per slot of X and Y
%   SizeB          size of the complete second argument
%   CMove          cost of moving the slots of Y (moveCost/5, factor 1)
% The inner plan is costed with the memory of one worker and charged once
% per round.
costD(dproduct(X, Y, _, Plan, _), Sel, Pred, Result, Memory, MemWorker,
      Workers, Size, NAttrs, TSize, NSlotsX, Cost) :-
  costD(X, 1, Pred, Result, Memory, MemWorker,
        Workers, SizeX, NAttrsX, TSizeX, NSlotsX, CostX),
  costD(Y, 1, Pred, Result, Memory, MemWorker,
        Workers, SizeY, NAttrsY, TSizeY, NSlotsY, CostY),
  SizeB is SizeY * NSlotsY,
  substituteSubterm(dot, dot(SizeX, NAttrsX, TSizeX), Plan, Plan1),
  substituteSubterm(dotdot, dotdot(SizeB, NAttrsY, TSizeY), Plan1, Plan2),
  write('Plan cost product = '), write(Plan2), nl, nl,
  cost(Plan2, Sel, Pred, Result, MemWorker, Size, NAttrs, TSize, PlanCost),
  moveCost(NSlotsX, NSlotsY, SizeY, 1, CMove),
  NRounds is (NSlotsX // Workers) + 1,
  Cost is CostX + CostY + CMove + NRounds * PlanCost.
/*
7.3 dmap, dmap2 (costs2014)
*/
% Cost of dmap (costs2014): cost of the argument, plus the inner plan cost
% once per round, plus the constant dmapC/1 times (tuples per slot / number
% of workers). The inner plan is costed with the memory of one worker.
% The slot count, number of attributes and tuple size of the argument are
% also the ones reported for the result.
costD(dmap(X, _, Plan), Sel, Pred, Result, Memory, MemWorker,
      Workers, Size, NAttrs, TSize, NSlots, Cost) :-
  costD(X, 1, _, _, Memory, MemWorker,
        Workers, SlotSize, NAttrs, TSize, NSlots, ArgCost),
  substituteSubterm(dot, dot(SlotSize, NAttrs, TSize), Plan, SizedPlan),
  write('Plan cost dmap = '), write(SizedPlan), nl, nl,
  write('Sel = '), write(Sel), nl,
  write('Pred = '), write(Pred), nl, nl,
  cost(SizedPlan, Sel, Pred, Result, MemWorker, Size, NAttrs, TSize, PlanCost),
  Rounds is (NSlots // Workers) + 1,
  dmapC(CostPerRound),
  TuplesPerWorker is SlotSize / Workers,
  OperatorCost is TuplesPerWorker * CostPerRound,
  Cost is ArgCost + Rounds * PlanCost + OperatorCost.
% Cost of dmap2 used for join operations (costs2014).
% NSlots is an output argument; it was left unbound here, which hands an
% unbound slot count to every caller that does arithmetic on it. The clause
% bases its selectivity scaling and its round count on the slot count of the
% second argument, so that value (NSlotsY) is returned.
% Cost = argument costs + inner plan cost once per round + the constant
% dmap2C/1 times (SizeX / Workers + SizeY / Workers). The inner plan is
% costed with the memory of one worker.
costD(dmap2(X, Y, _, Plan, _), Sel, Pred, Result, Memory, MemWorker,
      Workers, Size, NAttrs, TSize, NSlotsY, Cost) :-
  costD(X, 1, _, _, Memory, MemWorker,
        Workers, SizeX, NAttrsX, TSizeX, _, CostX),
  costD(Y, 1, _, _, Memory, MemWorker,
        Workers, SizeY, NAttrsY, TSizeY, NSlotsY, CostY),
  substituteSubterm(dot, dot(SizeX, NAttrsX, TSizeX), Plan, Plan1),
  substituteSubterm(dotdot, dotdot(SizeY, NAttrsY, TSizeY), Plan1, Plan2),
  write('Plan cost dmap2 = '), write(Plan2), nl,
  write('Sel = '), write(Sel), nl,
  write('Pred = '), write(Pred), nl, nl,
  Sel2 is Sel * NSlotsY,
  cost(Plan2, Sel2, Pred, Result, MemWorker, Size, NAttrs, TSize, PlanCost),
  write('The cost of the inner plan (costs2014) is: '), write(PlanCost), nl,
  NRounds is (NSlotsY // Workers) + 1,
  dmap2C(PerRound),
  CountRounds is ( SizeX / Workers ) + ( SizeY / Workers ),
  CostSumRounds is CountRounds * PerRound,
  Cost is CostX + CostY + NRounds * PlanCost + CostSumRounds.
% Cost of dmap2 used for index access (costs2014): the index is the first
% argument and is not costed, the relation is the second. Only dotdot is
% replaced; the selectivity is used unchanged.
% Cost = argument cost + inner plan cost once per round + the constant
% dmap2C/1 times (tuples per slot / number of workers).
costD(dmap2(_, X, _, Plan, _), Sel, Pred, Result, Memory, MemWorker,
      Workers, Size, NAttrs, TSize, NSlots, Cost) :-
  costD(X, 1, _, _, Memory, MemWorker,
        Workers, SlotSize, ArgNAttrs, ArgTSize, NSlots, ArgCost),
  substituteSubterm(dotdot, dotdot(SlotSize, ArgNAttrs, ArgTSize),
                    Plan, SizedPlan),
  write('Plan cost dmap2 = '), write(SizedPlan), nl,
  write('Sel = '), write(Sel), nl,
  write('Pred = '), write(Pred), nl, nl,
  SlotSel is Sel,
  cost(SizedPlan, SlotSel, Pred, Result, MemWorker,
       Size, NAttrs, TSize, PlanCost),
  write('The cost of the inner plan (costs2014) is: '), write(PlanCost), nl,
  Rounds is (NSlots // Workers) + 1,
  dmap2C(CostPerRound),
  TuplesPerWorker is SlotSize / Workers,
  OperatorCost is TuplesPerWorker * CostPerRound,
  Cost is ArgCost + Rounds * PlanCost + OperatorCost.
/*
7.4 partitionF, collect2 (costs2014)
*/
% Cost of partitionF (costs2014): cost of the argument plus partitionCost/5
% with factor 1. Size, attribute count, tuple size and slot count are taken
% over from the argument.
costD(partitionF(X, _, _, _, _), _, Pred, Result, Memory, MemWorker,
      Workers, SizeX, NAttrs, TSize, NSlots, Cost) :-
  costD(X, 1, Pred, Result, Memory, MemWorker,
        Workers, SizeX, NAttrs, TSize, NSlots, ArgCost),
  partitionCost(Workers, NSlots, SizeX, 1, OpCost),
  Cost is ArgCost + OpCost.

% Cost of collect2 (costs2014): cost of the argument plus collectCost/5 with
% factor 1. Size, attribute count, tuple size and slot count are taken over
% from the argument.
costD(collect2(X, _, _), _, Pred, Result, Memory, MemWorker,
      Workers, SizeX, NAttrs, TSize, NSlots, Cost) :-
  costD(X, 1, Pred, Result, Memory, MemWorker,
        Workers, SizeX, NAttrs, TSize, NSlots, ArgCost),
  collectCost(Workers, NSlots, SizeX, 1, OpCost),
  Cost is ArgCost + OpCost.
/*
7.5 areduce2 (costs2014)
*/
% Cost of areduce2 (costs2014).
% NSlots is an output argument; it was left unbound here, which hands an
% unbound slot count to every caller that does arithmetic on it. The clause
% bases its selectivity scaling and its round count on the slot count of the
% first argument, so that value (NSlotsX) is returned.
% Cost = argument costs + a collectCost/5 and an areduce2DMapCost/4 share for
% each argument (factor 1) + inner plan cost once per round. The inner plan
% is costed with the memory of one worker.
costD(areduce2(X, Y, _, Plan, _), Sel, Pred, Result, Memory, MemWorker,
      Workers, Size, NAttrs, TSize, NSlotsX, Cost) :-
  costD(X, 1, Pred, Result, Memory, MemWorker,
        Workers, SizeX, NAttrsX, TSizeX, NSlotsX, CostX),
  costD(Y, 1, Pred, Result, Memory, MemWorker,
        Workers, SizeY, NAttrsY, TSizeY, NSlotsY, CostY),
  collectCost(Workers, NSlotsX, SizeX, 1, CostX1),
  collectCost(Workers, NSlotsY, SizeY, 1, CostY1),
  areduce2DMapCost(Workers, SizeX, 1, CostRed2X1),
  areduce2DMapCost(Workers, SizeY, 1, CostRed2Y1),
  substituteSubterm(dot, dot(SizeX, NAttrsX, TSizeX), Plan, Plan1),
  substituteSubterm(dotdot, dotdot(SizeY, NAttrsY, TSizeY), Plan1, Plan2),
  write('Plan2 areduce2 = '), write(Plan2), nl, nl,
  Sel2 is Sel * NSlotsX,
  cost(Plan2, Sel2, Pred, Result, MemWorker, Size, NAttrs, TSize, PlanCost),
  NRounds is (NSlotsX // Workers) + 1,
  Cost is CostX + CostY + CostX1 + CostY1
          + CostRed2X1 + CostRed2Y1 + NRounds * PlanCost.
%B.Huber end
/*
8 Combining Sequential and Distributed Operations
The plan created by conjunctive query optimization consists of distributed operations. These are followed by sequential operations such as projection, groupby, etc. This version of plan is not yet correct.
The following predicates transform a mixed distributed and sequential plan into a correct distributed plan closed by sequential operations.
*/
/*
---- transformDPlan(+Plan, -Plan2) :-
----
Transform a preliminary plan ~Plan~ into a ~Plan2~ composed correctly of distributed and sequential operations.
*/
% transformDPlan(+Plan, -Plan2)
%
% Special case: a pure counting query on a relation has already been
% finished by queryToPlan and arrives as a tie(_, _) term; it is returned
% unchanged. The cut commits to this clause so that a later failure cannot
% backtrack into the general clause and try to transform the finished plan
% a second time (which would also print the trace output below).
transformDPlan(Plan, Plan) :-
  Plan = tie(_, _),
  !.

% General case: split the preliminary plan into its distributed and its
% sequential part, then combine both into the final plan. Each intermediate
% result is written to the console as a trace.
transformDPlan(Plan, Plan2) :-
  write('Here is the plan to be transformed: '), nl,
  write(Plan), nl, nl,
  transform2DPlan(Plan, DistributedPlan, SequentialPlan),
  nl, write('The distributed plan is: '), nl,
  nl, write(DistributedPlan), nl,
  nl, write('The sequential plan is: '), nl,
  nl, write(SequentialPlan), nl,
  combinePlans(DistributedPlan, SequentialPlan, Plan2),
  nl, write('The resulting plan is: '), nl,
  nl, write(Plan2), nl, nl, nl.
/*
---- transform2DPlan(+Plan, -DistributedPlan, -SequentialPlan) :-
----
Transform a given preliminary mixed plan ~Plan~ into a distributed and a sequential plan, which still need to be combined. Combining means to embed the distributed plan as an initial part of the sequential one.
The predicate recursively processes the given mixed plan, adding for each operation the respective distributed and/or sequential operations as appropriate.
*/
% consume is a purely sequential operation: it wraps the sequential plan,
% the distributed plan is that of the argument.
transform2DPlan(consume(Inner), Dist, consume(Seq)) :-
  writeln('transform2DPlan 1'),
  transform2DPlan(Inner, Dist, Seq),
  !.

%B.Huber
% A predinfo wrapper is dropped; only its plan argument is transformed.
transform2DPlan(predinfo(Inner, _, _), Dist, Seq) :-
  transform2DPlan(Inner, Dist, Seq),
  !.
%B.Huber end

%B.Huber
% A projection directly on top of a groupby is discarded; the groupby term
% alone is transformed.
transform2DPlan(project(groupby(Inner, GroupAttrs, GroupFields), _),
                Dist, Seq) :-
  transform2DPlan(groupby(Inner, GroupAttrs, GroupFields), Dist, Seq),
  !.
%B.Huber end

% Any other projection is executed on the workers: it is wrapped as a dmap
% around the distributed plan, the sequential plan is left untouched.
transform2DPlan(project(Inner, Attrs),
                dmap(Dist, value_expr(string, ""),
                     project(feed(dot), Attrs)),
                Seq) :-
  transform2DPlan(Inner, Dist, Seq),
  !.
%B.Huber
/*
transform2DPlanGroupBy(+Plan, -DistributedPlan, -SequentialPlan) :-
Aggregate functions combined with groupby require special transformation
rules.
When the variable Plan holds a groupby command, transform2DPlanGroupBy
transforms it into the special form that is needed for distributed databases.
*/
% transform2DPlanGroupBy(+Fields, -FieldsDist, -FieldsSeq)
%
% Splits the aggregate fields of a groupby into the fields evaluated on the
% workers (FieldsDist) and the fields that combine the partial results on
% the master (FieldsSeq). Fields is either a single field(Attr, Aggregate)
% term or a list of such terms; both results are always lists.
% The predicate fails for an aggregate it has no rule for, so that the
% caller can fall back to grouping all data on the master.
%
% All attr/3 terms placed into FieldsSeq are fully instantiated. Unbound
% variables must not occur in the sequential plan: substituteSubterm/4
% unifies any variable it meets with the term to be replaced, so a free
% variable would be replaced by the whole distributed plan when the plans
% are combined.

% No fields: nothing to evaluate on either side.
transform2DPlanGroupBy([], [], []) :- !.

% count: count per worker, then add up the partial counts.
transform2DPlanGroupBy(field(attr(Attr, Number, Type), count(feed(group))),
                       GroupAttrsDist, GroupAttrsSeq) :-
  GroupAttrsDist = [field(attr(Attr, Number, Type), count(feed(group)))],
  GroupAttrsSeq  = [field(attr(Attr, Number, Type),
                      sum(feed(group), attrname(attr(Attr, Number, Type))))],
  !.

% avg: the workers deliver a partial sum and a partial count under two new
% attribute names; the master combines them under the requested name.
% NOTE(review): the sequential side uses a three-argument sum term taken
% over unchanged from the original code - confirm that plan_to_atom
% translates it into "sum of sums divided by sum of counts".
transform2DPlanGroupBy(field(attr(Synonym, Number, Type),
                             avg(feed(group), attrname(TableAttr))),
                       GroupAttrsDist, GroupAttrsSeq) :-
  newVariable(VarNew1),
  newVariable(VarNew2),
  GroupAttrsDist = [field(attr(VarNew1, Number, Type),
                      sum(feed(group), attrname(TableAttr))),
                    field(attr(VarNew2, Number, Type), count(feed(group)))],
  GroupAttrsSeq  = [field(attr(Synonym, Number, Type),
                      sum(feed(group),
                          attrname(attr(VarNew1, Number, Type)),
                          attrname(attr(VarNew2, Number, Type))))],
  !.

% sum: sum per worker, then add up the partial sums.
transform2DPlanGroupBy(field(attr(Attr, Number, Type),
                             sum(feed(group), attrname(TableAttr))),
                       GroupAttrsDist, GroupAttrsSeq) :-
  GroupAttrsDist = [field(attr(Attr, Number, Type),
                      sum(feed(group), attrname(TableAttr)))],
  GroupAttrsSeq  = [field(attr(Attr, Number, Type),
                      sum(feed(group), attrname(attr(Attr, Number, Type))))],
  !.

% min: minimum per worker, then the minimum of the partial minima.
transform2DPlanGroupBy(field(attr(Attr, Number, Type),
                             min(feed(group), attrname(TableAttr))),
                       GroupAttrsDist, GroupAttrsSeq) :-
  GroupAttrsDist = [field(attr(Attr, Number, Type),
                      min(feed(group), attrname(TableAttr)))],
  GroupAttrsSeq  = [field(attr(Attr, Number, Type),
                      min(feed(group), attrname(attr(Attr, Number, Type))))],
  !.

% max: maximum per worker, then the maximum of the partial maxima.
transform2DPlanGroupBy(field(attr(Attr, Number, Type),
                             max(feed(group), attrname(TableAttr))),
                       GroupAttrsDist, GroupAttrsSeq) :-
  GroupAttrsDist = [field(attr(Attr, Number, Type),
                      max(feed(group), attrname(TableAttr)))],
  GroupAttrsSeq  = [field(attr(Attr, Number, Type),
                      max(feed(group), attrname(attr(Attr, Number, Type))))],
  !.

% A one-element list yields exactly the result of its element.
transform2DPlanGroupBy([Head], GroupAttrsDist, GroupAttrsSeq) :-
  transform2DPlanGroupBy(Head, GroupAttrsDist, GroupAttrsSeq),
  !.

% Longer lists: transform head and rest, concatenate the results.
transform2DPlanGroupBy([Head | Rest], GroupAttrsDist, GroupAttrsSeq) :-
  transform2DPlanGroupBy(Head, GroupAttrsDistHead, GroupAttrsSeqHead),
  transform2DPlanGroupBy(Rest, GroupAttrsDistRest, GroupAttrsSeqRest),
  append(GroupAttrsDistHead, GroupAttrsDistRest, GroupAttrsDist),
  append(GroupAttrsSeqHead, GroupAttrsSeqRest, GroupAttrsSeq),
  !.
% transform2DPlanGroupBy(+Plan, +GroupAttrs, +Fields, -DistPlan, -SeqPlan)
%
% Handles a groupby whose argument is a sortby over a dmap (optionally
% wrapped in predinfo). The grouping is done twice: once inside the dmap on
% every worker (with the distributed variants of the aggregate fields) and
% once on the master (with the combining variants). Fails if
% transform2DPlanGroupBy/3 has no rule for one of the fields.

% A predinfo wrapper around the dmap is dropped.
transform2DPlanGroupBy(
    sortby(predinfo(dmap(Relation, Expr, FeedExpr), _, _), SortAttrs),
    GroupAttrs, Fields,
    DistPlan, SeqPlan) :-
  transform2DPlanGroupBy(sortby(dmap(Relation, Expr, FeedExpr), SortAttrs),
                         GroupAttrs, Fields,
                         DistPlan, SeqPlan),
  !.

% The input of the sequential sortby is the placeholder seqstart, which
% combinePlans/3 later replaces by the (summarized) distributed plan. It is
% written out explicitly instead of relying on an unbound variable being
% unified with seqstart during substitution.
transform2DPlanGroupBy(sortby(dmap(Relation, Expr, FeedExpr), SortAttrs),
                       GroupAttrs, Fields, DistPlan, SeqPlan) :-
  transform2DPlanGroupBy(Fields, FieldsDist, FieldsSeq),
  DistPlan = dmap(Relation, Expr,
                  groupby(sortby(FeedExpr, SortAttrs),
                          GroupAttrs, FieldsDist)),
  SeqPlan = groupby(sortby(seqstart, SortAttrs), GroupAttrs, FieldsSeq),
  !.
% groupby, first attempt: hand over to transform2DPlanGroupBy/5, which
% distributes the aggregation if it has rules for all aggregate fields.
transform2DPlan(groupby(Inner, GroupAttrs, Fields), Dist, Seq) :-
  transform2DPlanGroupBy(Inner, GroupAttrs, Fields, Dist, Seq),
  !.

% groupby, default: used when no distribution rule was found above. All
% data is transferred to the master and the grouping is done there, i.e.
% the groupby is simply put on top of the sequential plan.
transform2DPlan(groupby(Inner, GroupAttrs, Fields), Dist,
                groupby(Seq, GroupAttrs, Fields)) :-
  transform2DPlan(Inner, Dist, Seq),
  !.
%B.Huber end
% extend is executed on the workers only.
transform2DPlan(extend(Inner, NewAttrs),
                dmap(Dist, value_expr(string, ""),
                     extend(feed(dot), NewAttrs)),
                Seq) :-
  transform2DPlan(Inner, Dist, Seq),
  !.

% head over sortby: the first N sorted tuples are determined on every
% worker and once more on the master.
transform2DPlan(head(sortby(Inner, Args), N),
                dmap(Dist, value_expr(string, ""),
                     head(sortby(feed(dot), Args), N)),
                head(sortby(Seq, Args), N)) :-
  transform2DPlan(Inner, Dist, Seq),
  !.

% plain head: applied on every worker and once more on the master.
transform2DPlan(head(Inner, N),
                dmap(Dist, value_expr(string, ""), head(feed(dot), N)),
                head(Seq, N)) :-
  transform2DPlan(Inner, Dist, Seq),
  !.

% sortby alone is done on the master only.
transform2DPlan(sortby(Inner, Args), Dist, sortby(Seq, Args)) :-
  transform2DPlan(Inner, Dist, Seq),
  !.

% count directly on top of the distributed part (the sequential plan of the
% argument is just the placeholder seqstart): count on every worker and add
% up the partial counts with tie.
transform2DPlan(count(Inner),
                dmap(Dist, value_expr(string, ""), count(feed(dot))),
                tie(getValue(seqstart), dot + dotdot)) :-
  transform2DPlan(Inner, Dist, seqstart),
  !.

% count on top of other sequential operations: count on the master.
transform2DPlan(count(Inner), Dist, count(Seq)) :-
  transform2DPlan(Inner, Dist, Seq),
  !.

% Base case: a distributed operation is the distributed plan itself; the
% sequential plan is the placeholder seqstart. (dmap over a dbotherobject
% needs no alternative of its own, it is covered by dmap(_, _, _).)
transform2DPlan(DistOp, DistOp, seqstart) :-
  (   DistOp = dmap(_, _, _)
  ;   DistOp = dmap2(_, _, _, _, _)
  ;   DistOp = dproduct(_, _, _, _, _)
  ;   DistOp = areduce2(_, _, _, _, _)
  ),
  !.
/*
---- combinePlans(+DistributedPlan, +SequentialPlan, -Plan) :-
----
Embed the distributed into the sequential plan including dsummarize if appropriate. Also merge distributed operations whenever possible.
*/
% Counting query closed by tie: the merged distributed plan replaces the
% placeholder seqstart directly, without dsummarize.
combinePlans(DistributedPlan, tie(First, Second), Plan) :-
  !,
  mergeDmaps(DistributedPlan, MergedPlan),
  substituteSubterm(seqstart, MergedPlan, tie(First, Second), Plan).

% General case: the merged distributed plan is wrapped in dsummarize before
% it replaces seqstart. Both inputs and the result are written as a trace.
combinePlans(DistributedPlan, SequentialPlan, Plan) :-
  nl, write('DistributedPlan = '), write(DistributedPlan), nl,
  nl, write('SequentialPlan = '), write(SequentialPlan), nl,
  mergeDmaps(DistributedPlan, MergedPlan),
  substituteSubterm(seqstart, dsummarize(MergedPlan), SequentialPlan, Plan),
  nl, write('Plan = '), write(Plan), nl.
/*
9 Merging Distributed Operations
Adjacent dmap operations can be merged. To be extended for ~dmap2~, ~areduce~, ~partitionF~.
---- mergeDmaps(+Plan, -Plan2)
----
Descends into terms and merges dmaps if possible.
*/
% mergeDmaps(+Plan, -Plan2)
%
% Clauses are tried in textual order; every clause except the very last one
% ends in a cut. A clause "merges" two operations by substituting the
% function of the inner operation for feed(dot) (or feed(dotdot)) in the
% function of the outer operation. Merged operations get the empty string
% value_expr(string, "") as result name. If no merging clause applies, the
% final clauses return the plan unchanged.

% final dmaps
% following dmap2
% The argument X is merged recursively first. If that yields a dmap2, the
% function of the outer dmap is folded into the dmap2.
mergeDmaps(
dmap(X, _, OuterPlan),
dmap2(X1, Y1, value_expr(string, ""), Plan, FS))
:-
% write('second dmap, 1'), nl,
mergeDmaps(X, dmap2(X1, Y1, _, InnerPlan, FS)),
% write('second dmap, 2'), nl,
substituteSubterm(feed(dot), InnerPlan, OuterPlan, Plan),
% write('second dmap, 3'), nl,
!.
% following dmap
% As above, but the recursively merged argument is a dmap: both dmaps
% become a single dmap on the innermost argument X1.
mergeDmaps(
dmap(X, _, OuterPlan),
dmap(X1, value_expr(string, ""), Plan))
:-
% write('first dmap, 1'), write(' '), write('X = '), write(X), nl,
mergeDmaps(X, dmap(X1, _, InnerPlan)),
% write('first dmap, 2'), nl, write(' '), write('X1 = '), write(X1), nl,
substituteSubterm(feed(dot), InnerPlan, OuterPlan, Plan),
% write('first dmap, 3'), nl,
!.
% dmap2, following dmap
% dmap2 for index access
% The dmap2 has a dbdistindexobject as first argument; its arguments are
% taken over as they are (no recursive merge).
mergeDmaps(
dmap(dmap2(dbdistindexobject(Index), Rel, _, InnerPlan, FileServer), _,
OuterPlan),
dmap2(dbdistindexobject(Index), Rel, value_expr(string, ""), Plan,
FileServer)) :-
substituteSubterm(feed(dot), InnerPlan, OuterPlan, Plan),
!.
% dmap2 for joins
% The inner dmap2 is merged recursively before the outer dmap is folded in.
mergeDmaps(
dmap(dmap2(X, Y, _, InnerPlan, FileServer), _, OuterPlan),
dmap2(X1, Y1, value_expr(string, ""), Plan, FileServer)) :-
mergeDmaps(
dmap2(X, Y, _, InnerPlan, FileServer),
dmap2(X1, Y1, _, InnerPlan1, FileServer)),
substituteSubterm(feed(dot), InnerPlan1, OuterPlan, Plan),
!.
% dmap2, preceding dmaps and other operations
% dmap2 for index access
% Only the result name is reset; arguments and function stay unchanged.
mergeDmaps(
dmap2(dbdistindexobject(Index), Rel, _, Plan, FileServer),
dmap2(dbdistindexobject(Index), Rel, value_expr(string, ""), Plan,
FileServer)) :-
!.
% dmap2 for joins. Each argument can be dmap or something else, e.g. collect2
% or simply an argument.
% Both arguments are merged recursively. If the merged first argument is a
% dmap, its function replaces feed(dot) in the dmap2 function. If the merged
% second argument is a dmap, feed(dot) inside its function is first renamed
% to feed(dotdot), and the result then replaces feed(dotdot) in the dmap2
% function. An argument that is not a dmap is kept as the merged term.
mergeDmaps(
dmap2(X, Y, _, OuterPlan, FileServer),
dmap2(XArg, YArg, value_expr(string, ""), Plan2, FileServer) )
:-
mergeDmaps(X, XPlan),
( XPlan = dmap(X1, _, InnerPlanX)
-> substituteSubterm(feed(dot), InnerPlanX, OuterPlan, Plan1),
XArg = X1
; XArg = XPlan, Plan1 = OuterPlan
),
mergeDmaps(Y, YPlan),
( YPlan = dmap(Y1, _, InnerPlanY)
-> substituteSubterm(feed(dot), feed(dotdot), InnerPlanY, InnerPlanY2),
substituteSubterm(feed(dotdot), InnerPlanY2, Plan1, Plan2),
YArg = Y1
; YArg = YPlan, Plan2 = Plan1
),
!.
% dproduct, preceding dmaps
% Only a dmap on the first argument is folded into the dproduct function;
% the second argument is merged recursively and kept as an argument.
mergeDmaps(
dproduct(X, Y, _, OuterPlan, FileServer),
dproduct(X1, Z, value_expr(string, ""), Plan, FileServer)) :-
mergeDmaps(X, dmap(X1, _, InnerPlan)),
mergeDmaps(Y, Z),
substituteSubterm(feed(dot), InnerPlan, OuterPlan, Plan),
!.
% dproduct, following dmaps
% The inner dproduct is merged recursively, then the outer dmap function is
% folded into it.
mergeDmaps(
dmap(dproduct(X, Y, _, InnerPlan, FileServer), _, OuterPlan),
dproduct(X1, Y1, value_expr(string, ""), Plan, FileServer)) :-
mergeDmaps(
dproduct(X, Y, _, InnerPlan, FileServer),
dproduct(X1, Y1, _, InnerPlan1, FileServer)),
substituteSubterm(feed(dot), InnerPlan1, OuterPlan, Plan),
!.
% areduce2 + partitionF, preceding dmaps
% Applies only if both partitionF arguments carry the same Func and FS
% terms. Each partitionF is merged recursively; the areduce2 function
% Outerplan itself is not changed. Progress is written as a trace.
% NOTE(review): the recursive call for Y passes value_expr(string, "") as
% input name, whereas the call for X passes an unbound name - confirm that
% this asymmetry is intended.
mergeDmaps(
areduce2(
partitionF(X, _, InnerPlanX, Func, FS),
partitionF(Y, _, InnerPlanY, Func, FS),
_, Outerplan, N),
areduce2(
partitionF(X1, value_expr(string, ""), InnerPlanX1, Func, FS),
partitionF(Y1, value_expr(string, ""), InnerPlanY1, Func, FS),
value_expr(string, ""), Outerplan, N)) :-
write('areduce2 0'), nl,
mergeDmaps(
partitionF(X, _, InnerPlanX, Func, FS),
partitionF(X1, _, InnerPlanX1, Func, FS)),
write('areduce2 1'), nl,
mergeDmaps(
partitionF(Y, value_expr(string, ""), InnerPlanY, Func, FS),
partitionF(Y1, value_expr(string, ""), InnerPlanY1, Func, FS)),
write('areduce2 2'), nl,
!.
% areduce, following dmaps
% Applies only if dmap and areduce2 carry the same name term S. The
% areduce2 is merged recursively, then the dmap function is folded in.
mergeDmaps(
dmap(areduce2(X, Y, S, InnerPlan, FS), S, OuterPlan),
areduce2(X1, Y1, S, Plan, FS))
:-
mergeDmaps(
areduce2(X, Y, S, InnerPlan, FS),
areduce2(X1, Y1, S, InnerPlan1, FS)),
substituteSubterm(feed(dot), InnerPlan1, OuterPlan, Plan),
!.
% partitionF, preceding dmaps
% Applies only if partitionF and its dmap argument carry the same name term
% S. The dmap is merged recursively and its function replaces feed(dot) in
% the partitionF function.
mergeDmaps(
partitionF(dmap(X, S, InnerPlan), S, OuterPlan, Func, FS),
partitionF(X1, S, Plan, Func, FS))
:-
mergeDmaps(dmap(X, S, InnerPlan), dmap(X1, S, InnerPlan1)),
substituteSubterm(feed(dot), InnerPlan1, OuterPlan, Plan),
!.
% collect2
% The argument is merged recursively; the result name is reset.
mergeDmaps(
collect2(X, _, N),
collect2(X1, value_expr(string, ""), N)) :-
mergeDmaps(X, X1),
!.
% Operations returned unchanged, committing with a cut.
mergeDmaps(Plan, Plan) :-
( Plan = dmap(dbotherobject(_), _, _)
; Plan = dmap2(_, _, _, _, _) % this case to be improved
; Plan = dproduct(_, _, _, _, _)
),
!.
% Catch-all: any other term is returned unchanged.
mergeDmaps(Plan, Plan).
/*
Yet to be done:
* Merge a preceding dmap on first or second argument into dmap2.
* Merge a preceding dmap on first argument only (!) into dproduct (because second argument is moved before execution starts and could be reduced in dmap before movement).
*/
/*
10 Check for Distributed Queries
Checks if all relations are distributed. Currently the
optimizer can only handle queries including relations, that
are all local or distributed. Situations with mixed
relation types will be discarded.
*/
% checkDistributedQuery
%
% Succeeds if exactly one of the flags isDistributedQuery / isLocalQuery is
% set, i.e. the query uses only distributed or only local relations.
% Otherwise (both flags or none set) an error message is written and the
% predicate fails.
checkDistributedQuery :-
  ( isDistributedQuery -> Distributed = yes ; Distributed = no ),
  ( isLocalQuery       -> Local = yes       ; Local = no ),
  (   Distributed \== Local
  ->  true
  ;   write('Error in query: not all relations distributed '),
      fail
  ).
/*
11 Check the Spelling of Non-Relation Objects
*/
% spelledObj(+Term, -Obj, -Type, -Case)
%
% Looks up the object whose lower-case name equals the lower-case form of
% Term in objectCatalog/3. Case is l if the catalog entry is wrapped in
% lc(...), otherwise u. Fails if there is no catalog entry.

% Catalog entry of the form lc(Obj).
spelledObj(Term, Obj, Type, l) :-
  downcase_atom(Term, Key),
  objectCatalog(Key, Entry, Type),
  Entry = lc(Obj),
  !.

% Any other catalog entry is returned as it is.
spelledObj(Term, Obj, Type, u) :-
  downcase_atom(Term, Key),
  objectCatalog(Key, Obj, Type),
  !.

% No entry: fail without leaving a choice point.
spelledObj(_, _, _, _) :-
  !,
  fail.
/*
12 Auxiliary Predicates
*/
%fapra 15/16
% destructureQuery(?Query, ?Select, ?Rel, ?Pred)
% Relates a query of the form "Select from Rel where Pred" to its three
% components.
destructureQuery(Query, Select, Rel, Pred) :-
  Query = (Select from Rel where Pred).
% attrValueEqualityPredicate(?Pred, ?Value, ?Attr, ?Rel)
%
% Pred is a term pr(Left = Right, Rel) in which one side of the equation is
% an attr/3 term (Attr) and the other side is Value. The attribute may
% stand on either side. If both sides are attr/3 terms, both readings are
% delivered on backtracking.

% Attribute on the right-hand side.
attrValueEqualityPredicate(pr(Value = Attr, Rel), Value, Attr, Rel) :-
  Attr = attr(_, _, _).

% Attribute on the left-hand side.
attrValueEqualityPredicate(pr(Attr = Value, Rel), Value, Attr, Rel) :-
  Attr = attr(_, _, _).
/*
---- substituteSubterm(Substituted, Substitute, OriginalTerm, TermWithSubstitution)
----
Replacing every occurrence of ~Substituted~ in ~OriginalTerm~ by ~Substitute~ yields ~TermWithSubstitution~. We have a cut in every clause to remove unnecessary choice points
during the search for planedges, which is driven by meta predicates.
*/
% substituteSubterm(+Old, +New, +Term, -Result)
%
% Result is Term with every subterm that unifies with Old replaced by New.
% Matching is done by unification, so an unbound variable inside Term also
% counts as a match (and is bound to Old in the process).

% Term itself matches: the result is the replacement.
substituteSubterm(Old, New, Old, New) :- !.

% Term has no arguments and does not match: it is kept.
substituteSubterm(Old, _, Leaf, Leaf) :-
  functor(Leaf, _, 0),
  Leaf \= Old,
  !.

% Term is compound and does not match as a whole: build a term with the
% same functor and fill in its arguments one by one.
substituteSubterm(Old, New, Term, Result) :-
  functor(Term, Name, Arity),
  functor(Result, Name, Arity),
  substituteSubtermInNthSubterm(Arity, Old, New, Term, Result),
  !.

% substituteSubtermInNthSubterm(+N, +Old, +New, +Term, ?Result)
%
% Processes the arguments N, N-1, ..., 1 of Term in this order and stores
% the substituted arguments at the same positions of Result. N = 0 means
% that all arguments have been processed.
substituteSubtermInNthSubterm(N, Old, New, Term, Result) :-
  (   N = 0
  ->  true
  ;   arg(N, Term, ArgIn),
      substituteSubterm(Old, New, ArgIn, ArgOut),
      arg(N, Result, ArgOut),
      Prev is N - 1,
      substituteSubtermInNthSubterm(Prev, Old, New, Term, Result)
  ),
  !.
/*
Rename an attribute to match the renaming of its relation.
*/
% renamedRelAttr(+RelAttr, +Var, -RenamedAttr)
%
% With the variable * the attribute is returned unchanged. Otherwise the
% name of the attr/3 term is qualified as Var:Name.
renamedRelAttr(RelAttr, (*), RelAttr) :-
  !.
renamedRelAttr(attr(Name, N, C), Var, attr(Var:Name, N, C)).
% renamedRelAttr2(+Arg, +RelAttr, +Var, -RenamedAttr)
%
% The attribute is returned unchanged if the variable is * or if Arg is a
% term arg(_). Otherwise the name of the attr/3 or attr2/3 term is
% qualified as Var:Name.
renamedRelAttr2(_, RelAttr, (*), RelAttr) :-
  !.
renamedRelAttr2(arg(_), RelAttr, _, RelAttr) :-
  !.
renamedRelAttr2(_, attr(Name, N, C), Var, attr(Var:Name, N, C)).
renamedRelAttr2(_, attr2(Name, N, C), Var, attr2(Var:Name, N, C)).
% attrnameDCAtom(+Attr, -DCAttrName)
%
% DCAttrName is the lower-case atom of the attribute name of an attr/3
% term. If the name is qualified (Var:Name), only the part after the colon
% is used.
attrnameDCAtom(attr(QualifiedName, _, _), DCAttrName) :-
  (   QualifiedName = _:Name
  ->  true
  ;   Name = QualifiedName
  ),
  atom_string(NameAtom, Name),
  downcase_atom(NameAtom, DCAttrName).
% unrenamedAttr(+Attr, -PlainAttr)
%
% Removes a qualifier Var: from the name of an attr/3 term; an attribute
% without qualifier is returned unchanged.
unrenamedAttr(attr(MaybeQualified, N, C), attr(Name, N, C)) :-
  (   MaybeQualified = _:Name
  ->  true
  ;   Name = MaybeQualified
  ).
%B.Huber
% splitRelationGetVar(+RelTerm, -Rel, -Var)
% Decomposes a term rel(Rel, Var) into the relation and its variable.
splitRelationGetVar(rel(Rel, Var), Rel, Var).
%B.Huber end
/*
Rename a tuple stream.
*/
% renameStream(+Stream, +Var, -Plan)
%
% With the variable * the stream is used as it is; otherwise it is wrapped
% into rename(Stream, Var).
renameStream(Stream, Var, Plan) :-
  (   Var = (*)
  ->  Plan = Stream
  ;   Plan = rename(Stream, Var)
  ).
/*
Transform a relation to a tuple stream and rename it.
*/
% feedRenameRelation(+Rel, +Var, -Plan)
%
% Plan feeds the relation into a tuple stream. With the variable * this is
% feed(Rel); otherwise the stream is additionally renamed with Var.
feedRenameRelation(Rel, Var, Plan) :-
  (   Var = (*)
  ->  Plan = feed(Rel)
  ;   Plan = rename(feed(Rel), Var)
  ).

% feedRenameRelation(+RelTerm, -Plan)
% The same for a term rel(Rel, Var).
feedRenameRelation(rel(Rel, Var), Plan) :-
  once(feedRenameRelation(Rel, Var, Plan)).
% feedRenameRelation2(+Arg, +Rel, +Var, -Plan)
%
% The stream is renamed only if Arg is a term arg(_) and the variable is
% not *. In every other case the plan is feed(Rel).
feedRenameRelation2(arg(_), Rel, Var, rename(feed(Rel), Var)) :-
  Var \= (*),
  !.
feedRenameRelation2(_, Rel, _, feed(Rel)).
/*
12 Extensions to File ~database.pl~
12.1 Auxiliary Predicates
*/
[library(apply)].
:-
dynamic(isDistributedQuery/0),
dynamic(isLocalQuery/0).
/*
Strip a string off its opening and closing quote.
*/
% stringWithoutQuotes(+Str, -StrQuoteless)
%
% If the text of Str starts and ends with a double quote, StrQuoteless is
% the text between these quotes (as a string). Any other value that is not
% a string object (e.g. a number or an unquoted atom) is returned
% unchanged.
%
% The first clause commits once the quotes have been stripped. Without the
% cut the second clause delivered the unstripped value as a further
% solution on backtracking; under maplist(maplist(stringWithoutQuotes), ...)
% a later failure then re-tried every combination of stripped and
% unstripped values.
stringWithoutQuotes(Str, StrQuoteless) :-
  % string_to_atom(Str, StrAtom),
  atom_string(StrAtom, Str),
  % cut off the closing quote ...
  string_concat(X, '\"', StrAtom),
  % string_to_atom(X, XAtom),
  atom_string(XAtom, X),
  % ... and the opening quote
  string_concat('\"', StrQuoteless, XAtom),
  !.
stringWithoutQuotes(Str, Str) :-
  not(string(Str)), !.
/*
Removes the suffix '\_d' from ~DRel~ indicating a distributed relation. If the
relation is not listed in SEC2DISTRIBUTED the unchanged name is returned in
Variable ~ORel~
*/
% removeDistributedSuffix(+DRel, -ORel)

% A renamed relation "DRel as Var": only the relation part is examined.
removeDistributedSuffix(DRel as _, ORel) :-
  removeDistributedSuffix(DRel, ORel),
  !.

% The name ends in '_d' and the name without this suffix is a distributed
% relation: return the shortened name and flag the query as distributed.
removeDistributedSuffix(DRel, ORel) :-
  atom(DRel),
  atom_concat(BaseName, '_d', DRel),
  atom_string(ORel, BaseName),
  isDistributedRelation(rel(ORel, _)),
  !,
  assertOnce(isDistributedQuery).

% Everything else is returned unchanged and flags the query as local.
removeDistributedSuffix(Rel, Rel) :-
  !,
  assertOnce(isLocalQuery).
/*
Ensure to assert a fact only once.
*/
% assertOnce(+Fact)
% Asserts Fact unless it can already be proven; always succeeds.
assertOnce(Fact) :-
  (   \+ Fact
  ->  assert(Fact)
  ;   true
  ).
/*
12.2 Creating a list of database objects.
We assume that the object name starts with a capital
letter. If not, an lc() functor indicates that the
initial letter is written in lower case. The rest of the
identifier is written in mixed case.
*/
:-
dynamic(storedObject/3).
% objectCatalog(+DcObj, -Obj, -Type)
%
% Looks up the database object with lower-case name DcObj. Results are
% cached in storedObject/3.

% Already cached.
objectCatalog(DcObj, Obj, Type) :-
  storedObject(DcObj, Obj, Type),
  !.

% Not cached, the object's name passes is_lowerfl/1: the entry is wrapped
% as lc(Name).
objectCatalog(DcObj, LcObj, Type) :-
  getSecondoList(Objects),
  member(['OBJECT', Name, _, [Type|_]], Objects),
  downcase_atom(Name, DcObj),
  is_lowerfl(Name),
  LcObj = lc(Name),
  assert(storedObject(DcObj, LcObj, Type)),
  !.

% Not cached, the object's name does not pass is_lowerfl/1: the entry is
% the name converted by lowerfl/2.
objectCatalog(DcObj, FlObj, Type) :-
  getSecondoList(Objects),
  member(['OBJECT', Name, _, [Type|_]], Objects),
  downcase_atom(Name, DcObj),
  \+ is_lowerfl(Name),
  lowerfl(Name, FlObj),
  assert(storedObject(DcObj, FlObj, Type)),
  !.
/*
12.3 Reading the catalogue of distributed relations
Get meta information about the distributed relations in this database.
Use the distributedRels/7 predicate in conjunction with isDistributedQuery
to cover special cases for distributed queries.
*/
:-
dynamic(storedDistributedRelation/7),
dynamic(onlineWorker/3).
% distributedRels(Rel, Obj, DistObjType, NSlots, PartType, DistAttr) :-
% distributedRels(Rel, Obj, DistObjType, NSlots, PartType, DistAttr, _).
% distributedRels(?RelTerm, ?ObjName, ?DistObjType, ?NSlots,
%                 ?PartType, ?DistAttr, ?DistParam)
%
% Access to the cached catalog of distributed relations
% (storedDistributedRelation/7). If the cache is empty, it is filled first
% by queryDistributedRels/0; the predicate fails if that fails.
% If the variable part of rel(Rel, Var) is instantiated, it is ignored in
% the lookup; otherwise it is unified with the stored one.
distributedRels(rel(Rel, Var), ObjName, DistObjType, NSlots,
                PartType, DistAttr, DistParam) :-
  (   storedDistributedRelation(_, _, _, _, _, _, _)
  ->  true
  ;   queryDistributedRels
  ),
  (   ground(Var)
  ->  % first argument instantiated - but do not match against Var
      storedDistributedRelation(rel(Rel, _), ObjName,
          DistObjType, NSlots, PartType, DistAttr, DistParam)
  ;   storedDistributedRelation(rel(Rel, Var), ObjName,
          DistObjType, NSlots, PartType, DistAttr, DistParam)
  ).
% isDistributedRelation(+RelTerm)
% Succeeds (once) if the relation of rel(Rel, _) has an entry in the
% catalog of distributed relations.
isDistributedRelation(rel(Rel, _)) :-
  once(distributedRels(rel(Rel, '*'), _, _, _, _, _, _)).
/*
Read the values from SEC2DISTRIBUTED relation and store it to a
dynamic predicate.
Values of string attributes are passed to us as atoms with an
opening and closing quote and have to be stripped off these.
*/
% storeDistributedRels(+Tuples)
% Stores every tuple of the list with storeDistributedRel/7. Each tuple
% must be a list of exactly seven values.
storeDistributedRels([]).
storeDistributedRels([Tuple | Tuples]) :-
  Tuple = [RelName, ObjName, DistObjType, NSlots,
           PartType, DistAttr, DistParam],
  storeDistributedRel(RelName, ObjName, DistObjType, NSlots,
                      PartType, DistAttr, DistParam),
  storeDistributedRels(Tuples).
% storeDistributedRel(+RelName, +ObjName, +DistObjType, +NSlots,
%                     +PartType, +DistAttr, +DistParam)
%
% Adds one entry to storedDistributedRelation/7. Relation and object name
% are stored in lower case; the relation is stored as rel(Name, '*') and
% the object as dbotherobject(Name).
storeDistributedRel(RelName, ObjName, DistObjType, NSlots,
                    PartType, DistAttr, DistParam) :-
  downcase_atom(RelName, DCRelName),
  downcase_atom(ObjName, DCObjName),
  assert(storedDistributedRelation(rel(DCRelName, '*'),
                                   dbotherobject(DCObjName),
                                   DistObjType, NSlots, PartType,
                                   DistAttr, DistParam)),
  !.

% Catch-all: an entry that cannot be stored is skipped. This clause must
% have the same arity (7) as the clause above to act as its fallback; it
% was previously written with five arguments and so defined an unrelated
% predicate.
storeDistributedRel(_, _, _, _, _, _, _) :- !.
% spelledDistributedRel(+Rel, -Rel2, -Case)
%
% Delegates to spelled/3. If the relation is not known there, a warning is
% printed in red and the predicate fails.
%
% The if-then-else commits to the first answer of spelled/3. In the former
% formulation "spelled(...) ; (warning, fail), !" the cut belonged to the
% second branch only (',' binds tighter than ';') and stood behind fail, so
% it was never executed; a later failure then backtracked into the second
% branch and printed the warning for a relation that does exist.
spelledDistributedRel(Rel, Rel2, Case) :-
  (   spelled(Rel, Rel2, Case)
  ->  true
  ;   ansi_format([fg(red)], 'Warning: listed object "~w" in SEC2DISTRIBUTED \c
          relation does not exist => ignored for further processing \n',[Rel]),
      fail
  ).
/*
The availability of the workers related to the distributed relations used
in the current query needs to be checked before creating an execution plan.
We have to distinguish between the types of distribution. Replicated
objects and relations are shared with all workers that are available at
distribution time. Therefore it is not possible to trace back which workers
were involved at that moment.
Queries on shared relations can be executed even if not the complete set of
workers is online; for other distribution types all workers are necessary.
To provide a possibility to test distributed queries without
executing them on the workers, it is possible to disable the connectivity
check by setting the fact 'disableWorkerCheck'
*/
:- dynamic(disableWorkerCheck/0).
%:- assert(disableWorkerCheck).
% checkOnlineWorkers
%
% Checks all workers listed in the relation SEC2WORKERS with
% checkOnlineWorker/1. Succeeds immediately if the fact disableWorkerCheck
% is set.
checkOnlineWorkers :-
  (   disableWorkerCheck
  ->  true
  ;   secondo('query SEC2WORKERS', [_, WorkerTuples]),
      !,
      maplist(maplist(stringWithoutQuotes), WorkerTuples, Workers),
      checkOnlineWorker(Workers),
      !
  ).
% checkOnlineWorkers(+ObjName, +PartType)
%
% Checks the workers listed in the value of the object ObjName (expected
% to be a d(f)array).

% The check is switched off.
checkOnlineWorkers(_, _) :-
  disableWorkerCheck,
  !.

% Objects with partition type 'share' are accepted without a check.
checkOnlineWorkers(_, 'share').

% Otherwise query the object and check every worker in its worker list.
checkOnlineWorkers(ObjName, _) :-
  atom_concat('query ', ObjName, Query),
  secondo(Query, [_, [_, _, Workers]]),
  checkOnlineWorker(Workers),
  !.
% checkOnlineWorker(+Workers)
%
% Workers is a list of [Host, Port, Config] triples. A worker already
% recorded in onlineWorker/3 (same host and port) is accepted. For any
% other worker a Secondo "connect" query is issued; on success the worker
% is recorded, otherwise cancelOnlineWorkerCheck/3 reports it and the whole
% check fails.
checkOnlineWorker([]).
checkOnlineWorker([[Host, Port, _] | Others]) :-
  onlineWorker(Host, Port, _),
  !,
  checkOnlineWorker(Others).
checkOnlineWorker([[Host, Port, Config] | Others]) :-
  format(atom(ConnectQuery), 'query connect("~w",~w,"~w")',
         [Host, Port, Config]),
  secondo(ConnectQuery, [bool, Connected]),
  !,
  (   Connected == true
  ->  assert(onlineWorker(Host, Port, Config))
  ;   cancelOnlineWorkerCheck(Host, Port, Config)
  ),
  !,
  checkOnlineWorker(Others).
% cancelOnlineWorkerCheck(+Host, +Port, +Config)
% Called for a worker that could not be connected: prints a warning in red
% and always fails, which aborts the worker check.
cancelOnlineWorkerCheck(Host, Port, Config) :-
  ansi_format([fg(red)], 'Warning: connection to server \c
      host: "~w", port: ~w, config: "~w" failed \n', [Host,Port,Config]),
  fail.
/*
The system- relation SEC2DISTRIBUTED contains information about
distributed relations in the opened database.
SEC2WORKERS is another necessary relation when using distributed
queries. It contains the available workers in the system.
If necessary the two relations will be created without content.
*/
% queryDistributedRels
%
% Rebuilds the cache storedDistributedRelation/7: the old entries are
% removed, the system relations are created if missing
% (distributedRelsAvailable/0), SEC2DISTRIBUTED is queried, the quotes are
% stripped from its values and every tuple is stored.
queryDistributedRels :-
  retractall(storedDistributedRelation(_, _, _, _, _, _, _)),
  distributedRelsAvailable,
  secondo('query SEC2DISTRIBUTED', [_, RawTuples]),
  !,
  maplist(maplist(stringWithoutQuotes), RawTuples, CleanTuples),
  storeDistributedRels(CleanTuples),
  !.
% distributedRelsAvailable
%
% Makes sure that the three system relations SEC2DISTRIBUTED, SEC2WORKERS
% and SEC2DISTINDEXES exist. The cached object list (storedSecondoList/1)
% is discarded and the list is fetched once with getSecondoList/1. Each of
% the three relations that is not found in this list is created as an empty
% relation with a Secondo "let" command, and a message is written.
% If the first clause fails, the second clause writes 'no open database'
% and succeeds.
distributedRelsAvailable :-
retractall(storedSecondoList(_)),
getSecondoList(ObjList),
% relation of distributed relations
( member(['OBJECT','SEC2DISTRIBUTED',_ | [[[_ | [[_ | [_]]]]]]], ObjList) ->
true;
secondo('let SEC2DISTRIBUTED = [const rel(tuple(\c
[RelName: string,\c
ArrayRef: string, \c
DistType: string, \c
NSlots: int, \c
PartType: string, \c
PartAttribute: string, \c
PartParam: string]))value()]',_),
writeln('Created empty SEC2DISTRIBUTED system-relation \n')
),
% relation of workers
( member(['OBJECT','SEC2WORKERS',_ | [[[_ | [[_ | [_]]]]]]], ObjList) ->
true;
secondo('let SEC2WORKERS = [const rel(tuple(\c
[Host: string,\c
Port: int, \c
Config: string]))value()]',_),
writeln('Created empty SEC2WORKERS system-relation \n')
),
% relation of distributed indexes
( member(['OBJECT','SEC2DISTINDEXES',_ | [[[_ | [[_ | [_]]]]]]], ObjList) ->
true;
secondo('let SEC2DISTINDEXES = [const rel(tuple(\c
[DistObj: string,\c
Attr: string, \c
IndexType: string, \c
IndexObj: string]))value()]',_),
writeln('Created empty SEC2DISTINDEXES system-relation \n')
),
!.
% Fallback when the clause above failed.
distributedRelsAvailable :-
writeln('no open database').
% distributedIndex(?DistRel, ?DCAttr, ?IndexType, ?Index)
%
% Enumerates the entries of SEC2DISTINDEXES: the distributed relation is
% delivered as dbotherobject(Name), the index as dbdistindexobject(Name).
% The relation is queried anew on every call.
% switch to dynamic predicate sometime in the future
distributedIndex(dbotherobject(DistRelObj), DCAttr, IndexType,
                 dbdistindexobject(IndexObj)) :-
  distributedIndex2(IndexTuples),
  member([DistRelObj, DCAttr, IndexType, IndexObj], IndexTuples).
% distributedIndex2(-DowncaseAtomTuples)
% Queries SEC2DISTINDEXES and returns its tuples with the quotes stripped
% from every value and every value converted to a lower-case atom.
distributedIndex2(DowncaseAtomTuples) :-
  secondo('query SEC2DISTINDEXES', [_, RawTuples]),
  !,
  maplist(maplist(stringWithoutQuotes), RawTuples, CleanTuples),
  maplist(maplist(atom_string), AtomTuples, CleanTuples),
  maplist(maplist(downcase_atom), AtomTuples, DowncaseAtomTuples),
  !.
/*
13 Extensions to File ~operators.pl~
Some constants for cost functions.
*/
% Cost constants of the distributed operators and of some operators used
% inside distributed plans, one fact per operator (itSpatialJoinTC holds
% two values).
% NOTE(review): unit and measurement basis of the values are not visible
% here - confirm before changing them. Entries marked "need to be
% evaluated" are placeholders.
dloopTC(1.3).
dsummarizeTC(28).
dmapTC(0.12).
dmap2TC(0.07).
dloop2TC(0.2).
shareTC(447).
getValueTC(14.4).
partitionTC(11.8).
partitionFTC(8).
areduceTC(3.1).
collect2TC(0.9).
tieTC(0.006).
bboxTC(0.002).
gridintersectsTC(0.99).
cellnumberTC(0.07).
hashvalueTC(0.001).
rangeTC(10). % copied from leftrange
% windowintersectsTC(0.1). %costs need to be evaluated, taken from optimizer
itSpatialJoinTC(20.0, 0.7). %costs need to be evaluated, taken from optimizer
extendstreamTC(0.0). %costs need to be evaluated
areduce2TC(0.0). %costs need to be evaluated
%B.Huber
/*
14 Constants for costs2014 functions for distributed commands.
*/
% Constants used by the costs2014 cost functions of the distributed
% operators, one fact per operator.
% NOTE(review): unit and measurement basis of the values are not visible
% here - confirm before changing them.
dmapC(0.008771).
dmap2C(0.000000157).
partitionFC(0.00124755).
collect2C(0.0018232).
areduce2C(0.0083229).
dproductC(0.00005911).
%B.Huber end