Files
secondo/Algebras/CDACSpatialJoin/CDACSpatialJoin.h
2026-01-23 17:03:45 +08:00

396 lines
14 KiB
C++

/*
----
This file is part of SECONDO.
Copyright (C) 2019,
Faculty of Mathematics and Computer Science,
Database Systems for New Applications.
SECONDO is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
SECONDO is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with SECONDO; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
----
//[<] [\ensuremath{<}]
//[>] [\ensuremath{>}]
\setcounter{tocdepth}{2}
\tableofcontents
1 CDACSpatialJoin operator
The ~cdacspatialjoin~ operator performs a cache-oriented spatial join on
two streams of tuples or tuple blocks, using a divide-and-conquer strategy.
As arguments, ~cdacspatialjoin~ expects two streams of tuples or tuple blocks.
Optionally, the names of the join attributes for each argument relation can be
specified. If these attribute names are omitted, the first attribute with a
suitable spatial kind is used. The operator returns a stream of tuple blocks.
The algorithm is based on Ralf Hartmut Gueting, Werner Schilling: A
practical divide-and-conquer algorithm for the rectangle intersection problem.
Inf. Sci. 42(2): 95-112 (1987). While this paper describes the self join case,
CDACSpatialJoin reports intersecting rectangles from two different rectangle
sets (streams) A and B.
*/
#pragma once
#include <memory>
#include "JoinState.h" // -> Timer;
// -> ... -> InputStream
// -> "Stream.h",
// -> "Algebras/CRel/TBlock.h"
#include "Operator.h"
#include "QueryProcessor.h"
#include "Algebras/CRel/TypeConstructors/TBlockTC.h"
namespace cdacspatialjoin {
class CDACSpatialJoin {
   /* CDACSpatialJoinCount gets access to the private members declared
    * below */
   friend class CDACSpatialJoinCount;

   /* nested class that is only declared here */
   class Info;

   /* number of input streams. This is always 2; the constant merely makes
    * the code that uses it easier to read */
   static constexpr unsigned int STREAM_COUNT = 2;

   /* upper bound for the number of arguments passed to type mapping */
   static constexpr unsigned int MAX_ARG_COUNT = 1 + 2 * STREAM_COUNT;

   /* size in MiB of the TBlocks created in InputTupleStreams (needed when
    * the input consists of tuples which must be converted to TBlocks) */
   static uint64_t DEFAULT_INPUT_BLOCK_SIZE_MIB;

public:
   explicit CDACSpatialJoin() = default;

   ~CDACSpatialJoin() = default;

   /* returns the Operator instance for the cdacspatialjoin operator */
   std::shared_ptr<Operator> getOperator();

private:
   /* type mapping entry point of the operator */
   static ListExpr typeMapping(ListExpr args);

   /* type mapping variant which additionally receives the countOnly flag */
   static ListExpr typeMapping2(bool countOnly, ListExpr args);

   static CRelAlgebra::TBlockTI getTBlockTI(
           ListExpr attributeList,
           uint64_t desiredBlockSize,
           ListExpr& tBlockColumns);

   /* value mapping of the operator */
   static int valueMapping(
           Word* args,
           Word& result,
           int message,
           Word& local,
           Supplier s);

   /* creates the InputStream for the input with the given streamIndex */
   static InputStream* createInputStream(
           OutputType outputType,
           Word* args,
           unsigned streamIndex);
};
/*
2 CDACSpatialJoinCount operator
The ~CDACSpatialJoinCount~ operator specializes the CDACSpatialJoin operator
for cases in which only the number of result tuples is required, but not the
result tuples themselves.
For ~CDACSpatialJoinCount~, only the bounding boxes of the respective join
attributes need to be kept in memory (for which purpose the RectangleBlock
class is being used), while all other tuple data can be discarded from memory,
allowing for JoinStates to treat rectangle sets of significantly higher
cardinality. Furthermore, the ~CDACSpatialJoinCount~ operator does not
implicitly convert input streams of tuples into tuple blocks (as the
~CDACSpatialJoin~ operator would) and, obviously, does not combine intersecting
tuples to create result tuples.
The four arguments of ~CDACSpatialJoinCount~ match the first four arguments of
the ~CDACSpatialJoin~ operator (i.e. two streams of tuples or tuple blocks and,
optionally, the names of the join attributes). The operator returns the number
of intersections, i.e. the number of tuples that a ~CDACSpatialJoin~ call with
the same arguments would return.
*/
class CDACSpatialJoinCount {
   /* nested class that is only declared here */
   class Info;

   /* type mapping of the count operator. No value mapping is declared in
    * this class, since CDACSpatialJoin::valueMapping is used instead */
   static ListExpr typeMapping(ListExpr args);

public:
   explicit CDACSpatialJoinCount() = default;

   ~CDACSpatialJoinCount() = default;

   /* returns the Operator instance for the count operator */
   std::shared_ptr<Operator> getOperator();
};
/*
3 MemoryInfo struct
Encapsulates counters for analysing the memory usage of the JoinState instances
created by a CDACLocalInfo instance.
*/
/* Plain collection of memory usage counters. All counters start at zero;
 * the max... fields hold per-JoinState maxima, the sum... fields hold values
 * accumulated over all JoinStates, and the total... fields describe the
 * input and output data as a whole. */
struct MemoryInfo {
/* NOTE(review): presumably the number of JoinState instances whose
 * counters were passed to add() - confirm in the implementation file */
unsigned joinStateCount = 0;
/* the maximum number of bytes used by any JoinState for its input data
 * (i.e. the TBlocks / RBlocks stored in IOData) */
size_t maxMemInputData = 0;
/* the maximum number of bytes used by any JoinState for its SortEdge
 * instances */
size_t maxMemSortEdges = 0;
/* the maximum number of bytes used by any JoinState for its RectangleInfo
 * instances */
size_t maxMemRectInfos = 0;
/* the maximum number of bytes used by any JoinState for its JoinEdge
 * vector */
size_t maxMemJoinEdges = 0;
/* the maximum number of bytes used by any JoinState for its Merger and
 * MergedArea instances */
size_t maxMemMergedAreas = 0;
/* the maximum size in bytes of one chunk of output data (i.e. one output
 * TBlock or one chunk of output tuples). The memory for Attribute instances
 * that are shared with input data does not count here. */
size_t maxMemOutputDataAddSize = 0;
/* the maximum size in bytes of one chunk of output data (i.e. one output
 * TBlock or one chunk of output tuples). The memory for Attribute instances
 * that are shared with input data counts here. */
size_t maxMemOutputDataMemSize = 0;
/* the total maximum number of bytes used by any JoinState. Note that this
 * is not necessarily the same as the sum of the other maxMem... values,
 * since those maximum values may have occurred at different times */
size_t maxMemTotal = 0;
/* the total number of bytes used by all JoinStates for their input data
 * (i.e. the TBlocks / RBlocks stored in IOData) */
size_t sumMemInputData = 0;
/* the total number of bytes used by all JoinStates for their SortEdge
 * instances */
size_t sumMemSortEdges = 0;
/* the total number of bytes used by all JoinStates for their RectangleInfo
 * instances */
size_t sumMemRectInfos = 0;
/* the total number of bytes used by all JoinStates for their JoinEdge
 * vector */
size_t sumMemJoinEdges = 0;
/* the total number of bytes used by all JoinStates for their Merger and
 * MergedArea instances */
size_t sumMemMergedAreas = 0;
/* the total number of bytes used by all JoinStates for their respective
 * largest chunk of output data (i.e. for the largest output TBlock or the
 * largest chunk of output tuples). The memory for Attribute instances
 * that are shared with input data does not count here. */
size_t sumMemOutputDataAddSizeMax = 0;
/* the total number of bytes used by all JoinStates for their respective
 * largest chunk of output data (i.e. for the largest output TBlock or the
 * largest chunk of output tuples). The memory for Attribute instances
 * that are shared with input data counts here. */
size_t sumMemOutputDataMemSizeMax = 0;
/* the total number of tuples provided by input stream A (in one pass) */
size_t totalInputATupleCount = 0;
/* the total number of bytes provided by input stream A (in one pass) */
size_t totalInputADataSize = 0;
/* the total number of tuples provided by input stream B (in one pass) */
size_t totalInputBTupleCount = 0;
/* the total number of bytes provided by input stream B (in one pass) */
size_t totalInputBDataSize = 0;
/* the total number of output tuples generated by all JoinStates */
size_t totalOutputTupleCount = 0;
/* the total number of bytes generated by all JoinStates for their output
 * data (i.e. the TBlocks passed down the stream). The memory for Attribute
 * instances that are shared with input data does not count here. */
size_t totalOutputDataAddSize = 0;
/* the total number of bytes generated by all JoinStates for their output
 * data (i.e. the TBlocks passed down the stream). The memory for Attribute
 * instances that are shared with input data counts here. */
size_t totalOutputDataMemSize = 0;
/* the total number of bytes used by all JoinStates. This equals the sum
 * of the other sumMem... values */
size_t sumMemTotal = 0;
/* NOTE(review): meaning not documented in this header; presumably the
 * largest JoinEdge quota reported by any JoinState - confirm against
 * JoinStateMemoryInfo */
double maxJoinEdgeQuota = 0.0;
MemoryInfo() = default;
~MemoryInfo() = default;
/* takes the memory information of one JoinState into account.
 * NOTE(review): presumably updates the max... and sum... counters above;
 * the definition is not part of this header */
void add(const JoinStateMemoryInfo& joinStateInfo);
/* receives the tuple counts and data sizes of both input streams; the
 * parameters are named after the totalInput... fields above (with a
 * trailing underscore), which they presumably set */
void setInputSize(size_t totalInputATupleCount_, size_t totalInputADataSize_,
size_t totalInputBTupleCount_, size_t totalInputBDataSize_);
/* prints the memory information to the given stream; outputType is the
 * output type of the operator the information was collected for */
void print(std::ostream& out, OutputType outputType);
private:
/* helper for print(): NOTE(review) presumably writes one line with the
 * given label, a sum value and a maximum value, followed by a note */
void printLineMem(std::ostream& out, const std::string& text,
size_t sumValue, size_t maxValue, const std::string& note,
unsigned cacheLineSize);
/* helper for print(): NOTE(review) presumably writes one line with the
 * given label, a byte count and a tuple count, followed by a note */
static void printLineInOut(std::ostream& out, const std::string& text,
uint64_t bytes, uint64_t tupleCount, const std::string& note);
};
/*
4 LocalInfo class
*/
class CDACLocalInfo {
   /* NOTE(review): presumably the number of CDACLocalInfo instances that
    * currently exist (cp. instanceNum below) - confirm in the implementation
    * file */
   static unsigned activeInstanceCount;

   /* if the desired output type is a tuple stream, the join operation will
    * use a vector of tuples to temporarily store some output tuples. This
    * value determines how much main memory (in KiB) may be used by these
    * output tuples, before the temporary output tuple vector is flushed to the
    * stream */
   static uint64_t DEFAULT_OUTPUT_TUPLE_VECTOR_MEM_SIZE_KIB;

   /* the desired output type: outputCount, if only the number of intersecting
    * rectangles should be returned (i.e. the CDACSpatialJoinCount operator was
    * called); outputTupleStream, if the result tuples should be returned as
    * a stream of tuples; outputTBlockStream, if the result tuples should be
    * returned as a stream of tuple blocks (both done by the CDACSpatialJoin
    * operator) */
   const OutputType outputType;

   /* the TupleType of the output tuples (if the desired outputType is a tuple
    * stream) */
   TupleType* outputTupleType;

   /* the first input stream. If neither stream can be kept in the main memory
    * at once, inputA is used as the inner loop (i.e. it is closed and
    * re-opened several times) */
   InputStream* const inputA;

   /* the second input stream. If neither stream can be kept in the main
    * memory at once, inputB is used as the outer loop (i.e. it is traversed
    * only once) */
   InputStream* const inputB;

   /* the Supplier passed to the constructor */
   const Supplier s;

   /* true while no data has been requested from the input streams yet; false
    * thereafter */
   bool isFirstRequest;

   /* the memory limit for this operator */
   const uint64_t memLimit;

   /* information on the output TBlock type; unused if only the number of
    * intersections is required (i.e. outputType is outputCount) */
   const CRelAlgebra::TBlockTI outTypeInfo;

   const CRelAlgebra::PTBlockInfo outTBlockInfo;

   /* the size of the output buffer (i.e. the output TBlock or the outTuples
    * vector) in bytes */
   uint64_t outBufferSize;

   /* the additional memory required for each output tuple (if an output tuple
    * stream is required). Since the input tuples and their attributes already
    * exist in memory when an output tuple is concatenated, the output tuple
    * will usually only get pointers to these attributes, so outTupleAddSize
    * does not count the attribute sizes */
   uint64_t outTupleAddSize;

   /* the maximum number of tuples allowed in the output buffer (for
    * outputTupleStream) */
   uint64_t outBufferTupleCountMax;

   /* a number with which different CDACLocalInfo instances can be
    * distinguished in console output (e.g. if several CDACSpatialJoin[Count]
    * operators are used within one query) */
   const unsigned instanceNum;

   /* the current JoinState which operates on the data that could be read into
    * main memory */
   JoinState* joinState;

   /* the number of JoinState instances created so far */
   unsigned joinStateCount;

   /* the number of intersections found so far */
   size_t intersectionCount;

   /* the current output tuple block (used only if the desired outputType is a
    * stream of TBlocks) */
   CRelAlgebra::TBlock* outTBlock;

   /* the current vector for output tuples (used only if the desired
    * outputType is a stream of tuples). This vector serves as a temporary
    * store and is flushed once the tuples exceed the memory limit given by
    * DEFAULT\_OUTPUT\_TUPLE\_VECTOR\_MEM\_SIZE\_KIB */
   std::vector<Tuple*>* outTuples;

#ifdef CDAC_SPATIAL_JOIN_METRICS
   /* memory usage counters, only available in metrics builds */
   MemoryInfo memoryInfo;
#endif

   /* the Timer used by this instance (Timer is provided via JoinState.h).
    * NOTE(review): what exactly is being timed is not visible in this
    * header */
   std::shared_ptr<Timer> timer;

public:
   // constructor
   CDACLocalInfo(OutputType outputType_, ListExpr outputTupleTypeLE,
                 InputStream* inputA_, InputStream* inputB_, Supplier s);

   // destructor decreases the reference counters for all loaded tuple
   // blocks and closes both streams
   ~CDACLocalInfo();

   // instances must not be copied: the destructor releases the loaded tuple
   // blocks and closes both streams, so a copy (which would hold the same
   // raw pointers) would release these resources a second time
   CDACLocalInfo(const CDACLocalInfo&) = delete;
   CDACLocalInfo& operator=(const CDACLocalInfo&) = delete;

   /* returns the number of intersections found so far */
   size_t getIntersectionCount() const { return intersectionCount; }

   /* NOTE(review): presumably returns the next output TBlock (for
    * outputTBlockStream) - confirm the end-of-stream value in the
    * implementation file */
   CRelAlgebra::TBlock* getNextTBlock();

   /* NOTE(review): presumably returns the next output tuple (for
    * outputTupleStream) - confirm the end-of-stream value in the
    * implementation file */
   Tuple* getNextTuple();

   /* NOTE(review): shared driver of the getNext... functions, judging by its
    * name; the meaning of the return value must be taken from the
    * implementation file */
   bool getNext();

private:
   /* NOTE(review): presumably reads further data from the input streams;
    * details are in the implementation file */
   void requestInput();

   /* Estimates the required amount of main memory, i.e. both the size of the
    * two input TBlock / RBlock vectors, and the expected JoinState memory
    * usage */
   size_t getRequiredMemory() const;

   /* returns the name of the operator this instance works for */
   std::string getOperatorName() const;
}; // end class CDACLocalInfo
} // end namespace cdacspatialjoin