Files
secondo/Algebras/CDACSpatialJoin/JoinState.h
2026-01-23 17:03:45 +08:00

395 lines
16 KiB
C++

/*
----
This file is part of SECONDO.
Copyright (C) 2019,
Faculty of Mathematics and Computer Science,
Database Systems for New Applications.
SECONDO is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
SECONDO is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with SECONDO; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
----
//[<] [\ensuremath{<}]
//[>] [\ensuremath{>}]
\setcounter{tocdepth}{2}
\tableofcontents
*/
#pragma once
#include "Merger.h" // -> ... -> <memory>, <vector>
// -> MergedArea, IOData
// -> InputStream, SortEdge, RectangleInfo, JoinEdge
// -> ... -> "Algebras/CRel/TBlock.h"
// -> ... -> "Algebras/Rectangle/RectangleAlgebra.h"
#include "SelfMerger.h"
#include "Timer.h"
namespace cdacspatialjoin {
/*
1 JoinTask enumeration
Lists the different tasks to be performed (possibly multiple times each)
during a CDACSpatialJoin or CDACSpatialJoinCount operation. This enum is used
to get distinct Timer evaluation for each task.
*/
enum JoinTask : unsigned {
   /* fetching the next portion of data from the InputStreams */
   requestData = 0,
   /* building the vector of SortEdge instances */
   createSortEdges = 1,
   /* bringing the vector of SortEdge instances into sorted order */
   sortSortEdges = 2,
   /* building the vector of JoinEdge instances */
   createJoinEdges = 3,
   /* merging the JoinEdges, thereby reporting (or merely counting) the
    * intersections found */
   merge = 4,
   /* writing the input data of the "inner" stream from main memory to a
    * temporary file */
   saveToTempFile = 5,
   /* releasing the input stream memory (and destructing the CDACLocalInfo
    * class) */
   clearMemory = 6
};
/*
2 JoinStateMemoryInfo struct
Encapsulates counters for the analysis of a JoinState's memory usage.
*/
struct JoinStateMemoryInfo {
   /* NOTE: this struct declares neither a constructor nor default member
    * initializers, so the counters below must not be read before they were
    * set (presumably by initialize() - confirm in the implementation). */

   /* the size of the JoinState::joinEdges vector (a copy of
    * JoinState::joinEdgesSize) */
   size_t joinEdgesSize;
   /* the number of bytes used for the input data, i.e. the TBlocks / RBlocks
    * stored in IOData, during the whole lifetime of this JoinState */
   size_t usedInputDataMemory;
   /* the number of bytes used temporarily in the constructor for RectangleInfo
    * instances */
   size_t usedRectInfoMemory;
   /* the number of bytes used temporarily in the constructor for SortEdge
    * instances */
   size_t usedSortEdgeMemory;
   /* the number of bytes used for the JoinEdge vector during the whole
    * lifetime of this JoinState */
   size_t usedJoinEdgeMemory;
   /* the number of bytes currently used for the Merger instance and all
    * MergedArea instances stored in mergedAreas[]. The variable is updated
    * whenever a mergedAreas[] entry is set */
   size_t usedMergedAreaMemory;
   /* the maximum number of bytes that occurred in usedMergedAreaMemory
    * during the lifetime of this JoinState instance */
   size_t usedMergedAreaMemoryMax;
   /* the number of JoinEdge instances currently stored in the
    * Merger instance and all MergedArea instances */
   size_t mergeJoinEdgeCount;
   /* the maximum number of JoinEdge instances that were required in the
    * Merger and MergedArea instances at any given time */
   size_t mergeJoinEdgeCountMax;
   /* the number of output tuples created during the whole lifetime of this
    * JoinState */
   uint64_t outputTupleCount;
   /* the size in bytes of all output data additionally created during the
    * whole lifetime of this JoinState (i.e. the size sum of all output
    * TBlocks, or the sizeof all output tuples without attributes) */
   size_t outputDataAddSize;
   /* the size in bytes of all output tuples created during the whole lifetime
    * of this JoinState (i.e. the size sum of all output TBlocks, or the
    * GetMemSize of all output tuples) */
   size_t outputDataMemSize;
   /* the maximum outputDataAddSize value of one chunk of output data (i.e.
    * one output TBlock or one chunk of output tuples) created during the
    * lifetime of this JoinState */
   size_t outputDataAddSizeMax;
   /* the maximum outputDataMemSize value of one chunk of output data (i.e.
    * one output TBlock or one chunk of output tuples) created during the
    * lifetime of this JoinState */
   size_t outputDataMemSizeMax;

   /* initializes the memory statistics with the given information:
    * usedInputDataMemory_: the number of bytes used for the input data;
    * rectangleInfoCount, sortEdgeCount, joinEdgeCount: the number of
    * RectangleInfo, SortEdge and JoinEdge instances (presumably converted to
    * the used...Memory byte counts above - confirm in the implementation) */
   void initialize(size_t usedInputDataMemory_, size_t rectangleInfoCount,
                   size_t sortEdgeCount, size_t joinEdgeCount);

   /* adds the memory used by the given MergedArea to the statistics of
    * currently used memory */
   inline void add(MergedAreaPtr mergedArea);

   /* subtracts the memory used by the given MergedArea from the statistics of
    * currently used memory */
   inline void subtract(MergedAreaPtr mergedArea);

   /* calculates the currently used memory (including MergedAreas and the given
    * Merger) and possibly increases the maximum values */
   inline void updateMaximum(Merger* merger);

   /* adds the memory used by the given SelfMergedArea to the statistics of
    * currently used memory */
   inline void add(SelfMergedAreaPtr mergedArea);

   /* subtracts the memory used by the given SelfMergedArea from the statistics
    * of currently used memory */
   inline void subtract(SelfMergedAreaPtr mergedArea);

   /* calculates the currently used memory (including SelfMergedAreas and the
    * given SelfMerger) and possibly increases the maximum values */
   inline void updateMaximum(SelfMerger* merger);

   /* adds the given number of tuples and bytes to the output data. While
    * memAddByteCount is the number of bytes that were additionally
    * reserved from memory, memSizeByteCount is the whole MemSize of the
    * output tuples (including the size of Attribute instances that are shared
    * with input tuples) */
   inline void addOutputData(uint64_t tupleCount, size_t memAddByteCount,
                             size_t memSizeByteCount);

   /* the maximum number of main memory bytes used at any point during the
    * lifetime of this JoinState */
   size_t getTotalUsedMemoryMax() const;

   /* returns a) the maximum number of JoinEdge instances that were required
    * in the Merger and MergedArea instances at any given time, divided by
    * b) the number of rectangles from both input streams (excluding those
    * that were outside the other input stream's bounding box), which is
    * calculated here as joinEdgesSize / 2.0.
    * This value is in [1.0, 2.0] and is useful to estimate the memory required
    * for join operations. Samples with wide rectangles will produce a higher
    * value than samples with narrow rectangles, since initially, left and
    * right edges are stored for a rectangle, but as soon as a rectangle is
    * "complete" inside a MergedArea, only one edge (the rectangle's
    * Y-interval) is stored.
    * If joinEdgesSize is 0, the quota is undefined and 0.0 is returned */
   double getUsedJoinEdgeQuotaMax() const {
      // without any JoinEdges the division below would yield NaN (0 / 0.0)
      // or infinity; return a neutral 0.0 instead so that the result cannot
      // poison statistics that are aggregated or printed by the caller
      if (joinEdgesSize == 0)
         return 0.0;
      return mergeJoinEdgeCountMax / (joinEdgesSize / 2.0);
   }
};
/*
3 JoinState class
At initialization, the JoinState class expects data from both input streams.
Its public method nextTBlock() then fills the given outTBlock with result
tuples until either the outTBlock is full, or the operation is complete.
Each result tuple is combined from one tuple of each input stream, where the
bounding boxes of these two tuples' GeoData intersect.
If counting only is requested (presumably via the outputType argument of the
constructor, which has no "countOnly" parameter), intersections are merely
counted, but no output tuples are written to the outTBlock. The result count
can then be retrieved using the getOutTupleCount() getter.
JoinState detects self joins (if the bounding boxes of both inputs are exactly
identical, including the same TBlock positions) and treats this special case
efficiently using specialized methods and classes (SelfMergedArea, SelfMerger).
*/
class JoinState {
/* NOTE(review): this class declares a destructor and holds raw pointers
* (merger, selfMerger) but declares no copy operations. If the destructor
* deletes these pointers (not visible in this header), copying a JoinState
* would be unsafe - confirm and consider deleting the copy constructor. */
/* ioData holds all current data from both input streams and provides
* methods to a) extract the SortEdges and RectangleInfos from it, and
* b) fill the output tuple block with result tuples (for which purpose it
* is passed to Merger instances) */
IOData ioData;
/* the kind of output to be produced (presumably set from the constructor's
* outputType_ argument - confirm in the implementation) */
const OutputType outputType;
/* the number of tuples stored in the current TBlocks (for each stream);
* the array has SET_COUNT entries */
const uint64_t tupleCounts[SET_COUNT];
/* a text prefix used in console output of this JoinState.
* NOTE(review): the previous comment described this member as "the number
* of the CDACSpatialJoin[Count] operator (or, more precisely, the
* CDACLocalInfo instance) that created this JoinState instance", which can
* be used to distinguish several operators in console output. Since the
* member is a string, it presumably contains that number (cf. the
* constructor arguments operatorNum_ and joinStateId_) - confirm */
const std::string outputPrefix;
/* the Timer shared with the creator of this JoinState (cf. the constructor
* argument timer_); presumably used to measure the JoinTask categories
* separately - confirm */
std::shared_ptr<Timer> timer;
/* true if a self join was detected, i.e. the rectangles in input A and B
* are identical, and the tuples appear in the same order and at the same
* addresses in the TBlocks of input A and B */
bool selfJoin;
// -----------------------------------------------------
// variables storing the current state of the operation, allowing it to
// be interrupted when the outTBlock is full, and to be resumed later
/* stores the left and right edges of each rectangle in both sets, sorted
* by their x position (and then their yMin value) */
std::vector<JoinEdge> joinEdges;
/* equal to joinEdges.size() (but faster to access) */
size_t joinEdgesSize;
/* the index in joinEdges from which the next (Self)MergedArea will be
* created */
EdgeIndex_t joinEdgeIndex;
/* in a self join, each rectangle appears in both input A and input B and
* therefore an intersection must be reported for each rectangle. This is
* done at the beginning of selfJoinNextTBlock(), where this index points
* to the next edge in joinEdges which will be considered */
EdgeIndex_t idJoinEdgeIndex;
/* the number of entries in the mergedAreas[] and selfMergedAreas[] arrays,
* i.e. the maximum number of levels that can be stored */
static constexpr unsigned MERGED_AREAS_SIZE = 32;
/* stores one MergedArea for each level. A MergedArea is waiting for
* an adjacent MergedArea with which it can be merged to a bigger
* MergedArea (to be stored on the next level). index 0 represents the
* lowest level; here a MergedArea may only contain JoinEdges from one
* set */
MergedAreaPtr mergedAreas[MERGED_AREAS_SIZE];
/* for a self join, stores one SelfMergedArea for each level. A
* SelfMergedArea is waiting for an adjacent SelfMergedArea with which
* it can be merged to a bigger SelfMergedArea (to be stored on the next
* level). index 0 represents the lowest level; here a SelfMergedArea may
* only contain one single JoinEdge */
SelfMergedAreaPtr selfMergedAreas[MERGED_AREAS_SIZE];
/* NOTE(review): undocumented in the original; presumably the number of
* levels of (self)MergedAreas[] that are actually used for the current
* input (at most MERGED_AREAS_SIZE) - confirm in the implementation */
unsigned levelCount;
/* the level (i.e. index in mergedAreas) at which the next merge operation
* is performed (the result of which is stored one level higher, or
* recursively merged with the entry at that level) */
unsigned mergeLevel;
/* the number of MergedAreas to be created on the lowest level. Note that an
* "atomic" MergedArea is not created from a single JoinEdge but possibly
* from a sequence of JoinEdges that belong to the same set */
unsigned long atomsExpectedTotal;
/* the number of MergedAreas already created on the lowest level */
unsigned long atomsCreated;
/* the average number of atomic MergedAreas that need to be merged before
* the resulting MergedArea can move up from lowest level 0 to level 1.
* This value simulates the "divide" part of divide and conquer and ensures
* that MergedAreas are of approximately equal sizes, avoiding inefficient
* merge operations. If atomsExpectedTotal (the total number of atomic
* MergedAreas) happens to be a power of two, atomsExpectedStep will be
* 2.0 exactly. In any other case, it will be slightly higher (but always
* less than 4.0). */
double atomsExpectedStep;
/* accumulates atomsExpectedStep. At any given moment, the atomsCreated
* value must exceed atomsExpectedNext in order for a resulting MergedArea
* to be moved up to level 1. */
double atomsExpectedNext;
/* the Merger that performs the currently running merge operation;
* nullptr, if no merge operation is currently running */
Merger* merger;
/* in a self join, the SelfMerger that performs the currently running merge
* operation; nullptr, if no merge operation is currently running */
SelfMerger* selfMerger;
// -----------------------------------------------------
// statistical
/* the clock() time at which the initialization of this JoinState instance
* was completed */
clock_t initializeCompleted;
/* the number of (non-empty) outTBlocks returned by this JoinState */
unsigned outTBlockCount;
#ifdef CDAC_SPATIAL_JOIN_REPORT_TO_CONSOLE
/* the number of bytes that were additionally reserved from memory
* for the output tuples created by this JoinState instance. In case of an
* output tuple stream, this may be smaller than the MemSize of the tuples
* since the output tuples point to the same Attribute instances as the
* input tuples. This member only exists if
* CDAC_SPATIAL_JOIN_REPORT_TO_CONSOLE is defined */
size_t outputAddSize;
#endif
/* is set to true once the join has completed; the outTBlock may still
* contain the last result tuples */
bool joinCompleted;
#ifdef CDAC_SPATIAL_JOIN_METRICS
/* counters for the analysis of this JoinState's memory usage; this member
* only exists if CDAC_SPATIAL_JOIN_METRICS is defined */
JoinStateMemoryInfo memoryInfo;
#endif
public:
/* constructor taking all data required from the input streams:
* outputType_: the kind of output to be produced;
* tupleType_: presumably the tuple type of the output tuples - confirm;
* inputA_/inputB_: the two input streams providing the data to be joined;
* outBufferSize_: the maximum size of the output buffer in bytes;
* outTupleAddSize_: presumably the number of bytes additionally reserved
* for each output tuple (cf. outputAddSize) - confirm;
* outBufferTupleCountMax_: the maximum tuple count in the output buffer;
* operatorNum_: presumably the number of the operator (CDACLocalInfo
* instance) that creates this JoinState (cf. outputPrefix) - confirm;
* joinStateId_: the consecutive number of this JoinState instance;
* timer_: the Timer to be shared with this JoinState.
* NOTE(review): the previous version of this comment listed attrIndexA/B,
* tupleCountA/B and dimA/B, none of which are parameters of this
* constructor; presumably this information is now taken from the
* InputStream instances */
JoinState(OutputType outputType_, TupleType* tupleType_,
InputStream* inputA_, InputStream* inputB_,
uint64_t outBufferSize_, uint64_t outTupleAddSize_,
uint64_t outBufferTupleCountMax_,
unsigned operatorNum_, unsigned joinStateId_,
std::shared_ptr<Timer>& timer_);
~JoinState();
/* fills the given outTBlock with result tuples; returns true, if more
* tuples were found, or false, if the operation is complete and no more
* result tuples were found.
* NOTE(review): outTuples_ is presumably the alternative output target
* used when an output tuple stream rather than a TBlock is requested -
* confirm in the implementation */
bool nextTBlock(CRelAlgebra::TBlock* outTBlock_,
std::vector<Tuple*>* outTuples_);
/* returns the output tuple count as reported by ioData */
size_t getOutTupleCount() const { return ioData.getOutTupleCount(); }
#ifdef CDAC_SPATIAL_JOIN_METRICS
/* returns the memory usage counters of this JoinState (only available if
* CDAC_SPATIAL_JOIN_METRICS is defined) */
const JoinStateMemoryInfo& getMemoryInfo() const { return memoryInfo; }
#endif
private:
/* creates a new Merger for the given areas, then deletes the areas.
* levelOfArea1 is presumably the level (i.e. index in mergedAreas) at which
* the first area is stored, area2 the area to be merged with it - confirm */
inline Merger* createMerger(unsigned levelOfArea1, MergedAreaPtr area2);
#ifdef CDAC_SPATIAL_JOIN_REPORT_TO_CONSOLE
/* NOTE(review): undocumented in the original; presumably reports the last
* merge of the two given MergedAreas to the console - confirm */
void reportLastMerge(MergedAreaPtr area1, MergedAreaPtr area2) const;
#endif
/* self join version of nextTBlock. Fills the outTBlock in ioData with
* result tuples; returns true, if more tuples were found, or false, if the
* operation is complete and no more result tuples were found */
bool selfJoinNextTBlock();
/* self join version of createMerger: creates a new SelfMerger for the given
* areas, then deletes the areas */
inline SelfMerger* createSelfMerger(unsigned levelOfArea1,
SelfMergedAreaPtr area2);
#ifdef CDAC_SPATIAL_JOIN_REPORT_TO_CONSOLE
/* NOTE(review): undocumented in the original; presumably reports the last
* merge of the two given SelfMergedAreas to the console - confirm */
void reportLastMerge(SelfMergedAreaPtr area1, SelfMergedAreaPtr area2) const;
#endif
/* NOTE(review): undocumented in the original; presumably updates the
* statistical members using the output tuple count that was valid at the
* start of the current call. The meaning of the return value is not visible
* in this header - confirm in the implementation */
uint64_t updateStatistics(uint64_t outTupleCountAtStart);
};
} // end namespace