1379 lines
32 KiB
C
1379 lines
32 KiB
C
|
|
/*
|
||
|
|
----
|
||
|
|
This file is part of SECONDO.
|
||
|
|
|
||
|
|
Copyright (C) 2004-2008, University in Hagen, Faculty of Mathematics and
|
||
|
|
Computer Science, Database Systems for New Applications.
|
||
|
|
|
||
|
|
SECONDO is free software; you can redistribute it and/or modify
|
||
|
|
it under the terms of the GNU General Public License as published by
|
||
|
|
the Free Software Foundation; either version 2 of the License, or
|
||
|
|
(at your option) any later version.
|
||
|
|
|
||
|
|
SECONDO is distributed in the hope that it will be useful,
|
||
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||
|
|
GNU General Public License for more details.
|
||
|
|
|
||
|
|
You should have received a copy of the GNU General Public License
|
||
|
|
along with SECONDO; if not, write to the Free Software
|
||
|
|
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||
|
|
----
|
||
|
|
|
||
|
|
[1] Definition of Auxiliary Classes in HadoopParallelAlgebra
|
||
|
|
|
||
|
|
April 2010 Jiamin Lu
|
||
|
|
|
||
|
|
[newpage]
|
||
|
|
|
||
|
|
1 Auxiliary Classes in HadoopParallelAlgebra
|
||
|
|
|
||
|
|
This file declares all auxiliary classes and functions that are
used in the HadoopParallelAlgebra. It includes the following classes:
|
||
|
|
|
||
|
|
* ~deLocalInfo~. Assists ~doubleexport~ operator.
|
||
|
|
|
||
|
|
* ~phjLocalInfo~. Assists ~parahashjoin~ operator.
|
||
|
|
|
||
|
|
And includes one method:
|
||
|
|
|
||
|
|
* ~renameList~. Assists ~parahashjoin~ operator
|
||
|
|
|
||
|
|
*/
|
||
|
|
#ifndef HADOOPPARALLELALGEBRA_H_
|
||
|
|
#define HADOOPPARALLELALGEBRA_H_
|
||
|
|
|
||
|
|
#define MAX_WAITING_TIME 10
|
||
|
|
|
||
|
|
#include "RTuple.h"
|
||
|
|
#include "Algebras/FText/FTextAlgebra.h"
|
||
|
|
#include "Base64.h"
|
||
|
|
#ifdef SECONDO_WIN32
|
||
|
|
#include <winsock2.h>
|
||
|
|
#endif
|
||
|
|
#include <sys/socket.h>
|
||
|
|
#include "TupleQueueHP.h"
|
||
|
|
|
||
|
|
#include "FileSystem.h"
|
||
|
|
#include "Profiles.h"
|
||
|
|
#include "Progress.h"
|
||
|
|
#include "Algebras/Temporal/TemporalAlgebra.h"
|
||
|
|
#include <ifaddrs.h>
|
||
|
|
#include <arpa/inet.h>
|
||
|
|
#include <pthread.h>
|
||
|
|
#include <signal.h>
|
||
|
|
|
||
|
|
|
||
|
|
const int MAX_COPYTIMES = 50;
|
||
|
|
const size_t MAX_OPENFILE_NUM = 100;
|
||
|
|
const std::string dbLoc = "<READ DB/>";
|
||
|
|
|
||
|
|
class fList;
|
||
|
|
|
||
|
|
//Uses Function from ArrayAlgebra
|
||
|
|
namespace arrayalgebra{
|
||
|
|
void extractIds(const ListExpr,int&,int&);
|
||
|
|
}
|
||
|
|
|
||
|
|
|
||
|
|
std::string tranStr(const std::string& s, const std::string& from,
|
||
|
|
const std::string& to);
|
||
|
|
std::string getLocalFilePath(std::string path, const std::string fileName,
|
||
|
|
std::string suffix, bool appendFileName = true);
|
||
|
|
std::string addFileIndex(std::string fileName, int index);
|
||
|
|
ListExpr AntiNumericType(ListExpr numericType);
|
||
|
|
int copyFile(std::string source, std::string dest, bool cfn = false);
|
||
|
|
Tuple* readTupleFromFile(std::ifstream* file, TupleType* type, int mode,
|
||
|
|
std::string flobFile = "");
|
||
|
|
int getRoundRobinIndex(int row, int clusterSize);
|
||
|
|
ListExpr rmTermNL(ListExpr list, std::string term, int& count);
|
||
|
|
NList addIncomplete(const NList& attrList);
|
||
|
|
|
||
|
|
|
||
|
|
//Set up the remote copy command options uniformly.
|
||
|
|
const std::string scpCommand = "scp -q ";
|
||
|
|
|
||
|
|
|
||
|
|
/*
|
||
|
|
1.1 deLocalInfo Class
|
||
|
|
|
||
|
|
Assists ~doubleexport~ operator.
|
||
|
|
First traverse tuples from streamA to the end, then traverse streamB.
|
||
|
|
|
||
|
|
*/
|
||
|
|
class deLocalInfo
|
||
|
|
{
|
||
|
|
private:
|
||
|
|
|
||
|
|
Word streamA;
|
||
|
|
Word streamB;
|
||
|
|
bool isAEnd;
|
||
|
|
//Check out whether gets to the end of streamA.
|
||
|
|
|
||
|
|
int attrIndexA;
|
||
|
|
int attrIndexB;
|
||
|
|
|
||
|
|
TupleType *resultTupleType;
|
||
|
|
|
||
|
|
Tuple* makeTuple(Word stream, int index, int sig);
|
||
|
|
|
||
|
|
public:
|
||
|
|
deLocalInfo(Word _streamA, Word wAttrIndexA,
|
||
|
|
Word _streamB, Word wAttrIndexB,
|
||
|
|
Supplier s);
|
||
|
|
|
||
|
|
~deLocalInfo()
|
||
|
|
{
|
||
|
|
//delete resultTupleType;
|
||
|
|
if (resultTupleType != 0)
|
||
|
|
resultTupleType->DeleteIfAllowed();
|
||
|
|
}
|
||
|
|
|
||
|
|
Tuple* nextResultTuple();
|
||
|
|
};
|
||
|
|
|
||
|
|
/*
|
||
|
|
1.2 renameList Method
|
||
|
|
|
||
|
|
Assists ~parahashjoin~ operator.
|
||
|
|
Append an ~appendName~ after each attribute name of ~oldTupleList~,
|
||
|
|
to avoid the situation that joined relations have a same attribute name.
|
||
|
|
|
||
|
|
*/
|
||
|
|
ListExpr renameList(ListExpr oldTupleList, std::string appendName);
|
||
|
|
|
||
|
|
/*
|
||
|
|
1.3 binEncode Method
|
||
|
|
|
||
|
|
Assist ~doubleexport~, ~add0Tuple~ operator.
|
||
|
|
Transform a nestedList into a transportable Base 64 binary string
|
||
|
|
|
||
|
|
*/
|
||
|
|
std::string binEncode(ListExpr nestList);
|
||
|
|
|
||
|
|
/*
|
||
|
|
1.4 binDecode Method
|
||
|
|
|
||
|
|
Assist ~parahashjoin~, ~parajoin~ operator.
|
||
|
|
To decode a Base 64 binary string back to nestedList efficiently.
|
||
|
|
|
||
|
|
*/
|
||
|
|
ListExpr binDecode(std::string binStr);
|
||
|
|
|
||
|
|
/*
|
||
|
|
1.3 phjLocalInfo Class
|
||
|
|
|
||
|
|
Assists ~parahashjoin~ operator.
|
||
|
|
~getNewProducts~ collects tuples in each buckets,
|
||
|
|
and make products if the tuples come from different sources.
|
||
|
|
Put the product results in the TupleBuffer ~joinedTuples~.
|
||
|
|
|
||
|
|
|
||
|
|
*/
|
||
|
|
class phjLocalInfo
|
||
|
|
{
|
||
|
|
private:
|
||
|
|
|
||
|
|
Word mixStream;
|
||
|
|
TupleType *resultTupleType;
|
||
|
|
TupleType *tupleTypeA, *tupleTypeB;
|
||
|
|
|
||
|
|
TupleBuffer *joinedTuples;
|
||
|
|
GenericRelationIterator *tupleIterator;
|
||
|
|
|
||
|
|
GenericRelationIterator* getNewProducts( Supplier s);
|
||
|
|
|
||
|
|
public:
|
||
|
|
phjLocalInfo(Word _stream, Supplier s,
|
||
|
|
ListExpr ttA, ListExpr ttB);
|
||
|
|
|
||
|
|
~phjLocalInfo()
|
||
|
|
{
|
||
|
|
if (joinedTuples != 0)
|
||
|
|
delete joinedTuples;
|
||
|
|
joinedTuples = 0;
|
||
|
|
|
||
|
|
if (resultTupleType != 0)
|
||
|
|
resultTupleType->DeleteIfAllowed();
|
||
|
|
resultTupleType = 0;
|
||
|
|
if (tupleTypeA != 0)
|
||
|
|
tupleTypeA->DeleteIfAllowed();
|
||
|
|
tupleTypeA = 0;
|
||
|
|
if (tupleTypeB != 0)
|
||
|
|
tupleTypeB->DeleteIfAllowed();
|
||
|
|
tupleTypeB = 0;
|
||
|
|
|
||
|
|
if (tupleIterator != 0)
|
||
|
|
delete tupleIterator;
|
||
|
|
tupleIterator = 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
Word nextJoinTuple( Supplier s );
|
||
|
|
};
|
||
|
|
|
||
|
|
|
||
|
|
/*
|
||
|
|
1.4 pjLocalInfo Class
|
||
|
|
|
||
|
|
Assists ~parajoin~ operator.
|
||
|
|
|
||
|
|
*/
|
||
|
|
typedef enum { tupBufferA, tupBufferB } tupleBufferType;
|
||
|
|
|
||
|
|
class pjLocalInfo
|
||
|
|
{
|
||
|
|
private:
|
||
|
|
|
||
|
|
Word mixedStream;
|
||
|
|
Supplier JNfun;
|
||
|
|
TupleType *tupleTypeA, *tupleTypeB;
|
||
|
|
|
||
|
|
TupleBuffer *tbA;
|
||
|
|
GenericRelationIterator *itrA;
|
||
|
|
TupleBuffer *tbB;
|
||
|
|
GenericRelationIterator *itrB;
|
||
|
|
const int maxMem;
|
||
|
|
bool endOfStream;
|
||
|
|
bool isBufferFilled;
|
||
|
|
|
||
|
|
//Get the tuples within one bucket,
|
||
|
|
//and fill them into tupleBuffers
|
||
|
|
void loadTuples();
|
||
|
|
public:
|
||
|
|
pjLocalInfo(Word inputStream, Supplier fun, Supplier s,
|
||
|
|
ListExpr ttA, ListExpr ttB,
|
||
|
|
int mem) :
|
||
|
|
mixedStream(inputStream),
|
||
|
|
tbA(0),
|
||
|
|
itrA(0),
|
||
|
|
tbB(0),
|
||
|
|
itrB(0),
|
||
|
|
maxMem(mem),
|
||
|
|
endOfStream(false),
|
||
|
|
isBufferFilled(false)
|
||
|
|
{
|
||
|
|
//Indicator both arguments of the function accept stream
|
||
|
|
JNfun = fun;
|
||
|
|
qp->SetupStreamArg(JNfun, 1, s);
|
||
|
|
qp->SetupStreamArg(JNfun, 2, s);
|
||
|
|
qp->Open(JNfun);
|
||
|
|
|
||
|
|
tupleTypeA = new TupleType(ttA);
|
||
|
|
tupleTypeB = new TupleType(ttB);
|
||
|
|
}
|
||
|
|
|
||
|
|
~pjLocalInfo()
|
||
|
|
{
|
||
|
|
if (tbA != 0)
|
||
|
|
delete tbA;
|
||
|
|
tbA = 0;
|
||
|
|
if (tbB != 0)
|
||
|
|
delete tbB;
|
||
|
|
tbB = 0;
|
||
|
|
|
||
|
|
if (tupleTypeA != 0)
|
||
|
|
tupleTypeA->DeleteIfAllowed();
|
||
|
|
tupleTypeA = 0;
|
||
|
|
if (tupleTypeB != 0)
|
||
|
|
tupleTypeB->DeleteIfAllowed();
|
||
|
|
tupleTypeB = 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
// Get the next tuple from tupleBuffer A or tuppleBuffer B.
|
||
|
|
Tuple* getNextInputTuple(tupleBufferType tbt);
|
||
|
|
|
||
|
|
// Get the next result tuple
|
||
|
|
void* getNextTuple();
|
||
|
|
};
|
||
|
|
|
||
|
|
/*
|
||
|
|
1.5 a0tLocalInfo
|
||
|
|
|
||
|
|
Assists ~add0Tuple~ operator.
|
||
|
|
|
||
|
|
*/
|
||
|
|
struct a0tLocalInfo
|
||
|
|
{
|
||
|
|
std::string key;
|
||
|
|
TupleType *resultTupleType;
|
||
|
|
Tuple *cachedTuple;
|
||
|
|
bool needInsert;
|
||
|
|
std::string sepTupleStr;
|
||
|
|
|
||
|
|
inline a0tLocalInfo(ListExpr rtNL):
|
||
|
|
key(""), cachedTuple(0),
|
||
|
|
needInsert(false)
|
||
|
|
{
|
||
|
|
resultTupleType = new TupleType(nl->Second(rtNL));
|
||
|
|
}
|
||
|
|
|
||
|
|
inline ~a0tLocalInfo()
|
||
|
|
{
|
||
|
|
if (resultTupleType != 0)
|
||
|
|
resultTupleType->DeleteIfAllowed();
|
||
|
|
resultTupleType = 0;
|
||
|
|
|
||
|
|
if ( cachedTuple != 0)
|
||
|
|
cachedTuple->DeleteIfAllowed();
|
||
|
|
cachedTuple = 0;
|
||
|
|
|
||
|
|
}
|
||
|
|
};
|
||
|
|
|
||
|
|
/*
|
||
|
|
1.6 clusterInfo
|
||
|
|
|
||
|
|
We define two list files in all nodes involved in a parallel Secondo.
|
||
|
|
These files list all nodes' ip addresses, cell file locations,
|
||
|
|
and Secondo Monitor's access ports.
|
||
|
|
|
||
|
|
These two files are separated to master and slaves list.
|
||
|
|
The master list has only one line, and the slave list has one line
for each disk of the slaves.
|
||
|
|
These two files are specified in nodes, by setting their
|
||
|
|
PARALLEL\_SECONDO\_MASTER and PARALLEL\_SECONDO\_SLAVES
|
||
|
|
environment variables.
|
||
|
|
|
||
|
|
This class is used to verify these two files.
|
||
|
|
|
||
|
|
*/
|
||
|
|
class clusterInfo
|
||
|
|
{
|
||
|
|
public:
|
||
|
|
clusterInfo();
|
||
|
|
clusterInfo(clusterInfo& rhg);
|
||
|
|
|
||
|
|
~clusterInfo(){
|
||
|
|
if(dataServers){
|
||
|
|
delete dataServers;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
std::string getRemotePath(size_t loc,
|
||
|
|
bool includeMaster = true,
|
||
|
|
bool round = false,
|
||
|
|
bool appendTargetIP = true,
|
||
|
|
bool appendFileName = false,
|
||
|
|
std::string fileName = "",
|
||
|
|
bool attachProducerIP = false,
|
||
|
|
std::string producerIP = "");
|
||
|
|
|
||
|
|
std::string getMSECPath(size_t loc,
|
||
|
|
bool includeMaster = true, bool round = false,
|
||
|
|
bool appendIP = true);
|
||
|
|
/*
|
||
|
|
Get the remote mini-Secondo path
|
||
|
|
|
||
|
|
*/
|
||
|
|
|
||
|
|
std::string getIP(size_t loc, bool round = false);
|
||
|
|
|
||
|
|
int getPort(size_t loc, bool round = false);
|
||
|
|
|
||
|
|
inline int getLocalNode(){
|
||
|
|
if (localNode < 0)
|
||
|
|
localNode = searchLocalNode();
|
||
|
|
return localNode;
|
||
|
|
}
|
||
|
|
|
||
|
|
inline int getMasterNode(){
|
||
|
|
return masterNode;
|
||
|
|
}
|
||
|
|
|
||
|
|
inline bool isLocalTheMaster(){
|
||
|
|
return (getLocalNode() == masterNode);
|
||
|
|
}
|
||
|
|
|
||
|
|
std::string getLocalIP();
|
||
|
|
|
||
|
|
/*
|
||
|
|
~getLocalPath~ function can only return the constant value
|
||
|
|
SecondoFilePath that is defined in SECONDO\_CONFIG file.
|
||
|
|
If the file location defined in PARALLEL\_SECONDO\_SLAVES
|
||
|
|
is different from the default value,
|
||
|
|
then the ~searchLocalNode~ function cannot return a correct result.
|
||
|
|
|
||
|
|
*/
|
||
|
|
inline std::string getLocalPath()
|
||
|
|
{
|
||
|
|
|
||
|
|
std::string confPath = std::string(getenv("SECONDO_CONFIG"));
|
||
|
|
std::string result = SmiProfile::GetParameter("ParallelSecondo",
|
||
|
|
"SecondoFilePath","", confPath);
|
||
|
|
if (result.find_last_of("/") == result.length() - 1)
|
||
|
|
result = result.substr(0, result.length() - 1);
|
||
|
|
|
||
|
|
return result;
|
||
|
|
}
|
||
|
|
|
||
|
|
inline bool isOK(){ return available; }
|
||
|
|
/*
|
||
|
|
Number of slaves
|
||
|
|
|
||
|
|
*/
|
||
|
|
inline size_t getSlaveSize(){
|
||
|
|
if (dataServers){
|
||
|
|
return (dataServers->size() - 1);
|
||
|
|
}
|
||
|
|
else
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
Number of all DSs. If the mDS is also a sDS, then it is counted repeatedly.
|
||
|
|
|
||
|
|
*/
|
||
|
|
inline size_t getClusterSize(){
|
||
|
|
if (dataServers){
|
||
|
|
return dataServers->size();
|
||
|
|
}
|
||
|
|
else
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
void print();
|
||
|
|
|
||
|
|
NList toNestedList();
|
||
|
|
bool covers(NList& clusterList);
|
||
|
|
size_t getInterIndex(size_t loc,bool includeMaster,bool round);
|
||
|
|
|
||
|
|
private:
|
||
|
|
std::string ps_master;
|
||
|
|
std::string ps_slaves;
|
||
|
|
typedef std::pair<std::string, std::pair<std::string, int> > dservDesc;
|
||
|
|
//data server description
|
||
|
|
std::vector<dservDesc> *dataServers;
|
||
|
|
bool available;
|
||
|
|
int localNode;
|
||
|
|
int masterNode;
|
||
|
|
|
||
|
|
int searchLocalNode();
|
||
|
|
std::vector<std::string>* getAvailableIPAddrs();
|
||
|
|
};
|
||
|
|
|
||
|
|
/*
|
||
|
|
1.4 FFeedLocalInfo Class
|
||
|
|
|
||
|
|
Assists ~ffeed~ operator.
|
||
|
|
Support progress estimation.
|
||
|
|
|
||
|
|
*/
|
||
|
|
class FFeedLocalInfo: public ProgressLocalInfo
|
||
|
|
{
|
||
|
|
public:
|
||
|
|
FFeedLocalInfo( Supplier s, bool _nf, int _prd, std::string _fp)
|
||
|
|
: tupleBlockFile(0), fileFound(false), noFlob(_nf), filePath(_fp)
|
||
|
|
{
|
||
|
|
|
||
|
|
if (noFlob)
|
||
|
|
{
|
||
|
|
/*
|
||
|
|
Prepared for the ~ffeed3~ operator.
|
||
|
|
The type in data file contains no DS\_IDX, while the type for output needs DS\_IDX
|
||
|
|
|
||
|
|
*/
|
||
|
|
std::string ostStr = ((FText*)qp->Request(
|
||
|
|
qp->GetSupplierSon(s, 4)).addr)->GetValue();
|
||
|
|
ListExpr rcdTypeList;
|
||
|
|
nl->ReadFromString(ostStr, rcdTypeList);
|
||
|
|
rcdTupleType = new TupleType(SecondoSystem::GetCatalog()
|
||
|
|
->NumericType(nl->Second(rcdTypeList)));
|
||
|
|
|
||
|
|
ListExpr newTypeList = qp->GetType(s);
|
||
|
|
newTupleType = new TupleType(SecondoSystem::GetCatalog()
|
||
|
|
->NumericType(nl->Second(newTypeList)));
|
||
|
|
}
|
||
|
|
else
|
||
|
|
{
|
||
|
|
ListExpr streamTypeList = qp->GetType(s);
|
||
|
|
rcdTupleType = new TupleType(SecondoSystem::GetCatalog()
|
||
|
|
->NumericType(nl->Second(streamTypeList)));
|
||
|
|
newTupleType = rcdTupleType;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
~FFeedLocalInfo() {
|
||
|
|
if (tupleBlockFile){
|
||
|
|
tupleBlockFile->close();
|
||
|
|
delete tupleBlockFile;
|
||
|
|
tupleBlockFile = 0;
|
||
|
|
}
|
||
|
|
if (rcdTupleType){
|
||
|
|
rcdTupleType->DeleteIfAllowed();
|
||
|
|
}
|
||
|
|
if (noFlob && newTupleType){
|
||
|
|
newTupleType->DeleteIfAllowed();
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
bool fetchBlockFile(
|
||
|
|
std::string relName, std::string fileSuffix, Supplier s,
|
||
|
|
int pdi = -1, int tgi = -1, int att = -1);
|
||
|
|
|
||
|
|
Tuple* getNextTuple(); //~ffeed~
|
||
|
|
Tuple* getNextTuple2(); //~ffeed2~
|
||
|
|
Tuple* getNextTuple3(); //~ffeed3~
|
||
|
|
|
||
|
|
std::ifstream *tupleBlockFile;
|
||
|
|
TupleType* rcdTupleType; //The tuple type in the data file
|
||
|
|
TupleType* newTupleType; //The output tuple type
|
||
|
|
|
||
|
|
/*
|
||
|
|
In ~ffeed~, the rcdTupleType and the newTupleType are the same.
|
||
|
|
In ~ffeed2~, they are different, the newTupleType has
|
||
|
|
an additional integer attribute DS\_IDX.
|
||
|
|
|
||
|
|
*/
|
||
|
|
|
||
|
|
private:
|
||
|
|
bool isLocalFileExist(std::string filePath);
|
||
|
|
bool fileFound;
|
||
|
|
bool noFlob;
|
||
|
|
std::string filePath;
|
||
|
|
};
|
||
|
|
|
||
|
|
/*
|
||
|
|
1.5 structure ~fconsumeLocalInfo~
|
||
|
|
|
||
|
|
Assist ~fconsume~ operator.
|
||
|
|
|
||
|
|
*/
|
||
|
|
// Plain state record of the ~fconsume~ operator.
// NOTE(review): the meaning of both counters is defined by the operator's
// value mapping, which is not visible in this header.
struct fconsumeLocalInfo
{
  int state;
  int current;
};
|
||
|
|
|
||
|
|
/*
|
||
|
|
1.6 hdpJoinLocalInfo class
|
||
|
|
|
||
|
|
*/
|
||
|
|
|
||
|
|
class hdpJoinLocalInfo
|
||
|
|
{
|
||
|
|
private:
|
||
|
|
std::vector<std::pair<int, int> > resultList;
|
||
|
|
std::vector<std::pair<int, int> >::iterator it;
|
||
|
|
TupleType *resultTupleType;
|
||
|
|
|
||
|
|
public:
|
||
|
|
hdpJoinLocalInfo(Supplier s){
|
||
|
|
|
||
|
|
ListExpr resultTTList = GetTupleResultType(s);
|
||
|
|
resultTupleType = new TupleType(nl->Second(resultTTList));
|
||
|
|
}
|
||
|
|
|
||
|
|
void insertPair(std::pair<int, int> elem){
|
||
|
|
resultList.push_back(elem);
|
||
|
|
}
|
||
|
|
|
||
|
|
void setIterator(){
|
||
|
|
it = resultList.begin();
|
||
|
|
}
|
||
|
|
|
||
|
|
Tuple* getTuple()
|
||
|
|
{
|
||
|
|
if (it == resultList.end())
|
||
|
|
return 0;
|
||
|
|
else {
|
||
|
|
Tuple *t = new Tuple(resultTupleType);
|
||
|
|
t->PutAttribute(0, new CcInt((*it).first));
|
||
|
|
t->PutAttribute(1, new CcInt((*it).second));
|
||
|
|
|
||
|
|
it++;
|
||
|
|
return t;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
};
|
||
|
|
|
||
|
|
|
||
|
|
/*
|
||
|
|
1.7 fDistributeLocalInfo class
|
||
|
|
|
||
|
|
*/
|
||
|
|
class fileInfo{
|
||
|
|
public:
|
||
|
|
fileInfo(size_t _cs, std::string _fp, std::string _fn,
|
||
|
|
size_t _an, int _rs, bool _nf = false, SmiRecordId _sid = -1);
|
||
|
|
|
||
|
|
bool openFile();
|
||
|
|
|
||
|
|
void closeFile();
|
||
|
|
|
||
|
|
bool writeTuple(Tuple* tuple, size_t tupleIndex, TupleType* exTupleType,
|
||
|
|
int ai, bool kai, int aj = -1, bool kaj = false);
|
||
|
|
|
||
|
|
int writeLastDscr();
|
||
|
|
|
||
|
|
~fileInfo()
|
||
|
|
{
|
||
|
|
if (fileOpen){
|
||
|
|
blockFile.close();
|
||
|
|
fileOpen = false;
|
||
|
|
}
|
||
|
|
attrExtSize->clear();
|
||
|
|
delete attrExtSize;
|
||
|
|
attrSize->clear();
|
||
|
|
delete attrSize;
|
||
|
|
}
|
||
|
|
|
||
|
|
inline std::string getFileName() {
|
||
|
|
return blockFileName;
|
||
|
|
}
|
||
|
|
|
||
|
|
inline std::string getFilePath() {
|
||
|
|
return blockFilePath;
|
||
|
|
}
|
||
|
|
inline bool isFileOpen()
|
||
|
|
{ return fileOpen; }
|
||
|
|
inline size_t getLastTupleIndex()
|
||
|
|
{ return lastTupleIndex; }
|
||
|
|
|
||
|
|
private:
|
||
|
|
|
||
|
|
int cnt;
|
||
|
|
double totalExtSize;
|
||
|
|
double totalSize;
|
||
|
|
std::vector<double>* attrExtSize;
|
||
|
|
std::vector<double>* attrSize;
|
||
|
|
|
||
|
|
std::string blockFilePath, blockFileName;
|
||
|
|
SmiFileId flobFileId;
|
||
|
|
SmiRecordId sourceDS;
|
||
|
|
std::string flobFilePath, flobFileName;
|
||
|
|
std::ofstream blockFile, flobFile;
|
||
|
|
SmiSize flobBlockOffset;
|
||
|
|
std::map<std::pair<SmiFileId, SmiRecordId>, SmiSize> flobIdCache;
|
||
|
|
|
||
|
|
size_t lastTupleIndex;
|
||
|
|
bool fileOpen, noFlob;
|
||
|
|
};
|
||
|
|
|
||
|
|
// Ordering predicate for fileInfo pointers: true if ~fi1~ has a larger
// last tuple index than ~fi2~.
static bool compFileInfo(fileInfo* fi1, fileInfo* fi2)
{
  const size_t firstIndex = fi1->getLastTupleIndex();
  const size_t secondIndex = fi2->getLastTupleIndex();
  return secondIndex < firstIndex;
}
|
||
|
|
class FDistributeLocalInfo
|
||
|
|
{
|
||
|
|
|
||
|
|
private:
|
||
|
|
size_t HashTuple(Tuple* tuple)
|
||
|
|
{
|
||
|
|
size_t hashValue =
|
||
|
|
((Attribute*)tuple->GetAttribute(attrIndex))->HashValue();
|
||
|
|
if (nBuckets != 0)
|
||
|
|
return hashValue % nBuckets;
|
||
|
|
else
|
||
|
|
return hashValue;
|
||
|
|
}
|
||
|
|
bool openFile(fileInfo* tgtFile);
|
||
|
|
|
||
|
|
size_t nBuckets;
|
||
|
|
int attrIndex;
|
||
|
|
bool kpa;
|
||
|
|
TupleType *resultTupleType, *exportTupleType;
|
||
|
|
size_t tupleCounter;
|
||
|
|
|
||
|
|
std::string fileBaseName;
|
||
|
|
int rowNumSuffix;
|
||
|
|
std::string filePath;
|
||
|
|
std::map<size_t, fileInfo*> fileList;
|
||
|
|
std::map<size_t, fileInfo*>::iterator fit;
|
||
|
|
|
||
|
|
//data remote variables
|
||
|
|
int firstDupTarget, dupTimes, localIndex;
|
||
|
|
std::string cnIP; //current node IP
|
||
|
|
clusterInfo *ci;
|
||
|
|
bool* copyList;
|
||
|
|
bool noFlob;
|
||
|
|
SmiRecordId sourceDS;
|
||
|
|
bool ok;
|
||
|
|
|
||
|
|
//~openFileList~ keeps at most MAX_FILEHANDLENUM file handles.
|
||
|
|
std::vector<fileInfo*> openFileList;
|
||
|
|
bool duplicateOneFile(fileInfo* fi);
|
||
|
|
|
||
|
|
public:
|
||
|
|
FDistributeLocalInfo(std::string baseName, int rowNum,
|
||
|
|
std::string path,
|
||
|
|
int nBuckets, int attrIndex,
|
||
|
|
bool keepKeyAttribute,
|
||
|
|
ListExpr resultTupleType,
|
||
|
|
ListExpr inputTupleType,
|
||
|
|
int dtIndex, int dupTimes, bool noFlob);
|
||
|
|
|
||
|
|
bool insertTuple(Word tuple);
|
||
|
|
bool startCloseFiles();
|
||
|
|
Tuple* closeOneFile();
|
||
|
|
|
||
|
|
~FDistributeLocalInfo(){
|
||
|
|
std::map<size_t, fileInfo*>::iterator fit;
|
||
|
|
for(fit = fileList.begin(); fit != fileList.end(); fit++)
|
||
|
|
{ delete (*fit).second; }
|
||
|
|
|
||
|
|
openFileList.clear();
|
||
|
|
fileList.clear();
|
||
|
|
if (resultTupleType)
|
||
|
|
resultTupleType->DeleteIfAllowed();
|
||
|
|
if (exportTupleType)
|
||
|
|
exportTupleType->DeleteIfAllowed();
|
||
|
|
}
|
||
|
|
|
||
|
|
bool isOK(){ return ok;}
|
||
|
|
};
|
||
|
|
|
||
|
|
/*
|
||
|
|
1.8 FetchFlobLocalInfo class
|
||
|
|
|
||
|
|
*/
|
||
|
|
|
||
|
|
class FlobSheet1;
|
||
|
|
|
||
|
|
class FetchFlobLocalInfo1 : public ProgressLocalInfo
|
||
|
|
{
|
||
|
|
public:
|
||
|
|
FetchFlobLocalInfo1(const Supplier s, NList resultTypeList,
|
||
|
|
NList raList, NList daList);
|
||
|
|
|
||
|
|
~FetchFlobLocalInfo1();
|
||
|
|
|
||
|
|
Tuple* getNextTuple(const Supplier s);
|
||
|
|
|
||
|
|
void fetching2prepared(FlobSheet1* fs);
|
||
|
|
/*
|
||
|
|
When one Flob sheet is finished, copy the file back
|
||
|
|
and set it into the preparedSheets.
|
||
|
|
|
||
|
|
*/
|
||
|
|
|
||
|
|
void clearFetchedFiles();
|
||
|
|
|
||
|
|
inline int getLocalDS(){
|
||
|
|
return ci->getLocalNode();
|
||
|
|
}
|
||
|
|
|
||
|
|
inline std::string getMSecPath(int dest, bool appendIP = true){
|
||
|
|
return ci->getMSECPath(dest, true, false, appendIP);
|
||
|
|
}
|
||
|
|
|
||
|
|
inline std::string getPSFSPath(int dest, bool appendIP = true){
|
||
|
|
return ci->getRemotePath(dest, true, false, appendIP);
|
||
|
|
}
|
||
|
|
|
||
|
|
inline std::string getIP(int dest) {
|
||
|
|
return ci->getIP(dest);
|
||
|
|
}
|
||
|
|
|
||
|
|
private:
|
||
|
|
Tuple* setResultTuple(Tuple* tuple);
|
||
|
|
// Tuple* setLocalFlob(Tuple* tuple);
|
||
|
|
|
||
|
|
Tuple* readLocalFlob(Tuple* tuple);
|
||
|
|
int getSheetKey(const std::vector<int>& sdss);
|
||
|
|
int getMaxSheetKey();
|
||
|
|
|
||
|
|
std::string LFPath; //local flob file path
|
||
|
|
NList faList;
|
||
|
|
NList daList; //deleted attribute list
|
||
|
|
TupleType* resultType;
|
||
|
|
|
||
|
|
/*
|
||
|
|
Use array instead of NList for the attribute list faList,
|
||
|
|
since the NList is NOT thread-safe.
|
||
|
|
|
||
|
|
*/
|
||
|
|
int* faVec;
|
||
|
|
size_t faLen;
|
||
|
|
|
||
|
|
clusterInfo *ci;
|
||
|
|
int cds; //index of the current DS
|
||
|
|
bool moreInput;
|
||
|
|
|
||
|
|
/*
|
||
|
|
Three sheet lists are used to indicate the their status:
|
||
|
|
|
||
|
|
* standby: unsent sheets, collecting input tuples and their FlobOrder.
|
||
|
|
Each DS has one slot here.
|
||
|
|
|
||
|
|
* fetching: preparing the sheet by copying the flob file within a thread.
|
||
|
|
|
||
|
|
* prepared: the flob file is prepared. It has unlimited size.
|
||
|
|
|
||
|
|
|
||
|
|
Instead of using the fetching vector, I simply use a fetchingNum to find how many
|
||
|
|
sheets are being fetched.
|
||
|
|
During the fetching procedure, the sheet is kept inside the thread object.
|
||
|
|
Since using the vector makes the same sheet to be kept twice:
|
||
|
|
in the thread and the vector.
|
||
|
|
|
||
|
|
*/
|
||
|
|
|
||
|
|
std::vector<FlobSheet1*>* standby;
|
||
|
|
int fetchingNum;
|
||
|
|
std::vector<FlobSheet1*>* prepared;
|
||
|
|
int *sheetCounter; //Count the sent sheet number for each DS
|
||
|
|
std::vector<std::string>* fetchedFiles;
|
||
|
|
int preparedNum;
|
||
|
|
FlobSheet1* pfs;
|
||
|
|
|
||
|
|
int maxSheetMem;
|
||
|
|
static const int PipeWidth = 20;
|
||
|
|
//thread tokens
|
||
|
|
bool tokenPass[PipeWidth];
|
||
|
|
pthread_t threadID[PipeWidth];
|
||
|
|
static void* sendSheetThread(void* ptr);
|
||
|
|
bool sendSheet(FlobSheet1* fs, int times);
|
||
|
|
static pthread_mutex_t FFLI_mutex1;
|
||
|
|
|
||
|
|
};
|
||
|
|
|
||
|
|
/*
|
||
|
|
Thread for fetching a remote file
|
||
|
|
|
||
|
|
*/
|
||
|
|
class FFLI_Thread1
|
||
|
|
{
|
||
|
|
public:
|
||
|
|
FFLI_Thread1(FetchFlobLocalInfo1* pt, FlobSheet1* _fs, int _t, int _tk):
|
||
|
|
ffli(pt), sheet(_fs), times(_t), token(_tk){}
|
||
|
|
|
||
|
|
FetchFlobLocalInfo1* ffli;
|
||
|
|
FlobSheet1* sheet;
|
||
|
|
int times;
|
||
|
|
int token;
|
||
|
|
};
|
||
|
|
|
||
|
|
|
||
|
|
/*
|
||
|
|
Flob orders prepared for one tuple,
|
||
|
|
which may contain Flobs coming from several Data Servers.
|
||
|
|
|
||
|
|
*/
|
||
|
|
class FlobSheet1
|
||
|
|
{
|
||
|
|
public:
|
||
|
|
|
||
|
|
FlobSheet1(const std::vector<int>& _sources, const int _si,
|
||
|
|
const NList& _faList, int maxMemory)
|
||
|
|
: sheetIndex(_si), cachedSize(0), maxMem(maxMemory), faList(_faList){
|
||
|
|
|
||
|
|
flobFiles = new std::map<int, std::pair<std::string, size_t> >();
|
||
|
|
buffer = new TupleBuffer(maxMem);
|
||
|
|
it = 0;
|
||
|
|
rtCounter = 0;
|
||
|
|
toCounter = 0;
|
||
|
|
|
||
|
|
faLen = faList.length();
|
||
|
|
faVec = new int[faLen];
|
||
|
|
int faCounter = 0;
|
||
|
|
NList rest = faList;
|
||
|
|
while (!rest.isEmpty()){
|
||
|
|
int ai = rest.first().intval();
|
||
|
|
int source = _sources[faCounter];
|
||
|
|
flobFiles->insert(std::make_pair(ai, std::make_pair("", 0)));
|
||
|
|
sourceDSs.push_back(source);
|
||
|
|
faVec[faCounter] = ai;
|
||
|
|
rest.rest();
|
||
|
|
faCounter++;
|
||
|
|
}
|
||
|
|
finished = false;
|
||
|
|
}
|
||
|
|
|
||
|
|
~FlobSheet1(){
|
||
|
|
delete flobFiles;
|
||
|
|
delete buffer;
|
||
|
|
}
|
||
|
|
|
||
|
|
bool addOrder(Tuple* tuple);
|
||
|
|
/*
|
||
|
|
Put one tuple into the sheet, if the buffer limit is not reached.
|
||
|
|
|
||
|
|
*/
|
||
|
|
|
||
|
|
Tuple* getCachedTuple();
|
||
|
|
|
||
|
|
const std::vector<int>& getSDSs() { return sourceDSs;}
|
||
|
|
|
||
|
|
inline void makeScan(){
|
||
|
|
it = buffer->MakeScan();
|
||
|
|
}
|
||
|
|
|
||
|
|
inline std::string getResultFilePath(int attrId){
|
||
|
|
return flobFiles->find(attrId)->second.first;
|
||
|
|
}
|
||
|
|
|
||
|
|
std::string setSheet(int source, int dest, int times, int attrId);
|
||
|
|
|
||
|
|
std::string setResultFile(int source, int dest, int times, int attrId);
|
||
|
|
|
||
|
|
inline bool isFinished(){ return finished;}
|
||
|
|
|
||
|
|
std::ostream& print(std::ostream& os) const {
|
||
|
|
|
||
|
|
os << "Sheet index is: " << sheetIndex << endl;
|
||
|
|
os << "source DSs: ";
|
||
|
|
for (std::vector<int>::const_iterator it = sourceDSs.begin();
|
||
|
|
it != sourceDSs.end(); it++){
|
||
|
|
os << (*it) << " ";
|
||
|
|
}
|
||
|
|
os << endl;
|
||
|
|
|
||
|
|
os << "flobFiles: ------------- " << endl;
|
||
|
|
for (std::map<int, std::pair<std::string, size_t> >::iterator
|
||
|
|
it = flobFiles->begin();
|
||
|
|
it != flobFiles->end(); it++ ){
|
||
|
|
os << "attrId(" << it->first << ") : " << it->second.first
|
||
|
|
<< " offset(" << it->second.second << ")" << endl;
|
||
|
|
}
|
||
|
|
os << "-----------------------------" << endl;
|
||
|
|
|
||
|
|
os << "maxsize (" << maxMem
|
||
|
|
<< ") cachedSize (" << cachedSize << ")" << endl;
|
||
|
|
os << "Cached tuples : " << buffer->GetNoTuples() << endl;
|
||
|
|
os << "Read tuples : " << rtCounter << endl;
|
||
|
|
os << "Prepared Flob Orders: " << toCounter << endl;
|
||
|
|
os << endl << endl;
|
||
|
|
|
||
|
|
return os;
|
||
|
|
}
|
||
|
|
|
||
|
|
inline int getSheetIndex() {
|
||
|
|
return sheetIndex;
|
||
|
|
}
|
||
|
|
|
||
|
|
private:
|
||
|
|
// Map with : (attrId (flobFilePath, flobOffset))
|
||
|
|
//Collect one Flob for each attribute, hence use the attribute id as the key
|
||
|
|
std::map<int, std::pair<std::string, size_t> >* flobFiles;
|
||
|
|
|
||
|
|
std::vector<int> sourceDSs;
|
||
|
|
int sheetIndex;
|
||
|
|
int cachedSize;
|
||
|
|
/*
|
||
|
|
If it is larger than a certain threshold, stop adding other order,
|
||
|
|
then fetch the needed flob file.
|
||
|
|
|
||
|
|
*/
|
||
|
|
int maxMem;
|
||
|
|
|
||
|
|
size_t toCounter; // tuple order counter
|
||
|
|
|
||
|
|
/*
|
||
|
|
newRecIds is used to record the id of all new Flobs created within this sheet.
|
||
|
|
It happens that a Flob is listed in the current sheet,
|
||
|
|
but has already been created in another sheet.
|
||
|
|
|
||
|
|
*/
|
||
|
|
std::set<SmiRecordId> newRecIds;
|
||
|
|
|
||
|
|
TupleBuffer* buffer;
|
||
|
|
GenericRelationIterator* it;
|
||
|
|
size_t rtCounter;
|
||
|
|
|
||
|
|
NList faList; //Flob Attribute list
|
||
|
|
size_t faLen;
|
||
|
|
int* faVec;
|
||
|
|
bool finished;
|
||
|
|
};
|
||
|
|
|
||
|
|
std::ostream& operator<<(std::ostream& os, const FlobSheet1& f);
|
||
|
|
|
||
|
|
/*
|
||
|
|
1.9 FetchFlobLocalInfo class (2nd version)
|
||
|
|
|
||
|
|
This update is prepared to avoid creating massive small flob files in the last version.
|
||
|
|
|
||
|
|
One FlobSheet collects the Flob data from the same DS of all tuples.
|
||
|
|
|
||
|
|
|
||
|
|
*/
|
||
|
|
|
||
|
|
class flobInfoT{
|
||
|
|
|
||
|
|
public:
|
||
|
|
flobInfoT(const Flob& flob) : pLob(flob){
|
||
|
|
cfOffset = rfOffset = 0;
|
||
|
|
mode = flob.getMode();
|
||
|
|
sourceDS = flob.getRecordId();
|
||
|
|
size = flob.getSize();
|
||
|
|
}
|
||
|
|
|
||
|
|
SmiSize cfOffset; //Offset within the collected file
|
||
|
|
SmiSize rfOffset; //Offset within the file being read
|
||
|
|
char mode;
|
||
|
|
SmiRecordId sourceDS;
|
||
|
|
SmiSize size;
|
||
|
|
|
||
|
|
Flob pLob;
|
||
|
|
};
|
||
|
|
|
||
|
|
typedef std::pair<SmiFileId, SmiSize> flobKeyT;
|
||
|
|
|
||
|
|
class FlobSheet
|
||
|
|
{
|
||
|
|
public:
|
||
|
|
|
||
|
|
FlobSheet(int source, int dest, int times, int maxMemory);
|
||
|
|
/*
|
||
|
|
Create the result flob file path here.
|
||
|
|
|
||
|
|
*/
|
||
|
|
|
||
|
|
bool addOrder(Flob* flob);
|
||
|
|
/*
|
||
|
|
Add one more flob order to the current sheet,
|
||
|
|
returns whether the sheet is full.
|
||
|
|
|
||
|
|
*/
|
||
|
|
|
||
|
|
SmiSize getCFOffset(SmiFileId fileId, SmiSize oldOffset);
|
||
|
|
|
||
|
|
SmiSize getRFOffset(SmiFileId fileId, SmiSize oldOffset);
|
||
|
|
|
||
|
|
void initializeAllFlobs();
|
||
|
|
|
||
|
|
void shuffleCollectedFlobs();
|
||
|
|
|
||
|
|
bool findInitializedFlob(Flob* result,
|
||
|
|
std::map<flobKeyT, flobInfoT>::iterator& mit);
|
||
|
|
|
||
|
|
void closeSheetFile();
|
||
|
|
|
||
|
|
void killAllCachedFlobs();
|
||
|
|
|
||
|
|
inline std::string getResultFile(){ return resultFlobFilePath;}
|
||
|
|
|
||
|
|
inline int getTimes() { return times;}
|
||
|
|
|
||
|
|
inline int getSource() { return sourceDSId; }
|
||
|
|
|
||
|
|
inline int getDest() { return destDSId; }
|
||
|
|
|
||
|
|
inline std::string getSheetPath() { return sheetFilePath;}
|
||
|
|
|
||
|
|
inline bool isAllPrepared() { return dataInitialized;}
|
||
|
|
|
||
|
|
std::ostream& print(std::ostream& os) const;
|
||
|
|
private:
|
||
|
|
|
||
|
|
std::string resultFlobFilePath;
|
||
|
|
int sourceDSId;
|
||
|
|
int destDSId;
|
||
|
|
int times;
|
||
|
|
size_t cachedSize;
|
||
|
|
size_t maxMem;
|
||
|
|
|
||
|
|
bool dataInitialized;
|
||
|
|
|
||
|
|
std::string sheetFilePath;
|
||
|
|
std::map<flobKeyT, flobInfoT> lobMarkers;
|
||
|
|
};
|
||
|
|
|
||
|
|
std::ostream& operator<<(std::ostream& os, const FlobSheet& f);
|
||
|
|
|
||
|
|
/*
|
||
|
|
Flob information for a tuple
|
||
|
|
|
||
|
|
*/
|
||
|
|
|
||
|
|
// Forward declaration; the class itself is defined further down in this header.
class FFLI_Thread;
|
||
|
|
|
||
|
|
/*
Flob bookkeeping for one tuple.

For every requested Flob attribute (first index, ~attrSize~ entries) and
every Flob slot of that attribute (second index, ~maxFlobSize~ entries)
the object stores the source DS id and the sheet times value. In addition
it carries the ~returned~ flag.

The object owns two heap allocated ~attrSize~ x ~maxFlobSize~ int tables.
Therefore it provides destructor, copy constructor *and* copy assignment;
without the latter, assigning one object to another (as containers do)
would share the tables and free them twice.

All table slots start with the value 0.

No index is range checked: callers must pass
~aid~ < ~attrSize~ and ~fid~ < ~maxFlobSize~.

*/
class TupleFlobInfo
{
public:
  TupleFlobInfo(size_t _aSize, size_t _mfSize):
    returned(false), attrSize(_aSize), maxFlobSize(_mfSize),
    dsVec(0), timesVec(0){
    allocateTables();
  }

  ~TupleFlobInfo(){
    releaseTables();
  }

  // Deep copy: the new object gets its own tables.
  TupleFlobInfo(const TupleFlobInfo& r):
    returned(r.returned), attrSize(r.attrSize), maxFlobSize(r.maxFlobSize),
    dsVec(0), timesVec(0){
    allocateTables();
    for (size_t i = 0; i < attrSize; i++){
      memcpy(dsVec[i], r.dsVec[i], maxFlobSize*sizeof(int));
      memcpy(timesVec[i], r.timesVec[i], maxFlobSize*sizeof(int));
    }
  }

  // Deep copy assignment (copy and swap). The old tables of *this are
  // released by the destructor of the temporary; self assignment is a no-op.
  TupleFlobInfo& operator=(const TupleFlobInfo& r){
    if (this != &r){
      TupleFlobInfo tmp(r);
      swapWith(tmp);
    }
    return *this;
  }

  inline void setReturned(){ returned = true; }

/*
Here the aid is not the attribute id,
but only the id among the requested Flob attributes

*/
  inline void setFlobInfo(size_t aid, size_t fid,
      int ds, int times){
    dsVec[aid][fid] = ds;
    timesVec[aid][fid] = times;
  }

  inline bool isReturned() const { return returned; }

  // Source DS id stored for Flob fId of requested attribute attrId.
  inline int getDS(size_t attrId, size_t fId) const {
    return dsVec[attrId][fId];
  }

  // Sheet times value stored for Flob fId of requested attribute attrId.
  inline int getSheetTimes(size_t attrId, size_t fId) const {
    return timesVec[attrId][fId];
  }

  // Writes the flag, both sizes and one line per table slot to os.
  std::ostream& print(std::ostream& os) const{
    os << "Returned: " << (returned ? "True" : "False") << std::endl;
    os << "Attribute Size: " << attrSize << std::endl;
    os << "Maximum Flob Size: " << maxFlobSize << std::endl;
    for (size_t i = 0; i < attrSize; i++){
      for (size_t k = 0; k < maxFlobSize; k++){
        os << "\t" << i << "_" << k << ": source(" << dsVec[i][k] << ") "
           << "times(" << timesVec[i][k] << ") "
           << std::endl;
      }
    }
    return os;
  }

private:
  // Not available to users; creates an empty object without tables, which
  // the destructor can handle safely.
  TupleFlobInfo():
    returned(false), attrSize(0), maxFlobSize(0),
    dsVec(0), timesVec(0){}

  // Allocates both tables according to attrSize / maxFlobSize.
  // The trailing () value-initializes every slot to 0.
  void allocateTables(){
    dsVec = new int*[attrSize];
    timesVec = new int*[attrSize];
    for (size_t i = 0; i < attrSize; i++){
      dsVec[i] = new int[maxFlobSize]();
      timesVec[i] = new int[maxFlobSize]();
    }
  }

  // Frees both tables; safe if they were never allocated.
  void releaseTables(){
    if (dsVec){
      for (size_t i = 0; i < attrSize; i++){
        delete[] dsVec[i];
      }
      delete[] dsVec;
      dsVec = 0;
    }
    if (timesVec){
      for (size_t i = 0; i < attrSize; i++){
        delete[] timesVec[i];
      }
      delete[] timesVec;
      timesVec = 0;
    }
  }

  // Exchanges the complete state with o; never throws.
  void swapWith(TupleFlobInfo& o){
    bool tmpReturned = returned;
    returned = o.returned;
    o.returned = tmpReturned;

    size_t tmpSize = attrSize;
    attrSize = o.attrSize;
    o.attrSize = tmpSize;

    tmpSize = maxFlobSize;
    maxFlobSize = o.maxFlobSize;
    o.maxFlobSize = tmpSize;

    int** tmpVec = dsVec;
    dsVec = o.dsVec;
    o.dsVec = tmpVec;

    tmpVec = timesVec;
    timesVec = o.timesVec;
    o.timesVec = tmpVec;
  }

  bool returned;
  size_t attrSize;
  size_t maxFlobSize; //The maximum Flob number for all Flob attributes
  int** dsVec;        //The DS list for all Flob attributes
  int** timesVec;     //The sheet times of all Flob attributes
};
|
||
|
|
|
||
|
|
// Stream output for a TupleFlobInfo; only declared here.
// NOTE(review): presumably forwards to TupleFlobInfo::print -- confirm.
std::ostream& operator<<(std::ostream& os, const TupleFlobInfo& f);
|
||
|
|
|
||
|
|
class FetchFlobLocalInfo : public ProgressLocalInfo
|
||
|
|
{
|
||
|
|
public:
|
||
|
|
FetchFlobLocalInfo(const Supplier s,
|
||
|
|
NList resultTypeList, NList _fal, NList _dal);
|
||
|
|
|
||
|
|
~FetchFlobLocalInfo(){
|
||
|
|
if (resultType)
|
||
|
|
resultType->DeleteIfAllowed();
|
||
|
|
|
||
|
|
pthread_mutex_destroy(&FFLI_mutex);
|
||
|
|
|
||
|
|
if(ci){
|
||
|
|
delete ci;
|
||
|
|
for(std::map<std::pair<int, int>, FlobSheet*>::iterator
|
||
|
|
ci = prepared->begin();
|
||
|
|
ci != prepared->end(); ci++){
|
||
|
|
FlobSheet* fs = ci->second;
|
||
|
|
delete fs;
|
||
|
|
}
|
||
|
|
delete prepared;
|
||
|
|
delete fetchedFiles;
|
||
|
|
delete []sheetCounter;
|
||
|
|
if (tbList)
|
||
|
|
{
|
||
|
|
if (!tbList->empty()){
|
||
|
|
for (std::map<size_t, tupleListT>::iterator ti = tbList->begin();
|
||
|
|
ti != tbList->end(); ti++){
|
||
|
|
delete ti->second.first;
|
||
|
|
ti->second.second->clear();
|
||
|
|
delete ti->second.second;
|
||
|
|
}
|
||
|
|
tbList->clear();
|
||
|
|
}
|
||
|
|
delete tbList;
|
||
|
|
}
|
||
|
|
|
||
|
|
if (totalBufferedTuples){
|
||
|
|
delete totalBufferedTuples;
|
||
|
|
totalBufferedTuples = 0;
|
||
|
|
totalBufferedTupleInfo->clear();
|
||
|
|
delete totalBufferedTupleInfo;
|
||
|
|
totalBufferedTupleInfo = 0;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
Tuple* getNextTuple(const Supplier s);
|
||
|
|
|
||
|
|
|
||
|
|
|
||
|
|
void fetching2prepared(FlobSheet* fs);
|
||
|
|
|
||
|
|
void clearFetchedFiles();
|
||
|
|
|
||
|
|
inline int getLocalDS(){
|
||
|
|
assert(ci);
|
||
|
|
return ci->getLocalNode();
|
||
|
|
}
|
||
|
|
|
||
|
|
inline std::string getPSFSPath(int dest, bool appendIP = true){
|
||
|
|
return ci->getRemotePath(dest, true, false, appendIP);
|
||
|
|
}
|
||
|
|
|
||
|
|
inline std::string getIP(int dest) {
|
||
|
|
return ci->getIP(dest);
|
||
|
|
}
|
||
|
|
|
||
|
|
inline int getPort(int dest) {
|
||
|
|
return ci->getPort(dest);
|
||
|
|
}
|
||
|
|
|
||
|
|
inline std::string getMSecPath(int dest, bool appendIP = true){
|
||
|
|
return ci->getMSECPath(dest, true, false, appendIP);
|
||
|
|
}
|
||
|
|
|
||
|
|
private:
|
||
|
|
typedef std::pair<TupleQueueHP*, std::vector<TupleFlobInfo>*> tupleListT;
|
||
|
|
|
||
|
|
Tuple* setResultTuple(Tuple* tuple);
|
||
|
|
/*
|
||
|
|
Remove useless Flob attribute
|
||
|
|
|
||
|
|
*/
|
||
|
|
Tuple* readLocalFlob(Tuple* tuple);
|
||
|
|
/*
|
||
|
|
Read the Flob data from all flob files that are collected from remote DSs.
|
||
|
|
|
||
|
|
*/
|
||
|
|
|
||
|
|
/*
|
||
|
|
Create a set of small tuple buffers based on all possible DataServer combinations,
|
||
|
|
instead of using a large global tuple buffer, in order to erase useless tuples
|
||
|
|
and their Flob data as soon as possible.
|
||
|
|
|
||
|
|
*/
|
||
|
|
std::map<size_t, tupleListT>* tbList;
|
||
|
|
TupleQueueHP* tbfIt; //current tuple queue
|
||
|
|
std::vector<TupleFlobInfo>::iterator tifIt; //tuple info iterator
|
||
|
|
size_t curKey;
|
||
|
|
Tuple* getTupleFromBuffer1();
|
||
|
|
void orderOneTuple1(Tuple* tuple, int sds[]);
|
||
|
|
|
||
|
|
std::map<std::pair<int, int>, FlobSheet*> ruSheets; //recently used sheets
|
||
|
|
|
||
|
|
//Use one large TupleQueueHP to increase the possibility of
|
||
|
|
// reusing cached Flob
|
||
|
|
TupleQueueHP* totalBufferedTuples;
|
||
|
|
std::list<TupleFlobInfo>* totalBufferedTupleInfo;
|
||
|
|
Tuple* getTupleFromBuffer();
|
||
|
|
void orderOneTuple(Tuple* tuple);
|
||
|
|
|
||
|
|
|
||
|
|
size_t getKey(int sdsVec[]);
|
||
|
|
size_t getMaxKey();
|
||
|
|
void decodeKey(size_t key, int sdsVec[]);
|
||
|
|
size_t maxBufferSize, maxSheetSize;
|
||
|
|
size_t perBufferSize;
|
||
|
|
|
||
|
|
bool isReadAll(Tuple* tuple);
|
||
|
|
bool isPreparedAll(Tuple* tuple, TupleFlobInfo* tif);
|
||
|
|
bool isPreparedAll(Tuple* tuple, TupleFlobInfo* tif,
|
||
|
|
std::vector<std::vector<FlobSheet*> >& rs);
|
||
|
|
|
||
|
|
|
||
|
|
|
||
|
|
|
||
|
|
int *faVec, *daVec; //record all needed and deleted Flob attribute list
|
||
|
|
size_t faLen, daLen;
|
||
|
|
int maxFlobNum;
|
||
|
|
TupleType* resultType;
|
||
|
|
std::string LFPath; //Local Flob File Path, prepared for mode 2
|
||
|
|
|
||
|
|
|
||
|
|
clusterInfo* ci;
|
||
|
|
int cds; //current DS
|
||
|
|
bool moreInput; //indicate whether there are more input tuples
|
||
|
|
|
||
|
|
/*
|
||
|
|
Three sheet lists are used to indicate the their status:
|
||
|
|
|
||
|
|
* standby: unsent sheets, collecting FlobOrder for each DS.
|
||
|
|
|
||
|
|
* fetching: copying the requested flob file within a thread.
|
||
|
|
|
||
|
|
* prepared: the flob file is prepared. Considering one sheet may be split into
|
||
|
|
several pieces, each for a flob file within a size threshold. There is no length
|
||
|
|
limitation on this list.
|
||
|
|
|
||
|
|
|
||
|
|
Instead of using the fetching vector, I simply use a fetchingNum to find how many
|
||
|
|
sheets are being fetched.
|
||
|
|
I also use preparedNumber to indicate how many files are prepared but unfetched.
|
||
|
|
During the fetching procedure, the sheet is kept inside the thread object.
|
||
|
|
Since using the vector makes the same sheet to be kept twice:
|
||
|
|
in the thread and the vector.
|
||
|
|
|
||
|
|
The prepaerd is a map structure, with the (sourceID, times) as the key.
|
||
|
|
In order to quickly find whether the asked flob file is prepared.
|
||
|
|
|
||
|
|
*/
|
||
|
|
std::vector<FlobSheet*>* standby;
|
||
|
|
size_t fetchingNum;
|
||
|
|
std::map<std::pair<int, int>, FlobSheet*>* prepared;
|
||
|
|
|
||
|
|
size_t preparedNum;
|
||
|
|
std::vector<std::string>* fetchedFiles;
|
||
|
|
//all fetched files are deleted at last.
|
||
|
|
int *sheetCounter; //the number of sent sheet for each DS
|
||
|
|
|
||
|
|
static const size_t PipeWidth = 10;
|
||
|
|
bool tokenPass[PipeWidth];
|
||
|
|
pthread_t threadID[PipeWidth];
|
||
|
|
static pthread_mutex_t FFLI_mutex;
|
||
|
|
static void* sendSheetThread1(void* ptr);
|
||
|
|
static void* sendSheetThread2(void* ptr);
|
||
|
|
static void* sendSheetThread(void* ptr);
|
||
|
|
bool sendSheet(FlobSheet* fs);
|
||
|
|
};
|
||
|
|
|
||
|
|
class FFLI_Thread
|
||
|
|
{
|
||
|
|
public:
|
||
|
|
FFLI_Thread(FetchFlobLocalInfo* _ffli, FlobSheet* _fs, int _tk):
|
||
|
|
ffli(_ffli), sheet(_fs), token(_tk){
|
||
|
|
times = sheet->getTimes();
|
||
|
|
}
|
||
|
|
|
||
|
|
FetchFlobLocalInfo* ffli;
|
||
|
|
FlobSheet* sheet;
|
||
|
|
int times;
|
||
|
|
int token;
|
||
|
|
};
|
||
|
|
|
||
|
|
/*
Lexicographic "less than" for ~std::pair<T1,T2>~: pairs are ordered by
~first~, ties are broken by ~second~. The functor is a strict weak
ordering and can therefore be used as comparator of ordered containers
and sort algorithms.

Only ~operator<~ is required for ~T1~ and ~T2~.

*/
template<typename T1, typename T2>
struct classCompPair
{
  bool operator() (const std::pair<T1,T2>& pair_1,
                   const std::pair<T1,T2>& pair_2) const
  {
    if (pair_1.first < pair_2.first){
      return true;
    }
    if (pair_2.first < pair_1.first){
      return false;
    }
    // The first components are equivalent, the second ones decide.
    return pair_1.second < pair_2.second;
  }
};
|
||
|
|
|
||
|
|
|
||
|
|
|
||
|
|
#endif /* HADOOPPARALLELALGEBRA_H_ */
|