/* ---- This file is part of SECONDO. Copyright (C) 2012, University in Hagen Faculty of Mathematic and Computer Science, Database Systems for New Applications. SECONDO is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. SECONDO is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with SECONDO; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA ---- */ #include "SecondoSMI.h" #include #include #include "NestedList.h" #include "ListUtils.h" #include "StringUtils.h" #include "VTrie2.h" #include "MMTrie.h" #include "LRU.h" #include "Stream.h" #include "Algebras/TupleIdentifier/TupleIdentifier.h" #include "Algebras/FText/FTextAlgebra.h" #include "Algebras/Relation-C++/RelationAlgebra.h" /* 1 Foreword In this file several classes realizing an inverted file index are defined. */ typedef SmiRecordId TrieContentType; // content of trie entries typedef vtrie::VTrieIterator TrieIteratorType; typedef vtrie::VTrieNode TrieNodeType; typedef vtrie::VTrieNodeCache TrieNodeCacheType; typedef vtrie::VTrie TrieType; //typedef trie::TrieIterator TrieIteratorType; //typedef trie::TrieNode TrieNodeType; //typedef trie::TrieNodeCache TrieNodeCacheType; //typedef trie::Trie TrieType; namespace appendcache{ /* 2 AppendCache When building an inverted file index a lot of append operations to existing records within the file containing the inverted lists are performed. To accelerate these appends, the class appendCache can be used. */ /* 2.1 Class CacheEntry This class represents the non persistent part of a record. 
*/
class CacheEntry{

  public:

/*
2.1.1 Constructors

Creates a cache entry for the record ~\_id~. ~currentRecordSize~ is
stored as the offset at which the next flush writes into the record.
~\_slotSize~ is the number of bytes of the in-memory buffer. The buffer
content is written to disk when an append would overflow it or when
~bringToDisk~ is called.

*/
    CacheEntry(const SmiRecordId _id,
               const size_t currentRecordSize,
               const size_t _slotSize):
        id(_id), offset(currentRecordSize),
        length(0), slotSize(_slotSize) {
      buffer = new char[slotSize];
    }

/*
The copy constructor allocates an own buffer of the same size and copies
the complete slot of the source entry into it.

*/
    CacheEntry(const CacheEntry& e) :
        id(e.id), offset(e.offset), length(e.length),
        slotSize(e.slotSize){
      buffer = new char[slotSize];
      memcpy(buffer, e.buffer, slotSize);
    }

/*
Copy assignment. The class owns ~buffer~; the compiler generated
assignment would copy the pointer only, so that two entries would call
~delete[]~ on the same array. The new buffer is allocated before the old
one is released, hence a failing allocation leaves this entry unchanged.
Data still buffered in this entry is discarded, not flushed.

*/
    CacheEntry& operator=(const CacheEntry& e){
      if(this != &e){
        char* fresh = new char[e.slotSize];
        memcpy(fresh, e.buffer, e.slotSize);
        delete[] buffer;
        buffer   = fresh;
        id       = e.id;
        offset   = e.offset;
        length   = e.length;
        slotSize = e.slotSize;
      }
      return *this;
    }

/*
2.1.2 Destructor

The entry must have been flushed (~bringToDisk~) before it is destroyed;
this is checked by an assertion.

*/
    ~CacheEntry(){
      assert(length==0);
      delete[] buffer;
    }

/*
2.1.3 bringToDisk

Writes the used part of the buffer to disk and empties the buffer for
further append calls. If nothing is buffered, the record is not touched.

*/
    void bringToDisk(SmiRecordFile* file){
      if(length == 0){
        // nothing to write, avoid selecting the record at all
        return;
      }
      appendToDisk(file, buffer, length);
      length = 0;
    }

/*
~appendToDisk~

Writes ~length~ bytes taken from ~buffer~ at the current end position
(~offset~) of the record and advances ~offset~ accordingly. Note that
both parameters hide the members of the same name. The result of
~SelectRecord~ is not evaluated.

*/
    void appendToDisk(SmiRecordFile* file,
                      const char* buffer,
                      const size_t length){
      SmiRecord record;
      file->SelectRecord(id, record, SmiFile::Update);
      record.Write(buffer, length, offset);
      offset += length;
      record.Finish();
    }

/*
2.1.4 append

Appends data to this entry. In case of an overflow, the buffer is
written to disk and emptied for further append calls.

*/ void append(SmiRecordFile* file, const char* buffer, const size_t length){ if(this->length + length > slotSize){ // new buffer does not fit into internal buffer bringToDisk(file); } if(length > slotSize){ // new buffer larger than slotsize appendToDisk(file,buffer, length); } else { memcpy(this->buffer + this->length, buffer, length); this->length += length; } } private: SmiRecordId id; // id for persistent storage size_t offset; // offset in persistent part size_t length; // use bufferlength in memory size_t slotSize; // buffer size in memory char* buffer; // data buffer }; /* 2.2 RecordAppendCache This class realizes an cache for Records only suporting append operation to these records. The cache uses the LRU strategy for writing record contents to disk. */ class RecordAppendCache{ public: /* 2.2.1 Constructor Creates a new cache for the underlying file, with a maximum memory consumptions of ~maxMem~ and a buffer size of ~slotSize~ for each record. */ RecordAppendCache(SmiRecordFile* _file, const size_t _maxMem, const size_t _slotSize): file(_file), lru(_maxMem / (_slotSize + sizeof(appendcache::CacheEntry))), slotSize(_slotSize){ } /* 2.2.2 Destructor Empties the cache writing all in memory data to disk. */ ~RecordAppendCache(){ clear(); } /* 2.2.3 append Appends data to the specified record. */ void append(SmiRecordId id, const char* buffer, const size_t length){ appendcache::CacheEntry** entry = lru.get(id); appendcache::CacheEntry* ce=0; if(entry==0){ // not cached SmiRecord record; file->SelectRecord(id, record); ce = new appendcache::CacheEntry(id, record.Size(), slotSize); record.Finish(); LRUEntry* e2 = lru.use(id, ce); if(e2!=0){ e2->value->bringToDisk(file); delete e2->value; delete e2; } entry = lru.get(id); assert(entry!=0); } ce = *entry; ce->append(file, buffer, length); } /* 2.2.4 clear Removes all entries from the cache writing the buffers to disk. 
*/
    void clear(){
      // NOTE(review): "LRUEntry" here and "LRU" below appear without
      // template arguments although lru.get() yields a CacheEntry** and
      // victim->value is used as a CacheEntry*. Presumably the argument
      // lists were lost when this text was extracted - confirm against
      // LRU.h before compiling.
      LRUEntry* victim;
      // take entries out of the LRU structure until deleteLast() returns 0
      while( (victim = lru.deleteLast())!=0){
        victim->value->bringToDisk(file);
        delete victim->value;
        delete victim;
      }
    }

  private:
    SmiRecordFile* file;   // file holding the cached records
    LRU lru;               // cached entries, see NOTE(review) above
    size_t slotSize;       // buffer size used for each new entry
};

} // end of namespace appendcache


/*
4 Class InvertedFile

The class is derived from ~TrieType~. In addition to the trie it keeps a
record file (~listFile~) and the parameters ignoreCase, minWordLength,
stop words and separators.

NOTE(review): the line below reads "template class"; the members use
~wordPosType~ and ~charPosType~ as types, so the template parameter list
was presumably lost during extraction - confirm.

*/
template class InvertedFileT: public TrieType {

  public:

/*
~Standard Constructor~

Creates a new list file. Case is not ignored, the minimum word length
is 1, no stop words are set and the default separators are used.

*/
    InvertedFileT(): TrieType(), listFile(false,0,false),ignoreCase(false),
                     minWordLength(1),stopWordsId(0), memStopWords(0) {
      listFile.Create();
      separators = getDefaultSeparators();
    }

/*
~Copy Constructor~

Copies the members of ~src~. The in-memory stop word set is not copied
but rebuilt by ~readStopWordsFromDisk~.

*/
    InvertedFileT(const InvertedFileT& src): TrieType(src),
        listFile(src.listFile),
        ignoreCase(src.ignoreCase),
        minWordLength(src.minWordLength),
        stopWordsId(src.stopWordsId),
        memStopWords(0),
        separators(src.separators) {
      readStopWordsFromDisk();
    }

/*
~Constructor~

Opens an existing index from the given file and record ids and reads
the stop words from the record ~\_stopWordsId~.

*/
    InvertedFileT(SmiFileId& _trieFileId, SmiRecordId& _trieRootId,
                  SmiFileId& _listFileId, const bool _ignoreCase,
                  uint32_t _minWordLength, SmiRecordId& _stopWordsId,
                  const std::string& _separators):
        TrieType(_trieFileId, _trieRootId),
        listFile(false),
        ignoreCase(_ignoreCase),
        minWordLength(_minWordLength),
        stopWordsId(_stopWordsId),
        memStopWords(0),
        separators(_separators) {
      listFile.Open(_listFileId);
      readStopWordsFromDisk();
    }

/*
~Destructor~

Closes the list file if it is open and releases the in-memory stop
word set.

*/
    ~InvertedFileT(){
      if(listFile.IsOpen()){
        listFile.Close();
      }
      if(memStopWords){
        delete memStopWords;
      }
    }

/*
~deleteFiles~

Destroys all underlying files: the file of the trie base class and the
list file, which is closed first if necessary.

*/
    void deleteFiles(){
      TrieType::deleteFile();
      if(listFile.IsOpen()){
        listFile.Close();
      }
      listFile.Drop();
    }

/*
~clone~

Creates a deep copy of this object.

*/
    InvertedFileT* clone(){
      InvertedFileT* res = new InvertedFileT();
      // iterate over all words stored in this index
      TrieIteratorType* it = getEntries("");
      TrieNodeCacheType* cache = createTrieCache(1048576);
      std::string word;
      SmiRecordId id;
      TrieNodeType resnode;
      SmiRecordId resTrieId;
      SmiRecordId resListId;
      res->separators = this->separators;
      size_t bufferSize = 512*1024;
      char* buffer = new char[bufferSize];
      while(it->next(word,id)){
        // create the trie node for word in the copy and give it a new,
        // empty list record
        res->getInsertNode( word, resnode, resTrieId);
        SmiRecord resRecord;
        res->listFile.AppendRecord(resListId, resRecord);
        resnode.setContent(resListId);
        resnode.writeToFile(&(res->file), resTrieId);
        // copy record content
        SmiRecord srcRecord;
        listFile.SelectRecord(id,srcRecord);
        size_t offset = 0;
        size_t size = srcRecord.Size();
        resRecord.Resize(size);
        // NOTE(review): the text is damaged at this point. Everything
        // between "while(offset" and "=minWordLength){" is missing: the
        // rest of the copy loop, the end of clone() (including whatever
        // releases it, cache and buffer) and the head of the following
        // method(s). The code below uses text, tid, cache and triecache,
        // none of which is declared in the visible text; presumably it
        // is the body of insertText, which InvFileInsertLI calls -
        // restore from the original file before using this block.
        while(offset=minWordLength){
          std::string text2 = text;
          if(ignoreCase){
            stringutils::toLower(text2);
          }
          // the unsplit text is inserted with word and char position 0
          if(memStopWords==0 || !memStopWords->contains(text2)){
            insert(text2,tid,0,0,cache,triecache);
          }
        }
        return;
      }
      // split text at the separators; wc counts the inserted tokens,
      // pos is the tokenizer position read before each token
      stringutils::StringTokenizer st(text,separators);
      wordPosType wc = 0;
      charPosType pos = 0;
      while(st.hasNextToken()){
        pos = st.getPos();
        std::string token = st.nextToken();
        if(token.length()>=minWordLength){
          if(ignoreCase){
            stringutils::toLower(token);
          }
          // tokens found in the stop word set are skipped and do not
          // increase wc
          if(memStopWords==0 || !memStopWords->contains(token)){
            insert(token, tid, wc, pos, cache, triecache);
            wc++;
          }
        }
      }
    }

/*
~insert String~

Inserts the single word ~word~ for tuple ~tid~ with word position ~wp~
and character position ~cp~. Words shorter than ~minWordLength~ and
stop words are ignored. If ~ignoreCase~ is set, the word is converted to
lower case before the stop word test and the insertion. ~cache~ and
~triecache~ are optional and passed on to ~insert~.

*/
    void insertString(TupleId tid, const std::string& word,
                      const wordPosType wp, const charPosType cp,
                      appendcache::RecordAppendCache* cache=0,
                      TrieNodeCacheType* triecache = 0){
      if(word.size() < minWordLength){ // word too short
        return;
      }
      // work on a copy, the argument must not be changed
      std::string copy = word;
      if(ignoreCase){
        stringutils::toLower(copy);
      }
      if(memStopWords==0 || !memStopWords->contains(copy)){
        insert(copy,tid, wp, cp, cache, triecache);
      }
    }

/*
~getListFileId~

Returns the fileId of the file containing the inverted lists.

*/ SmiFileId getListFileId() { return listFile.GetFileId(); } /* 5.3.2 Class exactIterator The class can be used for iterating over a single inverted list. */ class exactIterator { friend class InvertedFileT; public: /* Function ~next~ This function returns the next position by setting the arguments. If no more entries are available, the result of this function is __false__. */ bool next(TrieContentType& id, wordPosType& wc, charPosType& cc){ if(!record){ // no record available return false; } if(done){ // finished return false; } if(slotPos >= slotsInMem){ // buffer exhausted done = readPartition(); } if(done){ // no further slots return false; } size_t offset = slotSize*slotPos; memcpy(&id,buffer+offset, sizeof(TrieContentType)); offset += sizeof(TrieContentType); memcpy(&wc,buffer+offset, sizeof(wordPosType)); offset += sizeof(wordPosType); memcpy(&cc,buffer+offset, sizeof(charPosType)); slotPos++; count++; return true; } /* ~Destructor~ */ ~exactIterator(){ if(record){ record->Finish(); delete record; } if(buffer){ delete[] buffer; } } private: size_t part; // partition within the record size_t slotPos; // position within the slot SmiRecord* record; // record containing the values size_t slotsInMem; // currently available slots in memory size_t maxSlotsInMem; // maximum number of slots in memory size_t slotsInRecord; // slots available in record char* buffer; // memory buffer bool done; // true if record is exhausted size_t slotSize; // size of a single slot size_t count; // number of returned results /* ~Constructor~ This constructor will create an iterator returning no entry. */ exactIterator(){ done = true; buffer = 0; record = 0; } /* ~Constructor~ This constructor will create an iterator iterating over the entries of the specified record. 
*/
        exactIterator(SmiRecordFile* f, SmiRecordId id, const size_t _mem):
            part(0), slotPos(0), record(0){
          count = 0;
          // one slot holds a trie content value, a word position and a
          // char position
          slotSize = sizeof(TrieContentType) + sizeof(wordPosType) +
                     sizeof(charPosType);
          // _mem is the buffer size in bytes; at least one slot is buffered
          maxSlotsInMem = _mem / slotSize;
          if(maxSlotsInMem<1){
            maxSlotsInMem = 1;
          }
          if(id!=0){
            record=new SmiRecord();
            f->SelectRecord(id, *record);
          } else {
            // no record: done, slotsInMem and slotsInRecord stay unset in
            // this path; next() returns false as soon as it sees that
            // record is 0
            buffer = 0;
            return;
          }
          slotsInRecord = record->Size() / slotSize;
          size_t buffersize = maxSlotsInMem * slotSize;
          buffer = new char[buffersize];
          slotsInMem = 0;
          // load the first part; done is true if the record has no slot
          done = readPartition();
        }

/*
~readPartition~

Reads the next part of the record into the memory buffer. The result is
__true__ if the record has no unread slots left (nothing is read then),
and __false__ if slots were read. After a successful read ~slotsInMem~
holds the number of slots read and ~slotPos~ is reset to 0.

*/
        // transfer data from record to memory
        bool readPartition(){
          // slots consumed by the partitions read so far
          int64_t processed = part * maxSlotsInMem;
          int64_t availableSlots = (int64_t)slotsInRecord - processed;
          if(availableSlots <= 0){
            return true;
          }
          size_t readSlots = std::min((size_t) availableSlots,
                                      maxSlotsInMem);
          record->Read(buffer, readSlots*slotSize ,
                       part*maxSlotsInMem*slotSize);
          part++;
          slotsInMem = readSlots;
          slotPos = 0; // nothing read from current mem
          return false;
        }
    }; // end of class ExactIterator

/*
~getExactIterator~

This function returns an iterator for a specified word. If ~ignoreCase~
is set, the word is converted to lower case first.

*/
    exactIterator* getExactIterator(std::string str, const size_t mem){
      if(ignoreCase){
        stringutils::toLower(str);
      }
      // find the node for str
      SmiRecordId id = rootId;
      size_t pos = 0;
      // NOTE(review): the text is damaged at this point. The rest of
      // this loop, the end of getExactIterator and everything up to the
      // "public:" label of the next class are missing. The members that
      // follow (a destructor named ~prefixIterator) indicate that the
      // label belongs to class prefixIterator - restore from the
      // original file before using this block.
      while((id!=0) && (pos ; public:

/*
~next~

Returns the next entry of this iterator if present. If not, the
arguments are kept unchanged and the return value is false.

*/ bool next(std::string& word, TrieContentType& tid, wordPosType& wc, charPosType& cc){ while(true){ if(exactIt==0){ // create a new exactIterator if(!it->next(str,id)){ return false; } exactIt = inv->getExactIterator(id, 4096); } if(exactIt){ if(!exactIt->next(tid,wc,cc)){ delete exactIt; exactIt=0; } else { word = str; return true; } } } } ~prefixIterator(){ if(it){ delete it; } if(exactIt){ delete exactIt; } } private: InvertedFileT* inv; TrieIteratorType* it; exactIterator* exactIt; SmiRecordId id; std::string str; /* ~constructor~ */ prefixIterator(InvertedFileT* _inv, const std::string& prefix): inv(_inv){ it = inv->getEntries(prefix); exactIt = 0; id = 0; } }; // end of class PrefixIterator /* ~getPrefixIterator~ Returns a prefixIterator for str. The caller of this functions is responsible to destroy the iterator after using. */ prefixIterator* getPrefixIterator(std::string str){ if(ignoreCase){ stringutils::toLower(str); } return new prefixIterator(this, str); } /* Class countPrefixIterator This iterator returns all words starting with a certain prefix stored in this structure together with the count of this word. */ class countPrefixIterator{ friend class InvertedFileT; public: bool next(std::string& word, size_t& count){ TrieContentType id; if(!it->next(word, id)){ return false; } count = inv->wordCount(id); return true; } ~countPrefixIterator(){ delete it; } private: InvertedFileT* inv; TrieIteratorType* it; countPrefixIterator(InvertedFileT* _inv, const std::string& prefix): inv(_inv) { it = inv->getEntries(prefix); } }; /* ~getCountPrefixIterator~ Returns a countPrefixIterator of this for a specified prefix. */ countPrefixIterator* getCountPrefixIterator( std::string str){ if(ignoreCase){ stringutils::toLower(str); } return new countPrefixIterator(this, str); } /* ~getFileInfo~ Returns data about the underlying files. 
*/
    void getFileInfo( SmiStatResultType& result){
      // statistics of the trie file come first
      TrieType::getFileInfo(result);
      SmiStatResultType listresult =
                        listFile.GetFileStatistics(SMI_STATS_LAZY);
      // NOTE(review): std::pair is used without template arguments in the
      // next two statements; presumably they were lost during extraction.
      listresult.push_back( std::pair( "FilePurpose",
                                       "Inverted List File"));
      result.push_back(std::pair("---","---"));
      // NOTE(review): the text is damaged at this point. The loop header
      // after "i", the rest of getFileInfo and the text up to the end of
      // a string literal are missing. The "return"-less tail below ends a
      // function; setParams calls getDefaultSeparators(), so this is
      // presumably the end of that function - restore from the original
      // file before using this block.
      for(unsigned int i=0;i\"$§&/[]{}=´`@€~'#|"; }

/*
~isEmpty~

The index is empty as long as the trie has no root.

*/
    bool isEmpty() const{
      return rootId == 0;
    }

/*
~getSeparators~

Returns a copy of the separator characters.

*/
    const std::string getSeparators() const{
      return separators;
    }

/*
~setParams~

Sets the parameters using the default separators.

*/
    void setParams(const bool ignoreCase,
                   const uint32_t minWordLength,
                   const std::string& stopWords){
      setParams(ignoreCase, minWordLength,stopWords,
                getDefaultSeparators());
    }

/*
~setParams~

Sets the parameters of this index and stores the stop words on disk.
The index must be empty (checked by an assertion). The stop word string
is always split at the default separators, not at the ~separators~
argument. Stop words shorter than ~minWordLength~ are dropped; if
~ignoreCase~ is set they are stored in lower case.

*/
    void setParams(const bool ignoreCase,
                   const uint32_t minWordLength,
                   const std::string& stopWords,
                   const std::string& separators){
      assert(rootId==0); // allow to change parameter only for an empty index
      // the parameters hide the members of the same name
      this->ignoreCase = ignoreCase;
      this->minWordLength = minWordLength;
      this->separators = separators;
      // create the set of stopWords
      if(memStopWords){
        memStopWords->clear();
      } else {
        memStopWords = new mmtrie::Trie();
      }
      stringutils::StringTokenizer st(stopWords, getDefaultSeparators());
      while(st.hasNextToken()){
        std::string token = st.nextToken();
        if(ignoreCase){
          stringutils::toLower(token);
        }
        if(token.length()>=minWordLength){
          memStopWords->insert(token);
        }
      }
      writeStopWordsToDisk();
    }

    bool getIgnoreCase() const{
      return ignoreCase;
    }

    uint32_t getMinWordLength() const{
      return minWordLength;
    }

    SmiRecordId getStopWordsId() const{
      return stopWordsId;
    }

/*
~getNoEntries~

Converts the value of entry 28 of the file information to a number and
returns it decreased by one.

NOTE(review): the fixed index 28 depends on the exact number and order
of the entries produced by ~getFileInfo~; which statistic is stored
there cannot be seen from this file - confirm.

*/
    int getNoEntries() {
      SmiStatResultType fileinfo;
      getFileInfo(fileinfo);
      return std::stoi(fileinfo[28].second) - 1;
    }

  private:
    SmiRecordFile listFile;     // file holding the inverted lists and
                                // the stop word record
    bool ignoreCase;            // convert words to lower case
    uint32_t minWordLength;     // shorter words are not indexed
    SmiRecordId stopWordsId;    // record with the stop words, 0 if none
    mmtrie::Trie* memStopWords; // in-memory stop word set, may be 0
    std::string separators;     // characters splitting a text into words

/*
~insert~

inserts a new element into this inverted file

*/
    void insert(const std::string& word, const TupleId tid,
                const wordPosType wordPos, const charPosType pos,
                appendcache::RecordAppendCache* cache,
                TrieNodeCacheType* triecache){
      SmiRecordId listId;
      SmiRecord record; // 
record containing the list SmiRecordId recordId; // id of the record TrieNodeType insertNode; SmiRecordId insertId; bool isNew; if(triecache){ isNew = TrieType::getInsertNode(word, insertNode, insertId, triecache); } else { isNew = TrieType::getInsertNode(word, insertNode, insertId); } if(insertNode.getContent()==0){ listFile.AppendRecord(listId, record); if(!triecache){ insertNode.setContent(listId); insertNode.writeToFile(&file, insertId); } else { triecache->getNode(insertId)->setContent(listId); } recordId = listId; } else { assert(!isNew); if(cache==0){ listFile.SelectRecord(insertNode.getContent(), record, SmiFile::Update); } recordId = insertNode.getContent(); } size_t buffersize = sizeof(TupleId) + sizeof(wordPosType) + sizeof(charPosType); char buffer[buffersize]; size_t offset=0; memcpy(buffer,&tid, sizeof(TupleId)); offset += sizeof(TupleId); memcpy(buffer + offset, &wordPos, sizeof(wordPosType)); offset += sizeof(wordPosType); memcpy(buffer + offset, &pos, sizeof(charPosType)); if(cache==0){ size_t recordOffset = record.Size(); record.Write(buffer, buffersize, recordOffset); } else { record.Finish(); cache->append(recordId, buffer, buffersize); } } void writeStopWordsToDisk(){ if(!memStopWords){ return; } std::stringstream allss; memStopWords->print(allss); std::string all = allss.str(); const char* buffer = all.c_str(); SmiRecord record; if(stopWordsId==0){ listFile.AppendRecord(stopWordsId, record); } else { listFile.SelectRecord(stopWordsId,record); } record.Resize(all.length()); record.Write(buffer, all.length(), 0); } void readStopWordsFromDisk(){ SmiSize length; if(stopWordsId==0){ if(memStopWords){ delete memStopWords; } memStopWords=0; return; } char* buffer = listFile.GetData(stopWordsId,length, true); std::string str(buffer,length); if(memStopWords==0){ memStopWords = new mmtrie::Trie(); } else { memStopWords->clear(); } stringutils::StringTokenizer st(str," "); while(st.hasNextToken()){ memStopWords->insert(st.nextToken()); } free(buffer); } 
}; typedef InvertedFileT InvertedFile; class InvFileInsertLI { public: InvFileInsertLI(Stream &s, InvertedFile *i, int i1, int i2, size_t mem) : stream(s), inv(i), textIndex(i1), tidIndex(i2) { size_t trieCacheSize = mem / 20; if (trieCacheSize < 4096) { trieCacheSize = 4096; } size_t invFileCacheSize; if (trieCacheSize + 4096 > mem) { invFileCacheSize = 4096; } else { invFileCacheSize = mem - trieCacheSize; } cache = inv->createAppendCache(invFileCacheSize); trieCache = inv->createTrieCache(trieCacheSize); stream.open(); } ~InvFileInsertLI() { if (cache) { delete cache; } if (trieCache) { delete trieCache; } stream.close(); } Tuple* nextTuple() { Tuple *tuple = stream.request(); if (tuple) { FText* text = (FText*)tuple->GetAttribute(textIndex); TupleIdentifier* tid = (TupleIdentifier*)tuple->GetAttribute(tidIndex); if (text->IsDefined() && tid->IsDefined()) { inv->insertText(tid->GetTid(), text->GetValue(), cache, trieCache); } } return tuple; } private: Stream stream; InvertedFile *inv; appendcache::RecordAppendCache* cache; TrieNodeCacheType* trieCache; int textIndex; int tidIndex; }; namespace triealg{ Word CreateInvfile(const ListExpr typeInfo); void DeleteInvfile( const ListExpr typeInfo, Word& w ); template bool SaveInvfile(SmiRecord& valueRecord, size_t& offset, const ListExpr typeInfo, Word& value) { InvertedFileT* t = static_cast*>(value.addr); SmiFileId triefileId = t->getFileId(); valueRecord.Write(&triefileId, sizeof(SmiFileId), offset); offset += sizeof(SmiFileId); SmiRecordId rootId = t->getRootId(); valueRecord.Write(&rootId, sizeof(SmiRecordId), offset); offset += sizeof(SmiRecordId); SmiFileId listFileId = t->getListFileId(); valueRecord.Write(&listFileId, sizeof(SmiFileId), offset); offset += sizeof(SmiFileId); bool ignoreCase = t->getIgnoreCase(); valueRecord.Write(&ignoreCase, sizeof(bool),offset); offset += sizeof(bool); uint32_t minWordLength = t->getMinWordLength(); valueRecord.Write(&minWordLength, sizeof(uint32_t), offset); offset += 
sizeof(uint32_t); SmiRecordId stopWordsId = t->getStopWordsId(); valueRecord.Write(&stopWordsId, sizeof(SmiRecordId), offset); offset += sizeof(SmiRecordId); std::string separators = t->getSeparators(); size_t separatorsLength = separators.length(); valueRecord.Write(&separatorsLength, sizeof(size_t), offset); offset+= sizeof(size_t); const char* seps = separators.c_str(); valueRecord.Write(seps,separatorsLength, offset); offset += separatorsLength; return true; } template bool OpenInvfile(SmiRecord& valueRecord, size_t& offset, const ListExpr typeInfo, Word& value) { SmiFileId triefileid; valueRecord.Read(&triefileid, sizeof(SmiFileId), offset); offset += sizeof(SmiFileId); SmiRecordId trierid; valueRecord.Read(&trierid, sizeof(SmiRecordId), offset); offset += sizeof(SmiRecordId); SmiFileId listfileid; valueRecord.Read(&listfileid, sizeof(SmiFileId), offset); offset += sizeof(SmiFileId); bool ignoreCase; valueRecord.Read(&ignoreCase, sizeof(bool), offset); offset += sizeof(bool); uint32_t minWordLength; valueRecord.Read(&minWordLength, sizeof(uint32_t), offset); offset += sizeof(uint32_t); SmiRecordId stopWordsId; valueRecord.Read(&stopWordsId, sizeof(SmiRecordId), offset); offset += sizeof(SmiRecordId); size_t separatorsLength; valueRecord.Read(&separatorsLength, sizeof(size_t), offset); offset += sizeof(size_t); char sepBuffer[separatorsLength]; valueRecord.Read(sepBuffer, separatorsLength, offset); offset += separatorsLength; std::string separators(sepBuffer, separatorsLength); InvertedFileT* invFile = new InvertedFileT(triefileid, trierid, listfileid, ignoreCase, minWordLength, stopWordsId, separators); value.setAddr(invFile); return true; } }