Files
secondo/Algebras/Trie/TrieAlgebra.cpp

1363 lines
37 KiB
C++
Raw Normal View History

2026-01-23 17:03:45 +08:00
/*
----
This file is part of SECONDO.
Copyright (C) 2012, University in Hagen
Faculty of Mathematic and Computer Science,
Database Systems for New Applications.
SECONDO is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
SECONDO is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with SECONDO; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
----
*/
#include "InvertedFile.h"
#include "NestedList.h"
#include "ListUtils.h"
#include "QueryProcessor.h"
#include "StandardTypes.h"
#include "Algebras/Relation-C++/RelationAlgebra.h"
#include "Algebras/FText/FTextAlgebra.h"
#include "Stream.h"
#include "Progress.h"
extern NestedList* nl;
extern QueryProcessor* qp;
using namespace std;
namespace triealg{
/*
1.2 Type Constructor InvFile
*/
// Builds the property description of the invfile type constructor:
// a two element list holding the five column captions and the five
// matching values.
ListExpr InvfileProperty(){
  ListExpr captions = nl->FiveElemList(
      nl->StringAtom("Signature"),
      nl->StringAtom("Example Type List"),
      nl->StringAtom("List Rep"),
      nl->StringAtom("Example List"),
      nl->StringAtom("Remarks"));
  ListExpr values = nl->FiveElemList(
      nl->TextAtom(" -> SIMPLE"),
      nl->TextAtom("invfile"),
      nl->TextAtom("invfile"),
      nl->TextAtom("( (a 1))"),
      nl->TextAtom("test type constructor"));
  return nl->TwoElemList(captions, values);
}
// Kind check callback of the type constructor. The decision is delegated
// to InvertedFile::checkType; ErrorInfo is not touched.
bool CheckInvfile(ListExpr type, ListExpr& ErrorInfo){
  const bool accepted = InvertedFile::checkType(type);
  return accepted;
}
// Out function: the value itself is not exported, a fixed text atom is
// returned instead.
ListExpr OutInvfile(ListExpr typeInfo, Word value){
  ListExpr placeholder = nl->TextAtom("An invfile");
  return placeholder;
}
// In function: creating an invfile from a list is not supported.
// Always reports failure (correct = false) and returns a null address.
Word InInvfile(ListExpr typeInfo, ListExpr value,
               int errorPos, ListExpr& errorInfo, bool& correct){
  correct = false;
  Word none;
  none.addr = 0;
  return none;
}
// Create function: wraps a freshly allocated, default constructed
// InvertedFile into a Word.
Word CreateInvfile(const ListExpr typeInfo){
  InvertedFile* fresh = new InvertedFile();
  Word wrapped;
  wrapped.addr = fresh;
  return wrapped;
}
void DeleteInvfile( const ListExpr typeInfo, Word& w ){
InvertedFile* t = (InvertedFile*) w.addr;
t->deleteFiles();
delete t;
w.addr = 0;
}
void CloseInvfile( const ListExpr typeInfo, Word& w ){
InvertedFile* t = (InvertedFile*) w.addr;
delete t;
w.addr = 0;
}
// Clone function: forwards to InvertedFile::clone of the source object.
Word CloneInvfile(const ListExpr typeInfo, const Word& value){
  InvertedFile* original = static_cast<InvertedFile*>(value.addr);
  return original->clone();
}
void* CastInvfile( void* addr) {
return (InvertedFile*) addr;
}
// SizeOf function: size of the in-memory InvertedFile object in bytes.
int SizeOfInvfile(){
  const int bytes = sizeof(InvertedFile);
  return bytes;
}
// Type constructor instance for the invfile type. It is named after
// InvertedFile::BasicType() and wires up the callbacks defined above.
// Two callback slots are left empty (0). OpenInvfile and SaveInvfile are
// templates that are not defined in this file; they are instantiated with
// uint32_t for both template parameters.
TypeConstructor invfiletc( InvertedFile::BasicType(),
InvfileProperty,
OutInvfile,
InInvfile,
0,
0,
CreateInvfile,
DeleteInvfile,
OpenInvfile<uint32_t, uint32_t>,
SaveInvfile<uint32_t, uint32_t>,
CloseInvfile,
CloneInvfile,
CastInvfile,
SizeOfInvfile,
CheckInvfile );
/*
2.5 Operator createInvFile
2.5.1 Type Mapping
Signature is stream(tuple) x a1 x a2 [x bool x int x text [x string]] -> invfile
a1 must be of type text
a2 must be of type tid
*/
// Type mapping of createInvFile.
// Accepts 3, 6 or 7 arguments: a tuple stream, the names of a text and a
// tid attribute and optionally bool x int x text [x string].
// The result uses APPEND so that the value mapping always receives the
// four option values (defaults are appended where the user gave none)
// followed by the zero based indexes of the two attributes.
ListExpr createInvFileTM(ListExpr args){
  const string err = "stream(tuple) x a_i x a_j [ x bool x int x text"
                     " [x string] ]expected";
  const bool shortForm = nl->HasLength(args,3);
  const bool fullForm  = nl->HasLength(args,7);
  if( !(shortForm || nl->HasLength(args,6) || fullForm) ){
    return listutils::typeError(err + " (wrong number of arguments)");
  }

  ListExpr streamType = nl->First(args);
  if(!Stream<Tuple>::checkType(streamType)){
    return listutils::typeError(err + " (first arg is not a tuple stream)");
  }

  ListExpr textName = nl->Second(args);
  ListExpr tidName  = nl->Third(args);
  if( !(listutils::isSymbol(textName) && listutils::isSymbol(tidName)) ){
    return listutils::typeError(err +
                        " (one of the attribute names is not valid)");
  }

  // look up both attributes in the tuple description of the stream
  ListExpr attrList = nl->Second(nl->Second(streamType));
  string a1 = nl->SymbolValue(textName);
  string a2 = nl->SymbolValue(tidName);
  ListExpr t1;
  ListExpr t2;
  int i1 = listutils::findAttribute(attrList,a1,t1);
  if(i1==0){
    return listutils::typeError("Attribute " + a1 +
                                " not known in the tuple");
  }
  int i2 = listutils::findAttribute(attrList,a2,t2);
  if(i2==0){
    return listutils::typeError("Attribute " + a2 +
                                " not known in the tuple");
  }
  if(!FText::checkType(t1)){
    return listutils::typeError(a1 + " not of type text");
  }
  if(!TupleIdentifier::checkType(t2)){
    return listutils::typeError(a2 + " not of type " +
                                TupleIdentifier::BasicType());
  }

  // findAttribute counts from 1, the value mapping needs 0 based indexes
  ListExpr appendList = nl->TwoElemList( nl->IntAtom(i1-1),
                                         nl->IntAtom(i2-1));
  if(shortForm){
    // no options given: append defaults for all four of them
    ListExpr allDefaults = nl->FourElemList(
                              nl->BoolAtom(false),
                              nl->IntAtom(1),
                              nl->TextAtom(""),
                              nl->TwoElemList(
                                 listutils::basicSymbol<CcString>(),
                                 listutils::getUndefined()));
    appendList = listutils::concat(allDefaults, appendList);
  } else {
    const bool optionsOk = CcBool::checkType(nl->Fourth(args))
                        && CcInt::checkType(nl->Fifth(args))
                        && FText::checkType(nl->Sixth(args));
    if(!optionsOk){
      return listutils::typeError(err);
    }
    if(!fullForm){
      // separators missing: append an undefined string
      ListExpr undefSeparators = nl->OneElemList(
                 nl->TwoElemList( listutils::basicSymbol<CcString>(),
                                  listutils::getUndefined()));
      appendList = listutils::concat(undefSeparators, appendList);
    } else if(!CcString::checkType(nl->Seventh(args))){
      return listutils::typeError(err +
                                  " (7. arg is not of type string)");
    }
  }
  return nl->ThreeElemList( nl->SymbolAtom(Symbols::APPEND()),
                            appendList,
                            nl->SymbolAtom(InvertedFile::BasicType()));
}
/*
2.5.2 Value Mapping
*/
// Value mapping of createInvFile.
// For OPEN, CLOSE and REQUEST the complete inverted file is built from the
// tuple stream. args[3..6] hold the options (ignoreCase, minimum word
// length, stop words, separators) and args[7], args[8] the indexes of the
// text and the tid attribute, as arranged by createInvFileTM.
// REQUESTPROGRESS forwards the progress information of the input stream.
// All other messages return 0 without doing anything.
int createInvFileVM(Word* args, Word& result, int message,
                    Word& local, Supplier s){
  if(message == REQUESTPROGRESS){
    ProgressInfo sonProgress;
    ProgressInfo* target = static_cast<ProgressInfo*>(result.addr);
    if( !qp->RequestProgress(args[0].addr, &sonProgress) ){
      return CANCEL;
    }
    target->Copy(sonProgress);
    return YIELD;
  }
  if(message != OPEN && message != CLOSE && message != REQUEST){
    // CLOSEPROGRESS and unknown messages: nothing to do
    return 0;
  }

  Stream<Tuple> stream(args[0]);
  int textIndex = static_cast<CcInt*>(args[7].addr)->GetValue();
  int tidIndex  = static_cast<CcInt*>(args[8].addr)->GetValue();
  result = qp->ResultStorage(s);
  InvertedFile* invFile = static_cast<InvertedFile*>(result.addr);

  // options; undefined values fall back to the defaults
  CcBool* caseArg = static_cast<CcBool*>(args[3].addr);
  bool ignoreCase = caseArg->IsDefined() ? caseArg->GetValue() : false;

  CcInt* lengthArg = static_cast<CcInt*>(args[4].addr);
  int minWL = 1;
  if(lengthArg->IsDefined() && lengthArg->GetValue()>0){
    minWL = lengthArg->GetValue();
  }

  FText* stopArg = static_cast<FText*>(args[5].addr);
  string stopWords = "";
  if(stopArg->IsDefined()){
    stopWords = stopArg->GetValue();
  }

  CcString* sepArg = static_cast<CcString*>(args[6].addr);
  if(!sepArg->IsDefined()){
    invFile->setParams(ignoreCase, minWL, stopWords,
                       InvertedFile::getDefaultSeparators());
  } else {
    invFile->setParams(ignoreCase, minWL, stopWords,
                       sepArg->GetValue());
  }

  stream.open();

  // split the available memory: one twentieth (at least 4096) for the
  // trie cache, the remainder (at least 4096) for the append cache
  size_t maxMem = qp->GetMemorySize(s) * 1024*1024;
  size_t trieCacheSize = maxMem / 20;
  if(trieCacheSize < 4096){
    trieCacheSize = 4096;
  }
  size_t invFileCacheSize = (trieCacheSize + 4096 > maxMem)
                          ? 4096
                          : maxMem - trieCacheSize;

  appendcache::RecordAppendCache* cache =
                      invFile->createAppendCache(invFileCacheSize);
  TrieNodeCacheType* trieCache =
                      invFile->createTrieCache(trieCacheSize);

  for(Tuple* tuple = stream.request(); tuple != 0;
      tuple = stream.request()){
    FText* text = static_cast<FText*>(tuple->GetAttribute(textIndex));
    TupleIdentifier* tid =
        static_cast<TupleIdentifier*>(tuple->GetAttribute(tidIndex));
    // tuples with an undefined text or tid are skipped
    if(text->IsDefined() && tid->IsDefined()){
      invFile->insertText(tid->GetTid() , text->GetValue(),
                          cache, trieCache);
    }
    tuple->DeleteIfAllowed();
  }
  stream.close();
  delete cache;
  delete trieCache;
  return 0;
}
/*
2.5.3 Specification
*/
// Specification (signature, syntax, meaning, example, comment) and
// operator instance of createInvFile.
// Compared to the earlier text, the signature is balanced, all four
// optional arguments are mentioned and the sentences are separated.
const string createInvFileSpec =
"( ( \"Signature\" \"Syntax\" \"Meaning\" "
"\"Example\" \"Comment\" ) "
"(<text> stream(tuple(...)) x a_i x a_j "
"[[ ignoreCase, minWordLength, stopWords], separators] -> invfile </text--->"
"<text> _ createInvFile[_, _, _, _ , _,_] </text--->"
"<text>creates an inverted file from a stream. "
" a_i must be of type text, a_j must be of type tid."
" The last four arguments are optional."
" If ignoreCase is set to true, upper and lower case is ignored."
" minWordLength is of type int and describes the minimum word length"
" for indexing (default 1)."
" Stopwords is a text containing words which should not be indexed."
" The separators argument specifies all characters which are to be used as "
"word separators. For the default, use the defaultInvFileSeparators "
"operator."
"</text--->"
"<text>query SEC2OPERATORINFO feed addid "
"createInvFile[Signature, TID] </text--->"
"<text></text--->"
") )";
Operator createInvFile (
"createInvFile" , // name
createInvFileSpec, // specification
createInvFileVM, // value mapping
Operator::SimpleSelect, // trivial selection function
createInvFileTM);
/*
\subsection{Operator insertInvFile}
\subsubsection{Type Mapping}
Signature is stream(tuple) x invfile x a1 x a2 -> stream(tuple)
a1 must be of type text
a2 must be of type tid
*/
// Type mapping of insertInvFile.
// Expects stream(tuple) x invfile x a_i x a_j where a_i names a text
// attribute and a_j a tid attribute of the stream's tuples.
// The result type is the input stream type; the zero based indexes of
// both attributes are handed to the value mapping via APPEND.
ListExpr insertInvFileTM(ListExpr args){
  const string err = "stream(tuple) x invfile x a_i x a_j expected";
  if (!nl->HasLength(args, 4)) {
    return listutils::typeError(err + " (wrong number of arguments)");
  }

  ListExpr streamType = nl->First(args);
  if (!Stream<Tuple>::checkType(streamType)) {
    return listutils::typeError(err + " (first arg is not a tuple stream)");
  }
  if (!InvertedFile::checkType(nl->Second(args))) {
    return listutils::typeError(err + " (second arg is not an inverted file)");
  }

  ListExpr textName = nl->Third(args);
  ListExpr tidName  = nl->Fourth(args);
  if ( !(listutils::isSymbol(textName) && listutils::isSymbol(tidName)) ) {
    return listutils::typeError(err +
                                " (one of the attribute names is invalid)");
  }

  ListExpr attrList = nl->Second(nl->Second(streamType));
  string a1 = nl->SymbolValue(textName);
  string a2 = nl->SymbolValue(tidName);
  ListExpr t1;
  ListExpr t2;
  int i1 = listutils::findAttribute(attrList, a1, t1);
  if (i1==0) {
    return listutils::typeError("Attribute " + a1 + " not found in the tuple");
  }
  int i2 = listutils::findAttribute(attrList, a2, t2);
  if (i2==0) {
    return listutils::typeError("Attribute " + a2 + " not known in the tuple");
  }
  if(!FText::checkType(t1)){
    return listutils::typeError(a1 + " not of type text");
  }
  if(!TupleIdentifier::checkType(t2)){
    return listutils::typeError(a2 + " not of type " +
                                TupleIdentifier::BasicType());
  }

  // findAttribute counts from 1, the value mapping needs 0 based indexes
  ListExpr indexes = nl->TwoElemList(nl->IntAtom(i1 - 1),
                                     nl->IntAtom(i2 - 1));
  return nl->ThreeElemList(nl->SymbolAtom(Symbols::APPEND()),
                           indexes,
                           streamType);
}
/*
\subsubsection{Value Mapping}
*/
// Value mapping of insertInvFile (stream operator).
//   OPEN            creates the local info from the stream (args[0]), the
//                   inverted file (args[1]) and the attribute indexes
//                   appended by the type mapping (args[4], args[5]);
//                   the second son is marked as modified.
//   REQUEST         returns the next tuple of the local info or CANCEL.
//   CLOSE           destroys the local info.
//   REQUESTPROGRESS forwards the progress information of the input stream.
//   CLOSEPROGRESS   nothing to do.
// CLOSE now returns explicitly. Without the return it fell through into
// REQUESTPROGRESS and treated result.addr as a ProgressInfo.
int insertInvFileVM(Word* args, Word& result, int message,
                    Word& local, Supplier s) {
  InvFileInsertLI *li = (InvFileInsertLI*)local.addr;
  switch (message) {
    case OPEN: {
      // drop a local info left over from an earlier OPEN
      if (li) {
        delete li;
        local.addr = 0;
      }
      Stream<Tuple> stream = (Stream<Tuple>)args[0].addr;
      InvertedFile *inv = (InvertedFile*)args[1].addr;
      int textIndex = ((CcInt*)args[4].addr)->GetValue();
      int tidIndex = ((CcInt*)args[5].addr)->GetValue();
      size_t maxMem = qp->GetMemorySize(s) * 1024*1024;
      li = new InvFileInsertLI(stream, inv, textIndex, tidIndex, maxMem);
      qp->SetModified(qp->GetSon(s, 1));
      local.addr = li;
      return 0;
    }
    case REQUEST: {
      result.addr = li ? li->nextTuple() : 0;
      return result.addr ? YIELD : CANCEL;
    }
    case CLOSE: {
      if (li) {
        delete li;
        local.addr = 0;
      }
      return 0;
    }
    case REQUESTPROGRESS: {
      ProgressInfo p1;
      ProgressInfo* pRes;
      pRes = (ProgressInfo*) result.addr;
      if (qp->RequestProgress(args[0].addr, &p1)) {
        pRes->Copy(p1);
        return YIELD;
      }
      else {
        return CANCEL;
      }
    }
    case CLOSEPROGRESS: {
      return 0;
    }
  }
  return 0;
}
/*
2.5.3 Specification
*/
// Specification of insertInvFile as a nested list text: the captions
// followed by signature, syntax, meaning, example and comment
// (example and comment are empty).
const string insertInvFileSpec =
"( ( \"Signature\" \"Syntax\" \"Meaning\" "
"\"Example\" \"Comment\" ) "
"(<text> stream(tuple(...)) x invfile x a_i x a_j -> stream(tuple(...))"
"</text--->"
"<text> _ _ insertInvFile[ _, _ ] </text--->"
"<text>inserts an attribute of a tuple stream into an inverted file."
" a_i must be of type text, a_j must be of type tid."
"</text--->"
"<text></text--->"
"<text></text--->"
") )";
// Operator instance combining specification, value mapping and type
// mapping of insertInvFile.
Operator insertInvFile (
"insertInvFile" , // name
insertInvFileSpec, // specification
insertInvFileVM, // value mapping
Operator::SimpleSelect, // trivial selection function
insertInvFileTM);
/*
2.6 Operator searchWord
2.6.1 Type Mapping
Signature : invfile x (string|text) ->
stream(tuple([Tid : tid, WordPos : int, CharPos : int]))
*/
// Type mapping of searchWord.
// Expects invfile x (string | text) and yields
// stream(tuple([Tid : tid, WordPos : int, CharPos : int])).
ListExpr searchWordTM(ListExpr args){
  const string err = "invfile x (string | text) expected" ;
  if(!nl->HasLength(args,2)){
    return listutils::typeError(err);
  }
  const bool fileOk = InvertedFile::checkType(nl->First(args));
  const bool wordOk = fileOk
                   && ( CcString::checkType(nl->Second(args))
                        || FText::checkType(nl->Second(args)));
  if(!wordOk){
    return listutils::typeError(err);
  }

  ListExpr tidAttr = nl->TwoElemList(
                        nl->SymbolAtom("Tid"),
                        nl->SymbolAtom(TupleIdentifier::BasicType()));
  ListExpr wordPosAttr = nl->TwoElemList(
                        nl->SymbolAtom("WordPos"),
                        nl->SymbolAtom(CcInt::BasicType()));
  ListExpr charPosAttr = nl->TwoElemList(
                        nl->SymbolAtom("CharPos"),
                        nl->SymbolAtom(CcInt::BasicType()));
  ListExpr attrList = nl->ThreeElemList(tidAttr, wordPosAttr, charPosAttr);

  ListExpr tupleType = nl->TwoElemList(
                        nl->SymbolAtom(Tuple::BasicType()),
                        attrList);
  return nl->TwoElemList( nl->SymbolAtom(Stream<Tuple>::BasicType()),
                          tupleType);
}
/*
2.6.1 LocalInfo
*/
// Local info of searchWord: owns the tuple type of the result stream and
// the exact iterator obtained from the inverted file for one word.
class searchWordLocalInfo{
  public:
    // Creates the tuple type from typeList and requests an exact
    // iterator for word, passing mem on to the inverted file.
    searchWordLocalInfo( InvertedFile* inv, string word,
                         ListExpr typeList, size_t mem)
      : tt(new TupleType(typeList)),
        it(inv->getExactIterator(word, mem)) {
    }

    ~searchWordLocalInfo(){
      tt->DeleteIfAllowed();
      delete it;
    }

    // Returns the next result tuple (Tid, WordPos, CharPos) or 0 when the
    // iterator is exhausted.
    Tuple* next(){
      TupleId tid;
      uint32_t wordPos;
      uint32_t charPos;
      if(!it->next(tid, wordPos, charPos)){
        return 0;
      }
      Tuple* out = new Tuple(tt);
      out->PutAttribute(0, new TupleIdentifier(true, tid));
      out->PutAttribute(1, new CcInt(true, wordPos));
      out->PutAttribute(2, new CcInt(true, charPos));
      return out;
    }

  private:
    TupleType* tt;
    InvertedFile::exactIterator* it;
};
/*
2.6.2 Value Mapping
*/
// Value mapping of searchWord; T is the type of the search argument
// (CcString or FText, see searchWordVMs).
//   OPEN    creates the local info if the search value is defined.
//   REQUEST returns the next result tuple, CANCEL if there is none or no
//           local info exists.
//   CLOSE   destroys the local info.
// Changes to the earlier version:
//   - OPEN clears local.addr after deleting an old local info. Before, an
//     undefined search value left a dangling pointer behind that was used
//     by REQUEST and deleted a second time by CLOSE.
//   - CLOSE returns 0, as prefixCountVM does, instead of running into the
//     final return -1.
template<class T>
int searchWordVM(Word* args, Word& result, int message, Word& local,
                 Supplier s) {
  searchWordLocalInfo* li = (searchWordLocalInfo*) local.addr;
  switch(message){
    case OPEN : {
      if(li){
        delete li;
        local.addr = 0;
      }
      InvertedFile* iv = (InvertedFile*) args[0].addr;
      T* searchObj = (T*) args[1].addr;
      ListExpr type = nl->Second(GetTupleResultType(s));
      size_t memBuffer = 4096;
      if(searchObj->IsDefined()){
        local.addr = new searchWordLocalInfo(iv,
                               searchObj->GetValue(), type, memBuffer);
      }
      return 0;
    }
    case REQUEST : {
      if(!li){
        return CANCEL;
      }
      result.addr=li->next();
      return result.addr?YIELD:CANCEL;
    }
    case CLOSE : {
      if(li){
        delete li;
        local.addr = 0;
      }
      return 0;
    }
  }
  return -1;
}
// Value mappings of searchWord; the index is chosen by searchWordSelect
// (0: string argument, 1: text argument).
ValueMapping searchWordVMs[] = {searchWordVM<CcString>, searchWordVM<FText>};
// Selection function of searchWord: 0 if the second argument is a string,
// 1 if it is a text, -1 otherwise.
int searchWordSelect(ListExpr args) {
  if (CcString::checkType(nl->Second(args))) {
    return 0;
  }
  return FText::checkType(nl->Second(args)) ? 1 : -1;
}
/*
2.6.3 Specification
*/
// Specification and operator instance of searchWord.
// The signature now closes the attribute list and uses the attribute
// name Tid produced by searchWordTM; "Retrives" has been corrected.
const string searchWordSpec =
"( ( \"Signature\" \"Syntax\" \"Meaning\" "
"\"Example\" \"Comment\" ) "
"(<text> invfile x (string | text) -> stream(tuple([Tid : tid, "
"WordPos : int, CharPos : int])) </text--->"
"<text> _ searchWord [_] </text--->"
"<text>Retrieves the information stored in an inverted file "
" for the given string or text"
"</text--->"
"<text>query SEC2OPERATORINFO feed addid createInvFile[Signature, TID] "
" searchWord[\"string\"] count"
"</text--->"
"<text></text--->"
") )";
Operator searchWord (
"searchWord" , // name
searchWordSpec, // specification
2, // number of value mappings
searchWordVMs, // value mapping
searchWordSelect, // selection function
searchWordTM);
/*
2.6 Operator searchPrefix
2.6.1 Type Mapping
Signature : invfile x string ->
stream(tuple([Word : text, Tid : tid, WordPos : int, CharPos : int]))
*/
// Type mapping of searchPrefix.
// Expects invfile x string and yields
// stream(tuple([Word : text, Tid : tid, WordPos : int, CharPos : int])).
ListExpr searchPrefixTM(ListExpr args){
  const string err = "invfile x string expected" ;
  if(!nl->HasLength(args,2)){
    return listutils::typeError(err);
  }
  const bool argsOk = InvertedFile::checkType(nl->First(args))
                   && CcString::checkType(nl->Second(args));
  if(!argsOk){
    return listutils::typeError(err);
  }

  ListExpr wordAttr = nl->TwoElemList(
                        nl->SymbolAtom("Word"),
                        nl->SymbolAtom(FText::BasicType()));
  ListExpr tidAttr = nl->TwoElemList(
                        nl->SymbolAtom("Tid"),
                        nl->SymbolAtom(TupleIdentifier::BasicType()));
  ListExpr wordPosAttr = nl->TwoElemList(
                        nl->SymbolAtom("WordPos"),
                        nl->SymbolAtom(CcInt::BasicType()));
  ListExpr charPosAttr = nl->TwoElemList(
                        nl->SymbolAtom("CharPos"),
                        nl->SymbolAtom(CcInt::BasicType()));
  ListExpr attrList = nl->FourElemList(wordAttr, tidAttr,
                                       wordPosAttr, charPosAttr);

  ListExpr tupleType = nl->TwoElemList(
                        nl->SymbolAtom(Tuple::BasicType()),
                        attrList);
  return nl->TwoElemList( nl->SymbolAtom(Stream<Tuple>::BasicType()),
                          tupleType);
}
/*
2.6.1 LocalInfo
*/
// Local info of searchPrefix: owns the tuple type of the result stream
// and the prefix iterator obtained from the inverted file.
class searchPrefixLocalInfo{
  public:
    // Creates the tuple type from typeList and requests a prefix
    // iterator for word.
    searchPrefixLocalInfo( InvertedFile* inv, string word, ListExpr typeList)
      : tt(new TupleType(typeList)),
        it(inv->getPrefixIterator(word)) {
    }

    ~searchPrefixLocalInfo(){
      tt->DeleteIfAllowed();
      delete it;
    }

    // Returns the next result tuple (Word, Tid, WordPos, CharPos) or 0
    // when the iterator is exhausted.
    Tuple* next(){
      string found;
      TupleId tid;
      uint32_t wordPos;
      uint32_t charPos;
      if(!it->next(found, tid, wordPos, charPos)){
        return 0;
      }
      Tuple* out = new Tuple(tt);
      out->PutAttribute(0, new FText(true, found));
      out->PutAttribute(1, new TupleIdentifier(true, tid));
      out->PutAttribute(2, new CcInt(true, wordPos));
      out->PutAttribute(3, new CcInt(true, charPos));
      return out;
    }

  private:
    TupleType* tt;
    InvertedFile::prefixIterator* it;
};
/*
2.6.2 Value Mapping
*/
// Value mapping of searchPrefix.
//   OPEN    creates the local info if the prefix is defined.
//   REQUEST returns the next result tuple, CANCEL if there is none or no
//           local info exists.
//   CLOSE   destroys the local info.
// Changes to the earlier version:
//   - OPEN clears local.addr after deleting an old local info. Before, an
//     undefined prefix left a dangling pointer behind that was used by
//     REQUEST and deleted a second time by CLOSE.
//   - CLOSE returns 0, as prefixCountVM does, instead of running into the
//     final return -1.
int searchPrefixVM(Word* args, Word& result, int message,
                   Word& local, Supplier s){
  searchPrefixLocalInfo* li = (searchPrefixLocalInfo*) local.addr;
  switch(message){
    case OPEN : {
      if(li){
        delete li;
        local.addr = 0;
      }
      InvertedFile* iv = (InvertedFile*) args[0].addr;
      CcString* cstr = (CcString*) args[1].addr;
      ListExpr type = nl->Second(GetTupleResultType(s));
      if(cstr->IsDefined()){
        local.addr = new searchPrefixLocalInfo(iv,
                                   cstr->GetValue(), type);
      }
      return 0;
    }
    case REQUEST : {
      if(!li){
        return CANCEL;
      }
      result.addr=li->next();
      return result.addr?YIELD:CANCEL;
    }
    case CLOSE : {
      if(li){
        delete li;
        local.addr = 0;
      }
      return 0;
    }
  }
  return -1;
}
/*
2.6.3 Specification
*/
// Specification and operator instance of searchPrefix.
// The signature now matches searchPrefixTM (Word is a text, the tid
// attribute is called Tid) and closes the attribute list.
const string searchPrefixSpec =
"( ( \"Signature\" \"Syntax\" \"Meaning\" "
"\"Example\" \"Comment\" ) "
"(<text> invfile x string -> stream(tuple([ Word : text, Tid : tid, "
"WordPos : int, CharPos : int])) </text--->"
"<text> _ searchPrefix [_] </text--->"
"<text>Retrieves the information stored in an inverted file "
" for the given prefix"
"</text--->"
"<text>query SEC2OPERATORINFO feed addid createInvFile[Signature, TID] "
" searchPrefix[\"stri\"] count"
"</text--->"
"<text></text--->"
") )";
Operator searchPrefix (
"searchPrefix" , // name
searchPrefixSpec, // specification
searchPrefixVM, // value mapping
Operator::SimpleSelect, // trivial selection function
searchPrefixTM);
/*
2.7 Operator getFileInfo
2.7.1 Type Mapping
The signature is invfile -> text
*/
// Type mapping of getFileInfo: exactly one argument of type invfile is
// accepted, the result type is text.
ListExpr getFileInfoTM(ListExpr args){
  const string err = " invfile expected " ;
  const bool ok = nl->HasLength(args,1)
               && InvertedFile::checkType(nl->First(args));
  if(!ok){
    return listutils::typeError(err);
  }
  return listutils::basicSymbol<FText>();
}
/*
2.7.2 ValueMapping
*/
// Value mapping of getFileInfo; T is the type of the index structure.
// Collects the file information of the argument and writes it into the
// result text, one "key : value" pair per line.
template<class T>
int getFileInfoVM(Word* args, Word& result, int message,
                  Word& local, Supplier s){
  result = qp->ResultStorage(s);
  FText* res = static_cast<FText*>(result.addr);

  SmiStatResultType info;
  static_cast<T*>(args[0].addr)->getFileInfo(info);

  stringstream out;
  unsigned int pos = 0;
  while(pos < info.size()){
    out << info[pos].first << " : " << info[pos].second << endl;
    pos++;
  }
  res->Set(true, out.str());
  return 0;
}
/*
2.7.4 Specification
*/
// Specification of getFileInfo: signature, syntax, meaning and example.
OperatorSpec getFileInfoSpec(
"{invfile} -> text",
"getFileInfo(_)",
"Returns information about the underlying files of"
" an index structure",
" query getFileInfo(iv1) " );
/*
2.7.5 Operator Instance
*/
// The value mapping template is instantiated for InvertedFile.
Operator getFileInfo
(
"getFileInfo", //name
getFileInfoSpec.getStr(), //specification
getFileInfoVM<InvertedFile>, //value mapping
Operator::SimpleSelect, //trivial selection function
getFileInfoTM //type mapping
);
/*
2.8 wordCount
This operator returns how often a certain word occurs within the
whole indexed relation.
2.8.1 typeMapping
Signature : invfile x string -> int
*/
// Type mapping of wordCount: invfile x string -> int.
ListExpr wordCountTM(ListExpr args){
  const string err ="invfile x string expected";
  const bool ok = nl->HasLength(args,2)
               && InvertedFile::checkType(nl->First(args))
               && CcString::checkType(nl->Second(args));
  if(!ok){
    return listutils::typeError(err);
  }
  return listutils::basicSymbol<CcInt>();
}
/*
2.8.2 Value Mapping
*/
// Value mapping of wordCount: stores the count reported by the inverted
// file for the given word; the result is undefined for an undefined word.
int wordCountVM(Word* args, Word& result, int message,
                Word& local, Supplier s){
  result = qp->ResultStorage(s);
  CcInt* res = static_cast<CcInt*>(result.addr);
  InvertedFile* iv = static_cast<InvertedFile*>(args[0].addr);
  CcString* word = static_cast<CcString*>(args[1].addr);

  if(word->IsDefined()){
    res->Set(true, iv->wordCount(word->GetValue()));
  } else {
    res->SetDefined(false);
  }
  return 0;
}
/*
2.8.3 Specification
*/
// Specification of wordCount: signature, syntax, meaning and example.
// The example now closes the bracket of the operator call and the typo
// "ofter" has been corrected.
OperatorSpec wordCountSpec(
" invfile x string -> int",
" _ wordCount[_]",
" Returns how often a word is indexed.",
" query iv wordCount[\"secondo\"] " );
/*
2.8.4 Operator Instance
*/
Operator wordCount (
"wordCount" , // name
wordCountSpec.getStr(), // specification
wordCountVM, // value mapping
Operator::SimpleSelect, // trivial selection function
wordCountTM);
/*
2.9 prefixCount
2.9.1 Type Mapping
Signature : invfile x string -> stream(tuple([Word : text, Count : int]))
*/
// Type mapping of prefixCount:
// invfile x string -> stream(tuple([Word : text, Count : int])).
ListExpr prefixCountTM(ListExpr args){
  const string err ="invfile x string expected";
  const bool ok = nl->HasLength(args,2)
               && InvertedFile::checkType(nl->First(args))
               && CcString::checkType(nl->Second(args));
  if(!ok){
    return listutils::typeError(err);
  }

  ListExpr wordAttr = nl->TwoElemList(
                         nl->SymbolAtom("Word"),
                         listutils::basicSymbol<FText>());
  ListExpr countAttr = nl->TwoElemList(
                         nl->SymbolAtom("Count"),
                         listutils::basicSymbol<CcInt>());
  ListExpr attrList = nl->TwoElemList(wordAttr, countAttr);

  ListExpr streamSymbol = listutils::basicSymbol<Stream<Tuple> >();
  ListExpr tupleType = nl->TwoElemList(
                         listutils::basicSymbol<Tuple>(),
                         attrList);
  return nl->TwoElemList(streamSymbol, tupleType);
}
/*
2.9.2 Value Mapping
*/
// Local info of prefixCount: owns the tuple type of the result stream and
// the count prefix iterator of the inverted file. Both stay 0 if the
// prefix is undefined.
class prefixCountLI{
  public:
    // Creates iterator and tuple type only for a defined prefix.
    prefixCountLI(InvertedFile* iv, CcString* str, ListExpr type): tt(0),it(0){
      if(str->IsDefined()){
        it = iv->getCountPrefixIterator(str->GetValue());
        tt = new TupleType(type);
      }
    }

    ~prefixCountLI(){
      if(it){
        delete it;
      }
      if(tt){
        tt->DeleteIfAllowed();
      }
    }

    // Returns the next (Word, Count) tuple or 0 if there is none.
    // Without an iterator (undefined prefix) the stream is empty; the
    // earlier version dereferenced the null iterator in that case.
    Tuple* next(){
      if(!it){
        return 0;
      }
      string s;
      size_t c = 0;
      if(!it->next(s,c)){
        return 0;
      }
      Tuple* res = new Tuple(tt);
      res->PutAttribute(0, new FText(true,s));
      res->PutAttribute(1, new CcInt(true,c));
      return res;
    }

  private:
    TupleType* tt;
    InvertedFile::countPrefixIterator* it;
};
// Value mapping of prefixCount (stream operator).
//   OPEN    replaces the local info by a new one for the given prefix.
//   REQUEST returns the next tuple of the local info or CANCEL.
//   CLOSE   destroys the local info.
// Any other message yields -1.
int prefixCountVM(Word* args, Word& result, int message,
                  Word& local, Supplier s){
  prefixCountLI* info = static_cast<prefixCountLI*>(local.addr);

  if(message == OPEN){
    delete info;   // no effect if there is no old local info
    InvertedFile* iv = static_cast<InvertedFile*>(args[0].addr);
    CcString* prefix = static_cast<CcString*>(args[1].addr);
    local.addr = new prefixCountLI(iv, prefix,
                                   nl->Second(GetTupleResultType(s)));
    return 0;
  }

  if(message == REQUEST){
    if(!info){
      return CANCEL;
    }
    result.addr = info->next();
    return result.addr ? YIELD : CANCEL;
  }

  if(message == CLOSE){
    if(info){
      delete info;
      local.addr = 0;
    }
    return 0;
  }

  return -1;
}
/*
9.2.3 Specification
*/
// Specification of prefixCount: signature, syntax, meaning and example.
// The example now uses the real operator name (it read prefixCountCount
// before) and the meaning has been corrected grammatically.
OperatorSpec prefixCountSpec(
" invfile x string -> stream(tuple([Word : text, Count : int]))",
" _ prefixCount[_]",
" Returns how often words starting with a prefix are indexed.",
" query iv prefixCount[\"secondo\" ] tconsume " );
/*
9.2.4 Operator instance
*/
Operator prefixCount (
"prefixCount" , // name
prefixCountSpec.getStr(), // specification
prefixCountVM, // value mapping
Operator::SimpleSelect, // trivial selection function
prefixCountTM);
/*
9.3 Operator ~getInvFileSeparators~
Returns the default characters used to tokenize texts within the
InvFile data type.
*/
// Type mapping of defaultInvFileSeparators: no arguments -> string.
ListExpr getInvFileSeparatorsTM(ListExpr args){
  if(nl->IsEmpty(args)){
    return listutils::basicSymbol<CcString>();
  }
  return listutils::typeError("no arguments expected");
}
// Value mapping of defaultInvFileSeparators: stores the default
// separators of InvertedFile in the result string.
int getInvFileSeparatorsVM(Word* args, Word& result, int message,
                           Word& local, Supplier s){
  result = qp->ResultStorage(s);
  static_cast<CcString*>(result.addr)
        ->Set(true, InvertedFile::getDefaultSeparators());
  return 0;
}
// Specification of defaultInvFileSeparators: signature, syntax, meaning
// and example.
OperatorSpec getInvFileSeparatorsSpec(
" -> string",
" defaultInvFileSeparators()",
" Returns all characters used by Inverted Files to "
"tokenize texts as a single string by default",
" query defaultInvFileSeparators()" );
// The C++ object is called getInvFileSeparators, the operator is
// registered under the name defaultInvFileSeparators.
Operator getInvFileSeparators (
"defaultInvFileSeparators" , // name
getInvFileSeparatorsSpec.getStr(), // specification
getInvFileSeparatorsVM, // value mapping
Operator::SimpleSelect, // trivial selection function
getInvFileSeparatorsTM);
/*
9.4 getSeparators
*/
// Type mapping of getSeparators: invfile -> string.
ListExpr getSeparatorsTM(ListExpr args){
  const string err = "excpected " + InvertedFile::BasicType();
  const bool ok = nl->HasLength(args,1)
               && InvertedFile::checkType(nl->First(args));
  if(!ok){
    return listutils::typeError(err);
  }
  return listutils::basicSymbol<CcString>();
}
// Value mapping of getSeparators: stores the separators reported by the
// given inverted file in the result string.
int getSeparatorsVM(Word* args, Word& result, int message,
                    Word& local, Supplier s){
  result = qp->ResultStorage(s);
  CcString* res = static_cast<CcString*>(result.addr);
  InvertedFile* file = static_cast<InvertedFile*>(args[0].addr);
  res->Set(true, file->getSeparators());
  return 0;
}
// Specification of getSeparators: signature, syntax, meaning and example.
OperatorSpec getSeparatorsSpec(
" invfile -> string",
" getSeparators(_)",
" Returns all characters used by the inverted file to "
"tokenize texts as a single string",
" query getSeparators(inv)" );
// Operator instance of getSeparators.
Operator getSeparators (
"getSeparators" , // name
getSeparatorsSpec.getStr(), // specification
getSeparatorsVM, // value mapping
Operator::SimpleSelect, // trivial selection function
getSeparatorsTM);
/*
9.5 Operators ~containsWord~ and ~containsPrefix~
These operators check whether a text or a string contains
a given word or a word prefix.
9.5.1 Type Mapping
These operators get two string or text values and optionally a
boolean value. They check whether the first argument contains the
word or the word start given by the second argument. The boolean
argument specifies whether the check should be case sensitive. The
default value is false. For separating the first text into words,
the default separators of the inverted file are used.
*/
// Type mapping shared by containsWord and containsPrefix:
// {string,text} x {string,text} [x bool] -> bool.
// If the bool is omitted, FALSE is appended for the value mapping.
ListExpr containsWordOrPrefixTM(ListExpr args){
  const string err = "{string,text} x {string, text} [ x bool] expected";
  const bool twoArgs = nl->HasLength(args,2);
  if(!twoArgs && !nl->HasLength(args,3)){
    return listutils::typeError(err);
  }

  ListExpr subject = nl->First(args);
  if( !(CcString::checkType(subject) || FText::checkType(subject)) ){
    return listutils::typeError(err);
  }
  ListExpr pattern = nl->Second(args);
  if( !(CcString::checkType(pattern) || FText::checkType(pattern)) ){
    return listutils::typeError(err);
  }

  if(twoArgs){
    return nl->ThreeElemList( nl->SymbolAtom(Symbols::APPEND()),
                              nl->OneElemList(nl->BoolAtom(false)),
                              listutils::basicSymbol<CcBool>());
  }
  if(CcBool::checkType(nl->Third(args))){
    return listutils::basicSymbol<CcBool>();
  }
  return listutils::typeError(err);
}
/*
9.5.2 Value Mapping
*/
// Value mapping shared by containsWord and containsPrefix.
// T and W are the types of the first and the second argument; prefix
// selects between prefix match (true) and exact word match (false).
// The first argument is split using the default separators of
// InvertedFile and each token is compared with the second argument.
// The result is undefined if any argument is undefined and FALSE for an
// empty search word.
template<class T, class W, bool prefix>
int containsWordOrPrefixVM(Word* args, Word& result, int message,
                           Word& local, Supplier s){
  T* text = static_cast<T*>(args[0].addr);
  W* word = static_cast<W*>(args[1].addr);
  CcBool* cs = static_cast<CcBool*>(args[2].addr);
  result = qp->ResultStorage(s);
  CcBool* res = static_cast<CcBool*>(result.addr);

  if( !(text->IsDefined() && word->IsDefined() && cs->IsDefined()) ){
    res->SetDefined(false);
    return 0;
  }

  string haystack = text->GetValue();
  string needle = word->GetValue();
  if(!cs->GetValue()){
    // NOTE(review): the search word is trimmed only in this
    // case-insensitive branch - confirm that this is intended.
    stringutils::toLower(haystack);
    stringutils::toLower(needle);
    stringutils::trim(needle);
  }

  res->Set(true,false);
  if(needle.length()==0){
    return 0;
  }

  stringutils::StringTokenizer tokens(haystack,
                                InvertedFile::getDefaultSeparators());
  bool found = false;
  while(!found && tokens.hasNextToken()){
    string token = tokens.nextToken();
    if(prefix){
      found = stringutils::startsWith(token, needle);
    } else {
      found = (token==needle);
    }
  }
  if(found){
    res->Set(true,true);
  }
  return 0;
}
/*
9.5.3 Value Mapping array and Selection
*/
// Value mappings of containsWord (prefix = false). The position in the
// array is computed by containsWordOrPrefixSelect: first argument
// string/text contributes 0/2, second argument string/text 0/1.
ValueMapping containsWordVM[] = {
containsWordOrPrefixVM<CcString,CcString,false>,
containsWordOrPrefixVM<CcString,FText,false>,
containsWordOrPrefixVM<FText,CcString,false>,
containsWordOrPrefixVM<FText,FText,false>,
};
// Value mappings of containsPrefix (prefix = true), same order as above.
ValueMapping containsPrefixVM[] = {
containsWordOrPrefixVM<CcString,CcString,true>,
containsWordOrPrefixVM<CcString,FText,true>,
containsWordOrPrefixVM<FText,CcString,true>,
containsWordOrPrefixVM<FText,FText,true>,
};
// Selection function of containsWord and containsPrefix.
// A text as first argument adds 2, a text as second argument adds 1.
int containsWordOrPrefixSelect(ListExpr args){
  int index = 0;
  if(!CcString::checkType(nl->Second(args))){
    index += 1;
  }
  if(!CcString::checkType(nl->First(args))){
    index += 2;
  }
  return index;
}
/*
9.5.4 Specification
*/
// Specification of containsWord: signature, syntax, meaning and example.
OperatorSpec containsWordSpec(
" {string,text} x {string,text} [ x bool] -> bool",
" containsWord(_,_,_)",
" Checks whether the first argument contains a word given "
" by the second argument. The optional third argument determines"
" whether the checking is case sensitive.",
" query containsWord('Secondo is great','Secondo',FALSE)" );
// Specification of containsPrefix: signature, syntax, meaning and example.
// Syntax and example now name the operator as it is registered
// ("containsPrefix"); before, the syntax was spelled "containsprefix" and
// the example called containsWord.
OperatorSpec containsPrefixSpec(
" {string,text} x {string,text} [ x bool] -> bool",
" containsPrefix(_,_,_)",
" Checks whether the first argument contains a word starting with "
" the second argument. The optional third argument determines"
" whether the checking is case sensitive.",
" query containsPrefix('Secondo is great','Sec',FALSE)" );
/*
9.5.5 Operator instances
*/
// Operator instance of containsWord; uses the four value mappings of
// containsWordVM.
Operator containsWordOp(
"containsWord", // operator's name
containsWordSpec.getStr(), // specification
4, // number of Value Mappings
containsWordVM, // value mapping array
containsWordOrPrefixSelect, // selection function
containsWordOrPrefixTM // type mapping
);
// Operator instance of containsPrefix; shares selection function and type
// mapping with containsWord and uses the four value mappings of
// containsPrefixVM.
Operator containsPrefixOp(
"containsPrefix", // operator's name
containsPrefixSpec.getStr(), // specification
4, // number of Value Mappings
containsPrefixVM, // value mapping array
containsWordOrPrefixSelect, // selection function
containsWordOrPrefixTM // type mapping
);
} // end of namespace triealg
// The algebra class: registers the invfile type constructor and all
// operators of the triealg namespace. createInvFile and insertInvFile are
// flagged as memory using operators; with USE_PROGRESS defined, progress
// is enabled for both as well.
// The redundant second call of insertInvFile.SetUsesMemory() has been
// removed; the registration order of the operators is unchanged.
class TrieAlgebra : public Algebra {
  public:
    TrieAlgebra() : Algebra() {
      AddTypeConstructor( &triealg::invfiletc );

      AddOperator(&triealg::createInvFile);
      triealg::createInvFile.SetUsesMemory();
      AddOperator(&triealg::getFileInfo);
      AddOperator(&triealg::wordCount);
      AddOperator(&triealg::prefixCount);
      AddOperator(&triealg::searchWord);
      AddOperator(&triealg::searchPrefix);
      AddOperator(&triealg::getInvFileSeparators);
      AddOperator(&triealg::getSeparators);
      AddOperator(&triealg::containsWordOp);
      AddOperator(&triealg::containsPrefixOp);
      AddOperator(&triealg::insertInvFile);
      triealg::insertInvFile.SetUsesMemory();
#ifdef USE_PROGRESS
      triealg::createInvFile.EnableProgress();
      triealg::insertInvFile.EnableProgress();
#endif
    }
};
// Entry point of the algebra: stores the given nested list and query
// processor instances in the global pointers nl and qp and returns a
// newly created TrieAlgebra.
extern "C"
Algebra*
InitializeTrieAlgebra( NestedList* nlRef, QueryProcessor* qpRef ) {
  qp = qpRef;
  nl = nlRef;
  Algebra* algebra = new TrieAlgebra();
  return algebra;
}