Files
secondo/Algebras/Web/Web.cpp
2026-01-23 17:03:45 +08:00

5962 lines
139 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/*
----
This file is part of SECONDO.
Copyright (C) 2004, University in Hagen, Department of Computer Science,
Database Systems for New Applications.
SECONDO is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
SECONDO is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with SECONDO; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
----
[1] /Web Algebra
November 2006
1 Preliminaries
1.1 Includes
*/
// __POS__ expands to a stream expression naming the current file, the
// enclosing function signature and the line number.
#undef __POS__
#define __POS__ __FILE__ << ".." << __PRETTY_FUNCTION__ << "@" << __LINE__
// Enable the following define to make every __TRACE__ statement print
// its position to cout.
//#define TRACEON
#ifdef TRACEON
#define __TRACE__ cout << __POS__ << endl;
#else
// Tracing disabled: __TRACE__ expands to nothing.
#define __TRACE__
#endif
//#define _DEBUG_JPS //Enables Debug output used by Joerg Siegel
//#define _DEBUG_JPS_2 //Enables Debug output used by Joerg Siegel
//#define _DEBUG_JPS_3 //Enables Debug output used by Joerg Siegel
#include "Algebra.h"
#include "NestedList.h"
#include "QueryProcessor.h"
#include "StandardTypes.h"
#include "Algebras/FText/FTextAlgebra.h"
#include "Algebras/BinaryFile/BinaryFileAlgebra.h"
#include "Algebras/Relation-C++/RelationAlgebra.h"
#include "Attribute.h"
#include "DateTime.h"
#include "Tools/Flob/DbArray.h"
#include "Tools/Flob/Flob.h"
#include "web.h"
#include "SocketIO.h" //used for web access
#include "Base64.h" //to en-/ decode binary data
#include <stack>
#include <string>
#ifdef SECONDO_WIN32
#include "ClientServer/Win32Socket.h"
#else //Linux
#include "ClientServer/UnixSocket.h"
#endif
extern NestedList* nl;
extern QueryProcessor *qp;
using namespace datetime;
using namespace std;
/*
1.2 Dummy Functions
No dummy function needed.
*/
/*
2.0 needed definitions
*/
/*
2.1 Implementation of WebLex
*/
/*
Creates a lexer that scans the stream ~is~.

No start condition is pending after construction (~switchState~ is -1).
The search position used by ~setPos~ is reset to 0, because ~setPos~ reads
~pos~ before it ever writes it and no other code in this file initialises it.
NOTE(review): ~pos~ is declared in web.h; confirm it is a plain integral member.
*/
WebLex::WebLex(std::istream *is) : yyFlexLexer (is) {
  switchState = -1;
  myin = is;
  pos = 0;
}
int WebLex::nextToken(){
int symbol=0;
//__TRACE__
symbol=yylex(switchState);
//__TRACE__
switchState=-1;
tokenVal= YYText();
if (tokenVal.length() == 0)
return symbol;
if (tokenVal[0]=='"' && tokenVal[tokenVal.length()-1]=='"'){
if (tokenVal.length() > 2){
tokenVal.erase(0,1);
tokenVal.erase(tokenVal.length()-1);
}else{
tokenVal="";
}
}
return symbol;
}
void WebLex::switchStartCond(int ns){
switchState=ns;
}
/*
Returns a copy of the text of the most recently read token.
*/
string WebLex::getVal() {
  return this->tokenVal;
}
/*
Parameterless scanner entry point; it always returns 0. Scanning in this
file goes through the ~yylex~ overload that takes a start condition.
*/
int WebLex::yylex(){
  return 0;
}
/*
Advances the lexer to the next element start.

out: element - text of the first token that is not SEARCH\_ELEMENT\_START
return: the symbol of that token (0 at the end of the input)
*/
int WebLex::startElement (string& element){
  switchStartCond(FINDELEMSTART);
  int token;
  do {
    token = nextToken();
  } while (token == SEARCH_ELEMENT_START);
  element = getVal();
  return token;
}
/*
in: attribute
out: value
return: true if ~attribute~ was found in the input stream, false otherwise
Looks for ~attribute~ in the input stream of WebLex. On success, ~value~ contains the value of the attribute.
*/
/*
Scans the tokens of the current tag for the attribute ~attribute~
(compared case-insensitively).

in: attribute - name of the attribute to look for
out: value - the attribute value; empty if the attribute has no value token
return: true if the attribute was found before CLOSE\_TAG or the end of the
input, false otherwise or if the lexer reported ERROR

The ERROR check after the attribute name now inspects the token read for the
value. Previously it tested the old EIDENTIFIER symbol and could never fire.
*/
bool WebLex::findAttribute(string attribute, string& value){
  value = "";
  __TRACE__
  for (int symbol = nextToken();
       symbol && symbol != CLOSE_TAG;
       symbol = nextToken()){
    if (symbol == ERROR){
      cout << "findAttribute Es ist ein Fehler aufgetreten" << endl;
      return false;
    }
    // Only attribute identifiers matching the requested name are of interest.
    if (symbol != EIDENTIFIER || !isEqual(getVal(), attribute))
      continue;

    symbol = nextToken();
    if (symbol == ERROR){
      cout << "Fehler: " << getVal() << endl;
      return false;
    }
    // An attribute without a value token still counts as found.
    if (symbol == ATTVALUE)
      value = getVal();
    return true;
  }
  return false;
}
/*
in: attributes
out: value, attribute
return: true if one of the elements of ~attributes~ was found in the input stream, false otherwise
Looks for any of the given attributes in the input stream of WebLex. On success, ~attribute~ names the attribute that was found and ~value~ contains its value.
*/
/*
Scans the tokens of the current tag for the first attribute whose name
matches (case-insensitively) one of ~attributes~.

in: attributes - candidate attribute names
out: value - value of the attribute found; empty if it has no value token
out: attribute - the element of ~attributes~ that matched
return: true if a match was found before CLOSE\_TAG or the end of the input,
false otherwise or if the lexer reported ERROR

The ERROR check after the attribute name now inspects the token read for the
value. Previously it tested the old EIDENTIFIER symbol and could never fire.
*/
bool WebLex::findAttribute(vector<string>& attributes,
                           string& value, string& attribute){
  value = "";
  for (int symbol = nextToken();
       symbol && symbol != CLOSE_TAG;
       symbol = nextToken()){
    if (symbol == ERROR){
      cout << "findAttribute Es ist ein Fehler aufgetreten" << endl;
      return false;
    }
    if (symbol != EIDENTIFIER)
      continue;

    const string name = getVal();
    for (vector<string>::const_iterator it = attributes.begin();
         it != attributes.end(); ++it){
      if (!isEqual(*it, name))
        continue;
      attribute = *it;
      symbol = nextToken();
      if (symbol == ERROR){
        cout << "Fehler: " << getVal() << endl;
        return false;
      }
      // An attribute without a value token still counts as found.
      if (symbol == ATTVALUE)
        value = getVal();
      return true;
    }
  }
  return false;
}
/*
Find Position of ~value~ in ~content~ and return flobindex Object
*/
/*
Finds the next occurrence of ~value~ in ~content~, starting at the position
of the previous hit (~pos~), and returns it as a flobindex.

return: offset and length of the hit; offset 0 and length 0 if ~value~ does
not occur at or after ~pos~. On success ~pos~ is moved to the hit.

The search uses string::find instead of casting the strstr result to
unsigned long, which truncates pointers where unsigned long is narrower than
a pointer. A start position beyond the end of ~content~ is treated as "not
found" instead of reading out of bounds.
*/
flobindex WebLex::setPos(string value, const string& content){
  flobindex i;
  i.offset = 0;
  i.len = 0;

  const string::size_type start = static_cast<string::size_type>(pos);
  if (start > content.length())
    return i;

  const string::size_type found = content.find(value, start);
  if (found == string::npos)
    return i;

  pos = found;
  i.offset = pos;
  i.len = value.length();
  return i;
}
/*
read content of a html element
*/
int WebLex::readContent(){
int symbol=0;
string value="";
// __TRACE__
symbol= nextToken();
//cout << "******** readcontent *********" << endl;
while (symbol == CONTENT){
//cout << getVal() ;
value += getVal();
symbol= nextToken();
}
//cout << "readcontent: " << endl;
if (symbol){
value += getVal();
tokenVal= value;
return CONTENT;
}
tokenVal= value;
return symbol;
}
int WebLex::readContentTmp(){
int symbol=0;
string v="";
__TRACE__
//cout << "**********TMP **************" << endl;
symbol= nextToken();
//cout << getVal() << " " << symbol << endl;
v += getVal();
while (symbol == CONTENT){
symbol= nextToken();
v += getVal();
//cout << ":" << getVal() << " " << symbol << " " << v << endl;
}
//cout << "---" << v << endl;
return 0;
}
/*
2.2 Helping Functions
*/
/*
Case-insensitive comparison of two strings.

return: true if ~s1~ and ~s2~ have the same length and are equal character by
character after conversion to lower case.

Each character is converted to unsigned char before it is passed to tolower;
calling tolower with a negative char value (e.g. for non-ASCII bytes) is
undefined behaviour.
*/
bool isEqual (std::string s1, std::string s2){
  if (s1.length() != s2.length())
    return false;
  for (std::string::size_type k = 0; k < s1.length(); ++k){
    const int a = ::tolower(static_cast<unsigned char>(s1[k]));
    const int b = ::tolower(static_cast<unsigned char>(s2[k]));
    if (a != b)
      return false;
  }
  return true;
}
//Taken from http://www.codeproject.com/string/stringsplit.asp
/*
Splits ~input~ at every occurrence of ~delimiter~ and appends the pieces to
~results~.

in: input - the string to split
in: delimiter - separator string; occurrences are found left to right
without overlap
out: results - receives the pieces (existing elements are kept)
in: includeEmpties - if false, empty pieces are not appended
return: the number of delimiters found. If ~input~ or ~delimiter~ is empty,
or ~delimiter~ does not occur, 0 is returned and ~results~ is left untouched.

The previous implementation read positions[-1] on its first loop iteration;
this version walks the delimiter positions with a running start offset.
*/
int SplitString(const std::string& input,
                const std::string& delimiter, std::vector<std::string>& results,
                bool includeEmpties)
{
  if (input.empty() || delimiter.empty())
    return 0;

  const std::string::size_type delimLen = delimiter.size();

  // Collect the start positions of all non-overlapping delimiters.
  std::vector<std::string::size_type> positions;
  for (std::string::size_type hit = input.find(delimiter);
       hit != std::string::npos;
       hit = input.find(delimiter, hit + delimLen)){
    positions.push_back(hit);
  }
  if (positions.empty())
    return 0;

  // Emit the piece in front of every delimiter ...
  std::string::size_type start = 0;
  for (std::vector<std::string::size_type>::const_iterator it =
         positions.begin(); it != positions.end(); ++it){
    const std::string piece = input.substr(start, *it - start);
    if (includeEmpties || !piece.empty())
      results.push_back(piece);
    start = *it + delimLen;
  }

  // ... and the remainder behind the last one (possibly empty).
  const std::string tail =
      (start < input.size()) ? input.substr(start) : std::string();
  if (includeEmpties || !tail.empty())
    results.push_back(tail);

  return static_cast<int>(positions.size());
}
/*
Returns true if ~c~ is a blank, a newline or a tab.
*/
bool isWhite(char c){
  switch (c){
    case ' ':
    case '\n':
    case '\t':
      return true;
    default:
      return false;
  }
}
/*
3 l Definitions of ~URL, HTML, Page~
3.1 Class ~URL~
----
Example to create an object:
let url1 = [const url value ("http" <text>//www.google.de</text--->
<text>/</text--->)]
----
*/
// Attribute type "url": a URL split into protocol, host and path.
// The protocol is kept in a fixed-size STRING_T, host and path in two Flobs.
class URL : public IndexableAttribute
{
public:
// The default constructor does not initialise any member.
URL();
~URL();
// Parses a complete URL string; the object is undefined if parsing fails.
URL(const string&);
URL(const URL&);
// Builds a URL from its three parts; undefined if the combination is invalid.
URL(const string &prot, const string &h, const string &pp);
// Case-insensitive comparison of protocol, host and path.
bool operator== (const URL& url) const;
// The setters do nothing and the getters return "" while the object is undefined.
void setProtocol(string);
string getProtocol() const;
void setPath(string);
string getPath() const;
void setHost(string);
string getHost() const;
URL* Clone() const;
friend ostream& operator<<(ostream& s, URL u);
// Nested list (protocol host path), optionally preceded by the type symbol.
ListExpr ToListExpr(bool typeincluded)const;
/* Returns whether this object is defined or not. */
bool IsDefined() const;
/* Sets this object as defined or undefined. */
void SetDefined( bool Defined);
size_t Sizeof() const;
int Compare(const Attribute*) const;
bool Adjacent(const Attribute*) const;
//void operator=(const URL&);
void Set( bool d, URL& u);
// Destroys the host and path Flobs.
void destroy(void);
// Parses ~url~ into ~myurl~; returns false and marks ~myurl~ undefined on failure.
static bool urlFromString(const string& url,URL& myurl);
inline virtual int NumOfFLOBs() const {__TRACE__ return 2;}
// Flob 0 is the host, Flob 1 the path.
Flob *GetFLOB(const int);
// Character representation: see WriteTo, ReadFrom and SizeOfChars below.
void WriteTo (char*)const;
void ReadFrom(const char*);
SmiSize SizeOfChars(void) const;
size_t HashValue(void) const;
void CopyFrom(const Attribute *arg);
static const string BasicType() { return "url"; }
static const bool checkType(const ListExpr type){
return listutils::isSymbol(type, BasicType());
}
private:
STRING_T protocol;
Flob host;
Flob path;
bool defined;
// Validate a URL string; the four-argument form also returns its parts.
static bool isValidURL(const string&);
static bool isValidURL(const string&, string&, string&, string&);
};
/*
3.1.1 Implementation of Class-Operations of ~URL~
*/
// Default constructor: leaves every member uninitialised.
// NOTE(review): presumably intentional, as is usual for SECONDO attribute
// types that are re-created on top of stored bytes - confirm before changing.
URL::URL()
{
__TRACE__
}
// Destructor: nothing to release explicitly here.
URL::~URL()
{
// __TRACE__
}
/*
Creates a URL from the complete URL string ~u~.
If ~u~ is not a valid URL the object is undefined and none of its parts are
set.
*/
URL::URL(const string& u)
  : IndexableAttribute(true), host(0), path(0)
{
  string scheme;
  string authority;
  string resource;
  defined = isValidURL(u, scheme, authority, resource);
  if (!defined){
    __TRACE__
    return;
  }
  setProtocol(scheme);
  setHost(authority);
  setPath(resource);
}
/*
Creates a URL from protocol ~prot~, host ~h~ and path ~p~.
The object is undefined if the protocol is longer than MAX\_STRINGSIZE or if
prot + "://" + h + p is not a valid URL.
*/
URL::URL(const string &prot, const string &h, const string &p)
  : IndexableAttribute(true), host(h.length()+1), path(p.length()+2)
{
  __TRACE__
  if (prot.length() > MAX_STRINGSIZE){
    defined = false;
    return;
  }
  __TRACE__
  const string assembled = prot + "://" + h + p;
  if (!isValidURL(assembled)){
    defined = false;
    return;
  }
  __TRACE__
  defined = true;
  setProtocol(prot);
  setHost(h);
  setPath(p);
}
/*
Copy constructor. An undefined source yields an undefined copy; otherwise
protocol, host and path are copied through the setters.
*/
URL::URL(const URL& u)
  : IndexableAttribute(u.IsDefined()), host(u.getHost().length()+1),
    path(u.getPath().length()+1)
{
  defined = u.IsDefined();
  if (defined){
    setProtocol(u.getProtocol());
    setHost(u.getHost());
    setPath(u.getPath());
  }
}
/*
Returns a new heap-allocated URL built from this object's protocol, host and
path. The caller owns the result.
*/
URL* URL::Clone() const
{
  __TRACE__
  return new URL(getProtocol(), getHost(), getPath());
}
/*
Returns the protocol, or "" if the URL is undefined.
*/
string URL::getProtocol() const
{
  return defined ? string(protocol) : string("");
}
/*
Stores ~p~ as protocol. Nothing happens if the URL is undefined or if ~p~ is
longer than MAX\_STRINGSIZE.
*/
void URL::setProtocol(string p)
{
  if (defined && p.length() <= MAX_STRINGSIZE)
    strcpy(protocol, p.c_str());
}
/*
Returns the host stored in the ~host~ Flob, or "" if the URL is undefined or
the Flob is empty.

The Flob is read into a heap buffer instead of a variable-length stack array,
and the result is limited to the bytes actually read, so an empty or
unterminated Flob no longer causes an out-of-bounds read.
*/
string URL::getHost() const
{
  if (!defined)
    return "";
  const size_t size = host.getSize();
  if (size == 0)
    return "";
  vector<char> buffer(size);
  host.read(&buffer[0], size);
  // Cut at the stored terminator, but never look beyond the buffer.
  const void* nul = memchr(&buffer[0], '\0', size);
  const size_t len = nul
      ? static_cast<size_t>(static_cast<const char*>(nul) - &buffer[0])
      : size;
  return string(&buffer[0], len);
}
/*
Stores ~h~ including its terminating zero in the ~host~ Flob.
Nothing happens if the URL is undefined.
*/
void URL::setHost(string h)
{
  if (defined){
    const size_t bytes = h.length() + 1;
    host.resize(bytes);
    host.write(h.c_str(), bytes);
  }
}
/*
Returns the path stored in the ~path~ Flob, or "" if the URL is undefined or
the Flob is empty.

The Flob is read into a heap buffer instead of a variable-length stack array,
and the result is limited to the bytes actually read, so an empty or
unterminated Flob no longer causes an out-of-bounds read.
*/
string URL::getPath() const
{
  if (!defined)
    return "";
  const size_t size = path.getSize();
  if (size == 0)
    return "";
  vector<char> buffer(size);
  path.read(&buffer[0], size);
  // Cut at the stored terminator, but never look beyond the buffer.
  const void* nul = memchr(&buffer[0], '\0', size);
  const size_t len = nul
      ? static_cast<size_t>(static_cast<const char*>(nul) - &buffer[0])
      : size;
  return string(&buffer[0], len);
}
/*
Stores ~p~ including its terminating zero in the ~path~ Flob. An empty path
becomes "/", and a path that does not start with '/' gets one prepended.
Nothing happens if the URL is undefined.
*/
void URL::setPath(string p)
{
  if (!defined)
    return;
  if (p.empty() || p[0] != '/')
    p = "/" + p;
  const size_t bytes = p.length() + 1;
  path.resize(bytes);
  path.write(p.c_str(), bytes);
}
/*
Writes a human-readable form of ~u~ to ~s~: protocol, host and path on
separate lines, or a fixed message for an undefined URL.
*/
ostream& operator<<(ostream& s, URL u)
{
  if (u.IsDefined()){
    s << "URL: [Protocol: " << u.getProtocol() << endl;
    s << "Host: " << u.getHost() << endl;
    s << "Path: " << u.getPath() << "]" << endl;
  } else {
    s << "Value is Undefined";
  }
  return s;
}
/*
Returns the nested list (protocol host path); protocol is a string atom,
host and path are text atoms. If ~typeincluded~ is true the value is wrapped
as (url value). For an undefined URL all three atoms are empty, which is
what the getters return in that case.
*/
ListExpr URL::ToListExpr(bool typeincluded)const {
  __TRACE__
  const ListExpr value = nl->ThreeElemList(
      nl->StringAtom(getProtocol()),
      nl->TextAtom(getHost()),
      nl->TextAtom(getPath()));
  if (!typeincluded)
    return value;
  return nl->TwoElemList(nl->SymbolAtom(URL::BasicType()), value);
}
bool URL::IsDefined() const {
// __TRACE__
return defined;
}
void URL::SetDefined( bool def) {
// __TRACE__
defined = def;
}
/*
Returns the size of the URL object itself in bytes.
*/
size_t URL::Sizeof() const
{
  __TRACE__
  return sizeof(URL);
}
int URL::Compare(const Attribute*) const
{
__TRACE__
return 0;
}
/*
URLs are never considered adjacent; always returns false.
*/
bool URL::Adjacent(const Attribute*) const
{
  __TRACE__
  return false;
}
void URL::Set( bool d, URL& u)
{
__TRACE__
defined = d;
if (!d || !u.IsDefined())
return;
string s = u.getProtocol();
string h = u.getHost();
string p = u.getPath();
__TRACE__
strcpy(protocol, s.c_str());
host.resize( h.length() + 1 );
host.write(h.c_str(), h.length() + 1 );
path.resize( p.length() + 1 );
path.write( p.c_str(), p.length() + 1 );
}
void URL::destroy(){
__TRACE__
host.destroy();
path.destroy();
}
bool URL::urlFromString (const string& url,URL& myurl){
string host;
string protocol;
string path;
// __TRACE__
if (!isValidURL(url, protocol, host, path)){
myurl.SetDefined(false);
return false;
}
myurl.SetDefined(true);
myurl.setPath(path);
myurl.setProtocol (protocol);
myurl.setHost(host);
return true;
}
bool URL::isValidURL(const string& url, string& protocol,
string& host, string& path){
stringstream is (url);
WebLex lexer(&is);
// __TRACE__
lexer.switchStartCond(MSCHEME);
//cout << url << endl;
if (lexer.nextToken() != SCHEME){
// __TRACE__
return false;
}
protocol= lexer.getVal();
protocol= protocol.erase(protocol.length()-1);
// __TRACE__
//cout << protocol << endl;
if (lexer.nextToken() != AUTHORITY){
// __TRACE__
return false;
}
host= lexer.getVal();
host=host.erase(0,2);
// __TRACE__
//cout << host << endl;
if (lexer.nextToken() == PATH){
path= lexer.getVal();
}else{
path="";
}
//__TRACE__
//cout << lexer.getVal() << endl;
return true;
}
bool URL::isValidURL(const string& url){
string x,y,z;
__TRACE__
return isValidURL(url, x,y,z);
}
/*
Returns the Flob with index ~i~: 0 is the host, 1 the path. Any other index
yields NULL.
*/
Flob *URL::GetFLOB(const int i){
  switch (i){
    case 0:
      return &host;
    case 1:
      return &path;
    default:
      return NULL;
  }
}
void URL::WriteTo ( char* dest ) const {
__TRACE__
string url= getProtocol() + getHost() + getPath();
strcpy (dest, url.c_str());
}
/*
Returns the number of characters reserved for the output of WriteTo:
length of the protocol plus the sizes of the host and path Flobs (both
include their terminating zero).

For an undefined URL 1 is returned, which is the single terminating zero
WriteTo writes in that case. Previously strlen was applied to the protocol
buffer, which the constructors leave unset for undefined URLs.
*/
SmiSize URL::SizeOfChars()const {
  __TRACE__
  if (!defined)
    return 1;
  return (strlen (protocol) + host.getSize() + path.getSize());
}
void URL::ReadFrom ( const char *src){
__TRACE__
int erg;
string url (src);
stringstream is (url);
WebLex lexer (&is);
lexer.switchStartCond(MURI);
string protocol;
string host;
string path;
erg= lexer.nextToken();
if (erg==ERROR)
return;
protocol= lexer.getVal();
erg= lexer.nextToken();
if (erg==ERROR)
return;
host= lexer.getVal();
erg= lexer.nextToken();
if (erg==ERROR)
return;
path= lexer.getVal();
setProtocol ( protocol);
setHost (host);
setPath (path);
}
size_t URL::HashValue(void) const{
__TRACE__
return SizeOfChars();
}
void URL::CopyFrom(const Attribute *arg){
__TRACE__
URL *url = (URL*) arg;
setProtocol ( url->getProtocol());
setHost ( url->getHost());
setPath ( url->getPath());
}
/*
Two URLs are equal if protocol, host and path match, ignoring case.
*/
bool URL::operator== (const URL& url) const{
  if (!isEqual(url.getProtocol(), getProtocol()))
    return false;
  if (!isEqual(url.getHost(), getHost()))
    return false;
  return isEqual(url.getPath(), getPath());
}
/*
3.2 Class ~HTML~
----
Example to create an object:
let html1 = [const html value ((instant (10 10 2006 10 27 18)) <file>/home/sopra/secondo/Algebras/Web/bilder.htm</file---> (url ("http" <text>www.mybilder.de</text---> <text>/</text---> )))]
----
*/
// Attribute type "html": the source text of a html page together with its
// modification time, the URL it came from, and index arrays (offset/length
// pairs into the source text) for the links and meta information found in it.
class HTML : public Attribute
{
public:
HTML(){}
~HTML(){}
// Builds the object from the page source only.
HTML(const string& s);
// Builds the object from modification time, page source and source URL.
HTML(const DateTime &d, const string &s, const URL &u);
HTML(const HTML&);
// Equal if content, source URL and modification time are equal.
bool operator== (const HTML& h) const;
URL getSource() const;
// The complete source text of the page.
string getContent() const;
// The text of the page without tags.
string getText() const;
int getNumberOfUrls() const;
URL getUrl(const int i) ;
int getNumberOfEmbUrls() const;
URL getEmbUrl (const int i);
// Returns URL number i; ~contains~ tells whether its host is one of ~hosts~.
URL getUrlHosts(int i, string hosts, bool& contains);
bool containsURL( const URL*);
datetime::DateTime getLastModified() const;
// Content of the meta information with key ~name~, or "" if there is none.
string getMetaInfo(string name);
int getNumberOfMetainfos() const;
// Key of meta information number ~ii~; its content is returned in ~pContent~.
string getMetainfo( int ii, string& pContent) const;
int getNumberOf(string);
double similar(HTML*, int, bool);
HTML* Clone() const;
ListExpr ToListExpr(bool typeincluded)const;
bool IsDefined() const;
void SetDefined(bool d) ;
void Set(const HTML &h);
Flob* GetFLOB(const int i);
int NumOfFLOBs() const;
size_t Sizeof() const;
int Compare(const Attribute*) const;
bool Adjacent (const Attribute*)const;
const DbArray<FlobIndex>* getURLS()const;
const DbArray<FlobIndex>* getMetainfoKeys()const;
const DbArray<FlobIndex>* getMetainfoContents()const;
const DbArray<FlobIndex>* getEmbededURLS() const;
bool IsValid() const;
void CopyFrom(const Attribute *arg);
size_t HashValue(void) const;
static const string BasicType() { return "html"; }
static const bool checkType(const ListExpr type){
return listutils::isSymbol(type, BasicType());
}
private:
DateTime lastChange;
// The page source, stored with a terminating zero.
Flob source;
// Positions (offset/length in ~source~) of the URLs found in the page.
DbArray<flobindex> urls;
DbArray<flobindex> emburls;
// Positions of meta keys and meta contents; entries with equal index belong together.
DbArray<flobindex> metainfoKeys;
DbArray<flobindex> metainfoContents;
URL sourceURL;
bool defined;
int tiefe;
URL findNextURI(WebLex& lexer, flobindex& i, const string&, URL& url);
void analyseStructure(WebLex& lexer, int maxdepth, int& depth,
AnalyseList& al, int& error, int& symbol);
bool checkURI(string value,URL& url);
void getMetaInfos(const string&);
void filterEmbUrls(URL& u, flobindex& f);
void getUrls(const string&);
bool valid;
};
/*
3.2.1 Implementation of Class-Operations of ~HTML~
*/
/*
Creates an HTML object from the page source ~s~ alone.
The source is stored with its terminating zero, then the meta information
and the URLs contained in ~s~ are indexed. The source URL is constructed
from the string "http://".
*/
HTML::HTML(const string& s)
  : lastChange(instanttype), source(s.length()+1),
    urls(0), emburls(0), metainfoKeys(0), metainfoContents(0),
    sourceURL("http://"), defined(true),
    tiefe(0), valid(true)
{
  __TRACE__
  // ~defined~ and ~valid~ are already true from the initialiser list.
  const size_t bytes = s.length() + 1;
  source.resize(bytes);
  source.write(s.c_str(), bytes);
  getMetaInfos(s);
  getUrls(s);
  __TRACE__
}
/*
Creates an HTML object from modification time ~d~, page source ~s~ and
source URL ~u~. The source is stored with its terminating zero, then the
meta information and the URLs contained in ~s~ are indexed.
*/
HTML::HTML(const DateTime &d, const string &s, const URL &u)
  : lastChange(d),
    source(s.length()+1), urls(0), emburls(0), metainfoKeys(0),
    metainfoContents(0), sourceURL(u), defined(true),
    tiefe(0), valid(true)
{
  __TRACE__
  // ~valid~ is already true from the initialiser list.
  const size_t bytes = s.length() + 1;
  source.resize(bytes);
  source.write(s.c_str(), bytes);
  getMetaInfos(s);
  getUrls(s);
  __TRACE__
}
HTML::HTML(const HTML& h)
:lastChange(h.getLastModified()),
source(0), urls(0), emburls(0),metainfoKeys(0),
metainfoContents(0), sourceURL(h.getSource()),
defined(h.IsDefined()),tiefe(0),valid(h.IsValid())
{
__TRACE__
//cout << "V3" << endl;
FlobIndex tmp;
const DbArray<FlobIndex> *tmpArray=0;
int i=0;
//__TRACE__
string c = h.getContent();
source.resize (c.length() +1 );
source.write(c.c_str(), c.length()+1);
// __TRACE__
tmpArray=h.getURLS();
for (i=0; i < tmpArray->Size();i++){
tmpArray->Get(i,tmp);
urls.Put(i, tmp);
}
//__TRACE__
tmpArray=h.getMetainfoKeys();
for (i=0; i < tmpArray->Size();i++){
tmpArray->Get(i,tmp);
metainfoKeys.Put( i, tmp);
}
//__TRACE__
tmpArray=h.getMetainfoContents();
for (i=0; i < tmpArray->Size();i++){
tmpArray->Get(i,tmp);
metainfoContents.Put( i, tmp);
}
// __TRACE__
/*
tmpArray=h.getEmbededURLS();
for (i=0; i < tmpArray->Size();i++){
tmpArray->Get(i,tmp);
emburls.Put( i, *tmp);
}
*/
// __TRACE__
}
/*
Returns a heap-allocated copy of this object. The caller owns the result.
*/
HTML* HTML::Clone() const
{
  __TRACE__
  HTML* copy = new HTML(*this);
  return copy;
}
/*
Two HTML objects are equal if their content, their source URL and their
modification time are equal.
*/
bool HTML::operator== (const HTML& h) const
{
  __TRACE__
  if (!(h.getContent() == this->getContent()))
    return false;
  if (!(h.getSource() == this->getSource()))
    return false;
  return h.getLastModified() == this->getLastModified();
}
/*
Returns the stored modification time of the page.
*/
datetime::DateTime HTML::getLastModified() const
{
  __TRACE__
  return this->lastChange;
}
/*
returns the source - code of the html object
*/
/*
Returns the source code of the html object, or "" if the object is
undefined or no source is stored.

The Flob is read into a heap buffer instead of a variable-length stack
array (a page can be far larger than the stack), and the result is limited
to the bytes actually read.
*/
string HTML::getContent() const
{
  __TRACE__
  if (!defined)
    return "";
  const size_t size = source.getSize();
  if (size == 0)
    return "";
  vector<char> buffer(size);
  source.read(&buffer[0], size);
  // Cut at the stored terminator, but never look beyond the buffer.
  const void* nul = memchr(&buffer[0], '\0', size);
  const size_t len = nul
      ? static_cast<size_t>(static_cast<const char*>(nul) - &buffer[0])
      : size;
  return string(&buffer[0], len);
}
/*
returns the content of the html - elements
*/
/*
Returns the text of the page without tags: the concatenation of all CONTENT
tokens, except those directly following a script or style element.

return: the text; "" if the object is not valid or the lexer reports ERROR

The page source is now read once. Previously getContent() was called twice
and one of the copies was never used.
*/
string HTML::getText() const
{
  __TRACE__
  if (!valid)
    return "";
  WebLex lexer(0);
  stringstream is (getContent());
  lexer.yyrestart(&is);
  lexer.switchStartCond (RELEM_WA);

  string out = "";
  int symbol = lexer.nextToken();
  while (symbol){
    if (symbol == ERROR){
      cout << "Fehler" << endl;
      return "";
    }
    if (symbol == CONTENT){
      out += lexer.getVal();
    } else if (symbol == ELEMENT &&
               (isEqual(lexer.getVal(), "script") ||
                isEqual(lexer.getVal(), "style"))){
      // Skip the body of script and style elements.
      symbol = lexer.nextToken();
      while (symbol == CONTENT)
        symbol = lexer.nextToken();
    }
    symbol = lexer.nextToken();
  }
  return out;
}
/*
Returns a copy of the URL this page was loaded from.
*/
URL HTML::getSource() const
{
  return this->sourceURL;
}
/*
Returns the nested list (lastModified content sourceURL). The content is the
Base64-encoded page source as a text atom. If ~typeincluded~ is true the
value is wrapped as (html value). An undefined object is represented by the
list of an HTML object built from the empty string.
*/
ListExpr HTML::ToListExpr(bool typeincluded)const {
  __TRACE__
  if (!defined)
    return HTML("").ToListExpr(typeincluded);
  __TRACE__
  const string raw = getContent();
  string encoded;
  Base64 codec;
  codec.encode( raw.c_str(), raw.size(), encoded );

  const ListExpr value = nl->ThreeElemList(
      getLastModified().ToListExpr(true),
      nl->TextAtom(encoded),
      sourceURL.ToListExpr(true));
  if (!typeincluded)
    return value;
  return nl->TwoElemList(nl->SymbolAtom(HTML::BasicType()), value);
}
/*
Returns whether this object holds a value.
*/
bool HTML::IsDefined() const {
  __TRACE__
  return this->defined;
}
void HTML::getUrls(const string& content){
string href;
WebLex lexer(0);
stringstream ss (content);
lexer.yyrestart(&ss);
flobindex i;
URL url("");
__TRACE__
findNextURI (lexer, i, content,url);
while (url.IsDefined()){
__TRACE__
//cout << "getUrls" << url.getPath() << endl;
urls.Append (i);
// filterEmbUrls(url,i); //has errors AB 11.2.07
//url=findNextURI(lexer, i, content);
findNextURI (lexer, i, content, url);
}
__TRACE__
}
/*
Checks whether the URL ~u~ is an embedded URL.
If so, the flobindex is appended to ~emburls~.
*/
/*
Appends ~i~ to ~emburls~ if the path of ~u~ ends in one of the image
extensions jpg, jpeg, gif, bmp, png or tif (compared case-sensitively).
A path whose last '.' is its first character is ignored.
*/
void HTML::filterEmbUrls (URL& u, flobindex& i){
  __TRACE__
  const string path = u.getPath();
  const int dot = path.rfind(".");
  if (dot <= 0)
    return;

  const string extension = path.substr(dot + 1);
  static const char* const imageExtensions[] =
      { "jpg", "jpeg", "gif", "bmp", "png", "tif" };
  const int count = sizeof(imageExtensions) / sizeof(imageExtensions[0]);
  for (int k = 0; k < count; ++k){
    if (extension == imageExtensions[k]){
      emburls.Append(i);
      return;
    }
  }
}
int HTML::getNumberOfUrls() const
{
__TRACE__
return urls.Size();
//cout << urls.Size() << endl;
}
/*
Returns URL number ~i~ of this page.

The text of the link is cut out of the page source using the stored
offset/length pair and converted to a URL by checkURI.

return: the URL; an undefined URL if ~i~ is out of range, if the stored
position does not lie inside the source, or if checkURI rejects the text.

The source is read into a heap buffer instead of a variable-length stack
array, and index, offset and length are range-checked before use.
*/
URL HTML::getUrl( int i)
{
  __TRACE__
  if (i < 0 || i >= urls.Size())
    return URL("");

  flobindex ind;
  urls.Get(i, ind);

  const size_t size = source.getSize();
  const size_t offset = static_cast<size_t>(ind.offset);
  const size_t len = static_cast<size_t>(ind.len);
  if (size == 0 || offset > size || len > size - offset)
    return URL("");

  vector<char> buffer(size);
  source.read(&buffer[0], size);
  const string content(&buffer[0] + offset, len);

  URL url("");
  if (checkURI( content, url))
    return URL(url);
  return URL("");
}
int HTML::getNumberOfEmbUrls() const{
__TRACE__
return emburls.Size();
}
/*
Returns embedded URL number ~i~ of this page.

The text of the link is cut out of the page source using the stored
offset/length pair and converted to a URL by checkURI.

return: the URL; an undefined URL if ~i~ is out of range, if the stored
position does not lie inside the source, or if checkURI rejects the text.

The source is read into a heap buffer instead of a variable-length stack
array, and index, offset and length are range-checked before use.
*/
URL HTML::getEmbUrl( int i)
{
  __TRACE__
  if (i < 0 || i >= emburls.Size())
    return URL("");

  flobindex ind;
  emburls.Get(i, ind);

  const size_t size = source.getSize();
  const size_t offset = static_cast<size_t>(ind.offset);
  const size_t len = static_cast<size_t>(ind.len);
  if (size == 0 || offset > size || len > size - offset)
    return URL("");

  vector<char> buffer(size);
  source.read(&buffer[0], size);
  const string content(&buffer[0] + offset, len);

  URL url("");
  if (checkURI( content, url))
    return URL(url);
  return URL("");
}
/*
Checks whether the host of getUrl(i) is equal to
one of the hosts in the parameter ~hosts~.
*/
/*
Returns URL number ~i~ and reports whether its host is one of ~hosts~.

in: i - index of the URL
in: hosts - comma-separated list of allowed hosts; an empty string means
"no restriction"
out: contains - true if ~hosts~ is empty or the host of the URL equals
(ignoring case) one of the listed hosts
return: getUrl(i)

The emptiness test now runs before the trailing comma is appended.
Previously the comma was appended first, so the test could never succeed
and an empty host list matched nothing.
*/
URL HTML::getUrlHosts(int i, string hosts, bool& contains){
  URL url = getUrl (i);
  if (hosts.empty()){
    contains = true;
    return url;
  }
  contains = false;
  if (!url.IsDefined() || !valid)
    return url;

  // SplitString only splits if at least one delimiter is present.
  hosts += ",";
  vector<string> vhosts;
  SplitString( hosts, ",", vhosts, false);

  const string host = url.getHost();
  for (vector<string>::const_iterator it = vhosts.begin();
       it != vhosts.end(); ++it){
    if (isEqual(host, *it)){
      contains = true;
      break;
    }
  }
  return url;
}
bool HTML::containsURL(const URL *url){
string href;
int i=0;
__TRACE__
while (i < getNumberOfUrls()){
if (*url == getUrl (i))
return true;
i++;
}
return false;
}
/*
Checks whether ~value~ is a valid URL. Returns true if so, false otherwise.
*/
// Converts the link text ~value~ into the URL object ~url~.
// If ~value~ is a complete URL it is taken as is. Otherwise it is treated as
// a path relative to the source URL of this page and combined with the
// protocol and host of the source URL.
// Returns false (with ~url~ undefined) only if a "../" step cannot be
// resolved because the directory path built so far contains no '/'.
bool HTML::checkURI(string value,URL& url){
WebLex lexer(0);
stringstream ss;
// __TRACE__
//cout << "Prfe URL " << value << endl;
url.SetDefined(false);
// Is this already a complete URL?
if (URL::urlFromString(value,url))
return true;
__TRACE__
// Run the lexer once over ~value~ in the MURI start condition.
// NOTE(review): the token read here is not used afterwards - confirm
// whether this call is still needed.
lexer.switchStartCond (MURI);
ss << value;
lexer.yyrestart(&ss);
lexer.nextToken();
// We have the path given by ~value~ and the source URL of the page.
// Now we try to build a valid URL with protocol, host and path.
__TRACE__
string path= getSource().getPath();
string urlpath=value;
string myurl="";
string mypath="";
int pos=0;
// Determine the directory (mypath) the link is relative to.
pos = urlpath.find("./");
if (pos == 0){
// "./x": relative to the directory of the source URL
pos= path.rfind("/");
mypath=path.substr(0,pos );
urlpath= urlpath.substr (2);
} else{
pos= urlpath.find("/");
if (pos == 0){
// "/x": relative to the root directory of the web server
mypath="";
} else {
// "x": relative to the directory of the source URL
pos= path.rfind("/");
mypath= path.substr(0,pos );
}
}
// Drop one leading '/' of the link.
if (urlpath.find("/") == 0){
urlpath= urlpath.substr(1);
}
myurl=urlpath;
//cout << myurl << " --- " << mypath << endl;
// Move the link segment by segment into mypath; a leading "../" removes the
// last directory of mypath instead.
while (true){
pos = myurl.find("../");
//parent directory
if (pos == 0){
myurl= myurl.substr(3);
pos= mypath.rfind("/");
//cout << "1:" << mypath << endl;
if (pos < 0){
// no directory left to go up from
//cout << "error parsing url" << endl;
return false;
} else {
mypath= mypath.substr(0, pos );
//cout << "2. " << mypath << endl;
}
}else {
pos= myurl.find("/");
// no further '/': myurl is the last segment
if (pos == -1)
break;
mypath= mypath + "/" + myurl.substr(0,pos);
myurl= myurl.substr(pos +1);
}
}
//__TRACE__
// Assemble the result from the source URL and the resolved path.
url.SetDefined(true);
url.setProtocol (getSource().getProtocol());
url.setHost(getSource().getHost());
//cout << "checkuri " << mypath << " " << myurl;
url.setPath(mypath + "/" + myurl);
//cout << "Neue URL :" << url.getPath() << endl;
//__TRACE__
return true;
}
/*
in: lexer
in: content (content of the HTML object)
out: i (FlobIndex for the URL)
out: url (the URL object that was found)
Finds the next URI in the stream of ~lexer~.
*/
// Searches the token stream of ~lexer~ for the next link.
// For every element start it looks at the "src" attribute of img elements,
// then at an "href" attribute of any element, and finally at the "src"
// attribute of script elements (whose following CONTENT tokens are skipped).
// On success ~url~ holds the link, ~i~ its position in ~content~, and ~url~
// is also returned. At the end of the input ~url~ is returned undefined.
URL HTML::findNextURI(WebLex& lexer, flobindex& i,
const string& content, URL& url ){
string element, value;
int symbol=0;
//URL url("");
// __TRACE__
url.SetDefined(false);
//vector<string> attributes;
//attributes.push_back("src");
//attributes.push_back("href");
symbol= lexer.startElement(element);
while (symbol){
__TRACE__
// <img src=...>
if (isEqual(element, "img")){
if (lexer.findAttribute("src",value)){
if (checkURI(value,url)){
i= lexer.setPos(value,content);
return url;
}
}
}
/*if (!isEqual(element,"script")){
// __TRACE__
if (lexer.findAttribute(attributes,value)){
__TRACE__
if(checkURI(value,url)){
__TRACE__
i=lexer.setPos(value, content);
//cout << "StartKopie" << url.getPath() << endl;
return url;
}
}
}*/
// __TRACE__
// href attribute of the current element
if (lexer.findAttribute("href",value)){
//__TRACE__
if(checkURI(value,url)){
//__TRACE__
i=lexer.setPos(value, content);
return url;
}
}
if (isEqual(element,"script")){
__TRACE__
//cout << element << endl;
// <script src=...>: remember the link but skip the script body first
if (lexer.findAttribute("src",value)){
if (checkURI(value,url)){
i= lexer.setPos(value,content);
}
}
// __TRACE__
// skip the CONTENT tokens of the script; the first other token becomes
// the current element
symbol= lexer.nextToken();
element=lexer.getVal();
while(symbol == CONTENT){
symbol= lexer.nextToken();
element= lexer.getVal();
}
//cout << "------------" << lexer.getVal() << symbol << endl;
if (url.IsDefined())
return url;
}else{
__TRACE__
symbol=lexer.startElement(element);
}
}
return url;
}
int HTML::getNumberOfMetainfos() const
{
__TRACE__
//cout << metainfoKeys.Size() << endl;
return metainfoKeys.Size();
}
/*
Returns the key of meta information number ~i~ and stores its content in
~pContent~.

return: the key; "" (with ~pContent~ unchanged) if ~i~ is out of range or a
stored position does not lie inside the source.

The source is read into a heap buffer instead of a variable-length stack
array, only when it is needed, and index, offsets and lengths are
range-checked before use.
*/
string HTML::getMetainfo( int i, string& pContent) const
{
  __TRACE__
  if (i < 0 || i >= metainfoKeys.Size() || i >= metainfoContents.Size())
    return "";

  flobindex contentIdx;
  flobindex keyIdx;
  metainfoContents.Get(i, contentIdx);
  metainfoKeys.Get(i, keyIdx);

  const size_t size = source.getSize();
  const size_t cOff = static_cast<size_t>(contentIdx.offset);
  const size_t cLen = static_cast<size_t>(contentIdx.len);
  const size_t kOff = static_cast<size_t>(keyIdx.offset);
  const size_t kLen = static_cast<size_t>(keyIdx.len);
  if (size == 0 ||
      cOff > size || cLen > size - cOff ||
      kOff > size || kLen > size - kOff)
    return "";

  vector<char> content(size);
  source.read(&content[0], size);
  pContent = string(&content[0] + cOff, cLen);
  return string(&content[0] + kOff, kLen);
}
/*
Returns the content of the first meta information whose key equals ~name~
(ignoring case), or "" if there is none.
*/
string HTML::getMetaInfo(string name){
  __TRACE__
  string content;
  int k = 0;
  while (k < getNumberOfMetainfos()){
    const string key = getMetainfo(k, content);
    if (isEqual(key, name))
      return content;
    k++;
  }
  return "";
}
/*
find all Metainfos in ~content~ and append them to
the attributes ~metainfoContents~ and ~metainfoKeys~
*/
// Scans ~content~ for meta elements and records, for every meta element
// in which two of the attributes "name"/"content" are found, the position of
// the name value in ~metainfoKeys~ and of the content value in
// ~metainfoContents~. Scanning stops at "/head" or at the end of the input.
void HTML::getMetaInfos(const string& content){
// __TRACE__
string attname;
flobindex ikey, icontent;
int symbol=0;
string value("");;
stringstream ss (content);
WebLex lexer (&ss);
// the two attributes of a meta element we are interested in
vector<string> attributes;
attributes.push_back("content");
attributes.push_back("name");
//cout << "getMeta Content " << content << endl;
symbol=lexer.startElement(attname);
// __TRACE__
while (symbol){
//cout << "getMeta Content " << attname << endl;
// meta information is only expected inside the head
if (isEqual (attname, "/head"))
return;
if (symbol== EIDENTIFIER && isEqual (attname, "meta")){
// __TRACE__
string tmp("");
// first attribute: either name or content
if (lexer.findAttribute(attributes,value,tmp)){
//cout << "--" << value << endl;
if (isEqual(tmp,"name")){
ikey= lexer.setPos(value, content);
}else{
icontent= lexer.setPos(value, content);
}
// second attribute; the pair is stored only if it is found as well
// NOTE(review): if both attributes found have the same name, one of
// ikey/icontent is appended without having been assigned here - confirm.
if (lexer.findAttribute(attributes,value,tmp)){
if (isEqual(tmp,"name")){
ikey= lexer.setPos(value, content);
}else{
icontent= lexer.setPos(value, content);
}
metainfoContents.Append (icontent);
metainfoKeys.Append (ikey);
}
}
}
if (isEqual(attname,"script")){
//cout << "******* Treffer **********" << endl;
// skip the CONTENT tokens of a script; the first other token becomes
// the current element
lexer.switchStartCond(RSCRIPT);
symbol= lexer.nextToken();
attname= lexer.getVal();
while(symbol == CONTENT){
symbol= lexer.nextToken();
attname=lexer.getVal();
}
}else{
symbol=lexer.startElement(attname);
}
}
}
/*
Returns the number of occurrences of the element in this object.
*/
/*
Counts how often the element ~element~ (compared ignoring case) occurs in
the page. Returns 0 if the object is not valid.
*/
int HTML::getNumberOf(string element){
  __TRACE__
  stringstream ss (getContent());
  WebLex lexer (&ss);
  if (!valid)
    return 0;

  lexer.switchStartCond(RELEM_WA);
  int hits = 0;
  for (int token = lexer.nextToken(); token; token = lexer.nextToken()){
    const string tag = lexer.getVal();
    __TRACE__
    // skip the content that follows the tag
    lexer.readContent();
    if (isEqual(tag, element))
      hits++;
  }
  return hits;
}
/*
analyse Structure of html object
*/
// Recursively walks the token stream of ~lexer~ and appends the names of
// the encountered elements to ~al~.
//   maxdepth : elements are only recorded while depth <= maxdepth;
//              a negative value disables the limit.
//   depth    : current nesting level; incremented on entry and decremented
//              on the regular exit path only (the early returns below leave
//              it incremented).
//   al       : output list receiving the element names.
//   error    : set to -1 when an unexpected token is met; a non-zero value
//              also stops the main loop.
//   symbol   : last token read; shared with the recursive calls and set to 0
//              when "/html" is reached.
void HTML::analyseStructure(WebLex& lexer, int maxdepth, int& depth,
AnalyseList& al, int& error, int& symbol){
// __TRACE__
int sym1=0;
string element;
lexer.switchStartCond (RELEM_WA);
//cout << "***** Rein " << tiefe << " ********* " << endl;
depth++;
//cout << "nextToken 1" << endl;
symbol= lexer.nextToken();
//cout << "analyse: " << symbol << endl;
// skip leading tokens with code 10000 (the meaning of this code is not
// visible in this function)
while (symbol == 10000){
symbol= lexer.nextToken();
//cout << "analyse: " << symbol << endl;
}
while (symbol && !error){
//cout << "TAG Name: " << lexer.getVal() << " " << symbol << endl;
// only element, comment, standalone-element and closing-element tokens
// are accepted here
if (symbol != ELEMENT && symbol !=COMMENT &&
symbol != ELEMENT_SA && symbol !=ELEMENT_CLOSE){
error=-1;
cout << "1 ERROR " << lexer.getVal() << symbol << endl;
return ;
}
// a value starting with '/' must have been classified as ELEMENT_CLOSE
if (symbol != ELEMENT_CLOSE && lexer.getVal()[0] == '/'){
cout << " 2 ERROR " << lexer.getVal() << " " << symbol << endl;
error=-1;
return ;
}
element= lexer.getVal();
// the closing html tag ends the analysis; symbol=0 also stops the callers
if (isEqual(element, "/html")){
symbol=0;
return;
}else{
//cout << "endetest:" << element << endl;
}
//Read content of current element <tag>content<....
// it may be empty <tag><tag>
//cout << "nextToken 2" << endl;
// readContent() has to deliver CONTENT; a result of 0 is treated as end
// of input (silent return), anything else as an error
if ((sym1=lexer.readContent()) != CONTENT){
symbol=sym1;
if (!symbol)
return;
cout << "3 ERROR CONTENT" << lexer.getVal() << symbol << endl;
error =-1;
return ;
}
//cout << "Content " << lexer.getVal() << endl;
// a closing tag finishes this nesting level: fetch the next token for the
// caller and leave the loop
if (symbol == ELEMENT_CLOSE){
//cout << "nextToken 3" << endl;
symbol=lexer.nextToken();
//cout << "Element_close " << element << endl;
element= element.substr (1);
break;
}
if (symbol == ELEMENT){
//we have to check every single standalone html attribute
// (these elements are not recorded and do not open a new nesting level;
// "img" and "param" are listed twice, which has no effect)
if (isEqual (element,"area") || isEqual (element,"base") ||
isEqual (element,"basefont") || isEqual (element,"br") ||
isEqual (element,"col") || isEqual (element,"frame") ||
isEqual (element,"hr") || isEqual (element,"img") ||
isEqual (element,"img") || isEqual (element,"input") ||
isEqual (element,"isindex") || isEqual (element,"link") ||
isEqual (element,"meta") || isEqual (element,"param") ||
isEqual (element,"param")){
//cout << "SA Element " << element << endl;
//cout << "nextToken 4" << endl;
symbol= lexer.nextToken();
}else{
// record the element (subject to the depth limit) and descend; the
// recursive call leaves the next unprocessed token in symbol
if ((depth <= maxdepth) ||maxdepth < 0)
al.push_back ( element );
analyseStructure(lexer, maxdepth, depth, al, error, symbol);
//cout << "Zurck " << symbol << endl;
}
} else if (symbol == ELEMENT_SA || symbol == COMMENT){
// standalone elements and comments are recorded without recursion
//cout << "SA Element " << element << endl;
if ((depth <= maxdepth) || maxdepth < 0)
al.push_back ( element );
//cout << "nextToken 5" << endl;
symbol= lexer.nextToken();
}
else {
cout << "5 Error" << element << " " << symbol << endl;
error=-1;
return;
}
}
depth--;
return;
}
// Compares the element structure of this object with that of ~html~ and
// returns the number of matching elements divided by the size of the
// larger of the two element lists (0 if that list is empty or if one of
// the objects is not valid).
//   maxdepth     : passed to analyseStructure; limits the recorded depth.
//   respectOrder : if true the element lists are compared as produced by
//                  analyseStructure; otherwise both lists are first
//                  re-inserted into fresh lists via AnalyseList::add and
//                  those are compared.
// All lists live on the stack, so nothing is leaked on any return path
// (the former implementation allocated up to four lists with new and never
// released them). The error flag reported by analyseStructure is not
// evaluated, as before.
double HTML::similar(HTML *html, int maxdepth, bool respectOrder){
  __TRACE__
  if (!valid || html == 0 || !html->IsValid())
    return 0;

  int counter=0;
  int depth=0;
  int error=0;
  int symbol=0;

  // structure of this object
  AnalyseList ownStructure;
  string tmp1=getContent();
  stringstream ss1(tmp1);
  WebLex lexer (&ss1);
  analyseStructure(lexer, maxdepth, depth, ownStructure, error, symbol);

  // structure of the other object, reusing the lexer on a new stream
  depth=0;
  symbol=0;
  error=0;
  AnalyseList otherStructure;
  string tmp2 = html->getContent();
  stringstream ss2(tmp2);
  lexer.yyrestart(&ss2);
  analyseStructure(lexer, maxdepth, depth, otherStructure, error, symbol);

  AnalyseList ownAdded;
  AnalyseList otherAdded;
  AnalyseList *al1 = &ownStructure;
  AnalyseList *al2 = &otherStructure;
  AnalyseList::const_iterator it1,it2;

  if (!respectOrder){
    // rebuild both lists through add() and compare the rebuilt lists
    for (it1 = ownStructure.begin(); it1 != ownStructure.end(); it1++){
      ownAdded.add(it1->getElement());
    }
    for (it2 = otherStructure.begin(); it2 != otherStructure.end(); it2++){
      otherAdded.add(it2->getElement());
    }
    al1 = &ownAdded;
    al2 = &otherAdded;
  }

  // al1 always refers to the list with more entries
  if (al2->size() > al1->size()){
    __TRACE__
    AnalyseList *al = al2;
    al2 = al1;
    al1 = al;
  }

  it1= al1->begin();
  it2= al2->begin();
  while ((it1 != al1->end() && it2 !=al2->end())){
    if (isEqual(it1->getElement(), it2->getElement())){
      counter++;
      it1++;
      it2++;
    }else {
      if (!al1->find( it1, it2->getElement())){
        // element of the shorter list does not occur: skip it there
        if (!(it1 != al1->end() && it2 !=al2->end())){
          break;
        }
        it2++;
      }else {
        // element occurs in the longer list: advance there
        if (!(it1 != al1->end() && it2 !=al2->end())){
          break;
        }
        it1++;
      }
    }
  }
  __TRACE__
  if (al1->size() == 0)
    return (double) 0;
  return (double) counter / (double) al1->size();
}
// Takes over validity, modification time, source URL, content and the
// three index arrays (urls, metainfoKeys, metainfoContents) from ~h~.
// Nothing is changed when ~h~ is not defined.
// The content is fetched and written once; the former code performed the
// identical resize/write sequence twice.
// NOTE(review): entries are stored with Put at the same index; entries
// that this object holds beyond the size of the arrays of ~h~ are not
// removed here - confirm whether callers rely on fresh objects.
void HTML::Set(const HTML &h)
{
  __TRACE__
  if (!h.IsDefined())
    return;

  valid= h.IsValid();
  defined=true;

  DateTime d = h.getLastModified();
  lastChange.SetType(instanttype);
  lastChange.Set(d.GetYear(),d.GetMonth(), d.GetGregDay(), d.GetHour(),
                 d.GetMinute(), d.GetSecond(),d.GetMillisecond());

  URL u(h.getSource());
  sourceURL.Set(true,u);

  // content is stored including its terminating zero byte
  string c = h.getContent();
  source.resize (c.length() +1 );
  source.write(c.c_str(),c.length()+1);

  FlobIndex tmp;
  const DbArray<FlobIndex> *tmpArray=0;
  int i=0;

  tmpArray=h.getURLS();
  for (i=0; i < tmpArray->Size();i++){
    tmpArray->Get(i,tmp);
    urls.Put(i, tmp);
  }
  tmpArray=h.getMetainfoKeys();
  for (i=0; i < tmpArray->Size();i++){
    tmpArray->Get(i,tmp);
    metainfoKeys.Put( i, tmp);
  }
  tmpArray=h.getMetainfoContents();
  for (i=0; i < tmpArray->Size();i++){
    tmpArray->Get(i,tmp);
    metainfoContents.Put( i, tmp);
  }
}
int HTML::NumOfFLOBs() const{
__TRACE__
return 7;
}
// Maps a Flob index to the corresponding member: 0 content, 1 urls,
// 2 metainfo keys, 3 metainfo contents, 4 embedded urls, 5 and 6 the two
// Flobs of the source URL. Any other index yields NULL.
Flob *HTML::GetFLOB(const int i){
  // __TRACE__
  //assert (i < NumOfFLOBs());
  switch (i){
    case 0:  return &source;
    case 1:  return &urls;
    case 2:  return &metainfoKeys;
    case 3:  return &metainfoContents;
    case 4:  return &emburls;
    case 5:  return sourceURL.GetFLOB(0);
    case 6:  return sourceURL.GetFLOB(1);
    default: return NULL;
  }
}
size_t HTML::Sizeof() const{
return sizeof(HTML);
}
// No ordering is implemented: every comparison reports 0.
int HTML::Compare(const Attribute*) const
{
  const int noOrdering = 0;
  return noOrdering;
}
// Adjacency is not implemented: the answer is always false.
bool HTML::Adjacent (const Attribute*) const
{
  return false;
}
// Stores the defined flag of this object.
void HTML::SetDefined(bool d)
{
  __TRACE__
  this->defined = d;
}
// Read-only access to the index array of the urls.
const DbArray<FlobIndex>* HTML::getURLS() const
{
  const DbArray<FlobIndex>* urlIndex = &urls;
  return urlIndex;
}
// Read-only access to the index array of the metainfo keys.
const DbArray<FlobIndex>* HTML::getMetainfoKeys() const
{
  const DbArray<FlobIndex>* keyIndex = &metainfoKeys;
  return keyIndex;
}
// Read-only access to the index array of the metainfo contents.
const DbArray<FlobIndex>* HTML::getMetainfoContents() const
{
  const DbArray<FlobIndex>* contentIndex = &metainfoContents;
  return contentIndex;
}
bool HTML::IsValid() const{
return valid;
}
void HTML::CopyFrom(const Attribute* right)
{
__TRACE__
const HTML *r = (const HTML *)right;
lastChange = r->getLastModified();
source.resize( r->source.getSize() );
char bin[r->source.getSize()];
r->source.read(bin, r->source.getSize() );
source.write( bin, r->source.getSize());
sourceURL.setProtocol( r->getSource().getProtocol());
sourceURL.setHost( r->getSource().getHost());
sourceURL.setPath( r->getSource().getPath());
defined = r->IsDefined();
valid=true;
tiefe=0;
urls.clean();
metainfoKeys.clean();
metainfoContents.clean();
getMetaInfos(bin);
getUrls(bin);
}
// No hashing is implemented: every object hashes to 0.
size_t HTML::HashValue(void) const
{
  const size_t constantHash = 0;
  return constantHash;
}
// Read-only access to the index array of the embedded urls.
const DbArray<FlobIndex>* HTML::getEmbededURLS() const
{
  const DbArray<FlobIndex>* embeddedIndex = &emburls;
  return embeddedIndex;
}
/*
3.3 Class ~Page~
----
Example to create an object:
let page1 = [const page value ((html ((instant (10 10 2006 10 27 18)) <file>/home/sopra/secondo/Algebras/Web/bilder.htm</file---> (url ("http" <text>www.myimages.de</text---> <text>/</text---> )))) ((url ("http" <text>Garten-1.jpg</text---> <text>/</text---> )) <file>/home/sopra/secondo/Algebras/Web/Garten-1.jpg</file---> "image/jpeg")( (url ("http" <text>Garten-2.jpg</text---> <text>/</text---> )) <file>/home/sopra/secondo/Algebras/Web/Garten-2.jpg</file---> "image/jpeg"))]
----
*/
// A Page is an HTML object plus a list of embedded objects. Each embedded
// object consists of an url, its data and a mime type; the three parts are
// stored as strings in three Flobs, and one DbArray per Flob records offset
// and length of every entry.
class Page : public HTML
{
public:
Page(){}
~Page(){}
Page(const string &s);
Page(const HTML &);
Page(const Page &);
// Builds the page from already loaded content and fetches the embedded
// objects it accepts (see checkEmbUrl) through getFromWeb.
Page(const URL &url, string &mime, string &binFile, DateTime &dt);
// Compares the embedded objects (url, mime, data) of two pages.
bool operator== (const Page& h) const;
HTML extractHTML();
// Number of embedded objects.
int numOfFiles() const;
// Accessors for the i-th embedded object.
URL getUrl(int i) const;
string getText( int i) const;
string getMime( int i) const;
// Appends one embedded object.
void addEmbObject(const URL &u, const string &mime, const string &s);
bool IsDefined() const;
void SetDefined(bool d) ;
Flob *GetFLOB(const int i);
int NumOfFLOBs() const;
// NOTE(review): HTML defines Sizeof() (lower-case 'o') in this file, so
// SizeOf() is a different name and does not replace it - confirm which
// one the framework calls.
size_t SizeOf() const;
int Compare(const Attribute*) const;
bool Adjacent (const Attribute*)const;
void CopyFrom(const Attribute *arg);
Page* Clone() const;
static const string BasicType() { return "page"; }
static const bool checkType(const ListExpr type){
return listutils::isSymbol(type, BasicType());
}
private:
/*Class ~HTTPSocket~
This inner class of Page is designed to encapsulate all details
of the socket's implementation and the page request, depending on the
http protocol. It is an inner private class because, up to now, the
page class is the only object connecting to the web.
*/
class HTTPSocket
{
public:
enum HTTPProtocol {HTTP_10, HTTP_11};
// Stores the request parameters and connects to webAddr:port.
HTTPSocket(string webAddr, string filePath, HTTPProtocol proto,
string port);
inline const string getServerAddress() {return WebAddr;}
//returns the string representation of a valid http get request
const string getGetRequest();
inline Socket * getSocket() {return s;}
// Evaluates the header lines of a response; true if it is usable.
bool parseHTTPResponse(vector<string> serverResponse);
inline string getContentType() {return contentType;}
inline int getContentLength() {return contentLength;}
inline DateTime getLastModified() {return lastModified;}
inline bool getSuccessResponded() {return successResponded;}
inline bool Close() { return s->Close();}
inline bool getChunked(){ return isChunked;}
private:
string WebAddr;
string FilePath;
HTTPProtocol Protocol;
string Port;
// values extracted from the response header
string contentType;
int contentLength;
DateTime lastModified;
DateTime responseDate;
bool successResponded;
bool isChunked;
// connection created in the constructor; this class declares no
// destructor, so it is not released here
Socket *s;
bool setLastModified(string s);
bool setResponseDate(string s);
DateTime setDateTime(string s);
string getMonthNumFromName(string monthName);
};
public:
// Requests ~url~ via http and returns the received data.
static string getFromWeb(URL url, string &mime, bool &MimeIsEqual,
DateTime &dt, bool onlyHtml = false);
private:
// Position and length of one entry inside one of the Flobs below.
struct FLOBIndex
{
int offset;
int len;
};
int numOfEmbeddedObjects;
// index/data pairs for the urls, the data and the mime types
DbArray<FLOBIndex> embUrlIds;
Flob embUrls;
DbArray<FLOBIndex> binIDs;
Flob binFiles;
DbArray<FLOBIndex> mimeIDs;
Flob mimeTypes;
bool allocateOneElem(int BytesOfData, int BytesOfURL, int BytesOfMime);
bool allocateSpaceInArray(DbArray<FLOBIndex> *dba, int numOfBytes);
URL getURLFromString(string &s) const;
// true if the path of the url ends in one of the accepted image suffixes
bool checkEmbUrl(URL &u);
static const int MAXBUFFERSIZE = 1000000;
};
/********************OVERWRITING ATTRIBUTE************************/
// A page is defined exactly when its HTML part is defined.
bool Page::IsDefined() const
{
  const bool htmlDefined = HTML::IsDefined();
  return htmlDefined;
}
void Page::SetDefined(bool d)
{
HTML::SetDefined(d);
}
// Flob lookup for a page: the indices below the number of page-owned
// Flobs address the page's own members, the following ones are shifted
// down and delegated to HTML::GetFLOB. Every other index yields NULL.
Flob* Page::GetFLOB(const int i)
{
#ifdef _DEBUG_JPS
  cout << "FLOB* Page::GetFLOB(const int i):" << i << endl;
  cout << HTML::NumOfFLOBs() << endl;
  cout << NumOfFLOBs() << endl;
#endif
  if (i >= (NumOfFLOBs() - HTML::NumOfFLOBs())){
    // not one of the page's own Flobs
    if (i >= NumOfFLOBs()){
      __TRACE__
      return NULL;
    }
    // __TRACE__
    return HTML::GetFLOB(i - (NumOfFLOBs() - HTML::NumOfFLOBs()));
  }
  switch (i)
  {
    case 0: return &embUrlIds;
    case 1: return &embUrls;
    case 2: return &binIDs;
    case 3: return &binFiles;
    case 4: return &mimeIDs;
    case 5: return &mimeTypes;
    default: break;
  }
  return NULL;
}
int Page::NumOfFLOBs() const
{
__TRACE__
return 6 + HTML::NumOfFLOBs();
}
size_t Page::SizeOf() const
{
return sizeof(Page);
}
// No ordering is implemented: every comparison reports 0.
int Page::Compare(const Attribute*) const
{
  const int noOrdering = 0;
  return noOrdering;
}
// Adjacency is not implemented: the answer is always false.
bool Page::Adjacent (const Attribute*) const
{
  const bool adjacent = false;
  return adjacent;
}
// Returns a heap-allocated copy of this page, created with the copy
// constructor.
Page* Page::Clone() const
{
  __TRACE__
  Page* duplicate = new Page( *this );
  return duplicate;
}
void Page::CopyFrom(const Attribute* right)
{
__TRACE__
const Page *r = (const Page *)right;
HTML::CopyFrom(right);
numOfEmbeddedObjects = 0;
for( int ii = 0; ii < r->numOfFiles(); ++ii)
{
addEmbObject(r->getUrl(ii), r->getMime(ii), r->getText(ii));
}
}
/*
3.2.1 Implementation of Class-Operations of ~Page~
*/
// Creates a page without embedded objects; ~s~ is handed to the HTML
// constructor and all index arrays and Flobs start with size 0.
Page::Page(const string &s)
  : HTML(s),
    numOfEmbeddedObjects(0),
    embUrlIds(0), embUrls(0),
    binIDs(0), binFiles(0),
    mimeIDs(0), mimeTypes(0)
{
#ifdef _DEBUG_JPS_3
  cout << "Page::Page(const string &s)" << endl;
#endif
  __TRACE__
}
// Creates a page from an HTML object. The result has no embedded
// objects: the counter is 0 and all index arrays and Flobs start with
// size 0. (Marked as not used by the original authors.)
Page::Page(const HTML &h)
  : HTML(h),
    numOfEmbeddedObjects(0),
    embUrlIds(0), embUrls(0),
    binIDs(0), binFiles(0),
    mimeIDs(0), mimeTypes(0)
{
#ifdef _DEBUG_JPS_3
  cout << "Page::Page(const HTML &h)" << endl;
#endif
  __TRACE__
}
// Copy constructor: copies the HTML part and then re-adds every embedded
// object of ~p~ to the initially empty storage of the new page.
Page::Page(const Page &p)
  : HTML(p),
    numOfEmbeddedObjects(0),
    embUrlIds(0), embUrls(0),
    binIDs(0), binFiles(0),
    mimeIDs(0), mimeTypes(0)
{
  __TRACE__
  int pos = 0;
  while (pos < p.numOfFiles())
  {
    addEmbObject(p.getUrl(pos), p.getMime(pos), p.getText(pos));
    ++pos;
  }
}
// Creates a page from already loaded content (~binFile~, ~dt~, ~url~ are
// passed to the HTML constructor). Afterwards every url found by the HTML
// part is inspected: if checkEmbUrl accepts it and its host is not
// "error", the object is fetched with getFromWeb and stored as embedded
// object.
Page::Page(const URL &url, string &mime, string &binFile, DateTime &dt)
  : HTML(dt, binFile, url),
    numOfEmbeddedObjects(0),
    embUrlIds(0), embUrls(0),
    binIDs(0), binFiles(0),
    mimeIDs(0), mimeTypes(0)
{
  __TRACE__
#ifdef _DEBUG_JPS
  cout << "Page::Page(const URL &url, string &mime,"
  " string &binFile, DateTime &dt) "
  << HTML::getNumberOfUrls() << endl;
#endif
  for (int i= 0; i < HTML::getNumberOfUrls(); i++)
  {
#ifdef _DEBUG_JPS
    cout << "Page::Page(const URL &url, string &mime,"
    " string &binFile, DateTime &dt) " << i<< endl;
#endif
    URL embUrl(HTML::getUrl(i));//getEmbUrl(i);
    if( !checkEmbUrl(embUrl) )
    {
      continue;
    }
    // receives the modification date of the embedded object; the value is
    // not used afterwards
    DateTime embDate;
    string theMime;
    bool mustBeEqual = false;
    if (embUrl.getHost() != "error")
    {
      string embCont = getFromWeb(embUrl, theMime, mustBeEqual, embDate);
      addEmbObject(embUrl, theMime, embCont);
    }
  }
}
// Decides by the suffix of the url's path whether it is fetched as
// embedded object: the text after the last '.' must be one of jpg, jpeg,
// gif, bmp, png or tif (exact, case-sensitive match). A path without '.'
// or with the only '.' at position 0 is rejected.
bool Page::checkEmbUrl(URL &u)
{
  string path = u.getPath();
  int dotPos = path.rfind(".");
  if (dotPos <= 0)
  {
    return false;
  }
  string suffix = path.substr(dotPos + 1);
  static const char* const accepted[] =
      { "jpg", "jpeg", "gif", "bmp", "png", "tif" };
  const int numAccepted = sizeof(accepted) / sizeof(accepted[0]);
  for (int k = 0; k < numAccepted; ++k)
  {
    if (suffix == accepted[k])
    {
      return true;
    }
  }
  return false;
}
// Two pages are equal here if they hold the same number of embedded
// objects and the objects agree pairwise in url, mime type and data.
// The HTML parts are not compared.
bool Page::operator== (const Page& h) const
{
  __TRACE__
  if (this->numOfFiles() != h.numOfFiles())
  {
    return false;
  }
  for (int idx = 0; idx < this->numOfFiles(); idx++)
  {
    // the accessors are const members, no cast is required to call them
    if (!(this->getUrl(idx) == h.getUrl(idx))) return false;
    if (!(this->getMime(idx) == h.getMime(idx))) return false;
    if (!(this->getText(idx) == h.getText(idx))) return false;
  }
  return true;
}
// Returns a copy of the HTML part of this page (the page-specific
// members are sliced away by the conversion to HTML).
HTML Page::extractHTML()
{
  __TRACE__
  const HTML& htmlPart = *this;
  return htmlPart;
}
// Number of embedded objects stored in this page.
int Page::numOfFiles() const
{
  __TRACE__
#ifdef _DEBUG_JPS_3
  cout << "Page::numOfFiles()" << numOfEmbeddedObjects <<endl;
#endif
  return this->numOfEmbeddedObjects;
}
// Returns the url of the i-th embedded object. For an index outside
// [0, numOfEmbeddedObjects) the placeholder URL("http","error","error")
// is returned (negative indices were formerly passed on to the index
// array).
// The stored string is read into a heap buffer with one additional zero
// byte: this replaces the non-standard variable-length stack array and
// guarantees termination even if the stored bytes lack the zero byte.
URL Page::getUrl(int i) const
{
  __TRACE__
  if (i < 0 || i >= numOfEmbeddedObjects)
  {
    return URL("http", "error", "error"); //TODO Handle this!
  }
  //Get the right url flobindex..
  FLOBIndex getThisUrl;
  embUrlIds.Get(i, getThisUrl);
  //..and get the url..
  const size_t len =
      (getThisUrl.len > 0) ? static_cast<size_t>(getThisUrl.len) : 0;
  vector<char> buffer(len + 1, '\0');
  embUrls.read(&buffer[0], len, getThisUrl.offset);
  string result(&buffer[0]);
  return getURLFromString(result);
}
// Returns the data of the i-th embedded object as stored by addEmbObject
// (a zero-terminated string). For an index outside
// [0, numOfEmbeddedObjects) an error text is returned (negative indices
// were formerly passed on to the index array).
// The data is read into a heap buffer with one additional zero byte: the
// former variable-length stack array is not standard C++ and could
// exhaust the stack for large objects.
string Page::getText( int i) const
{
  __TRACE__
  if (i < 0 || i >= numOfEmbeddedObjects)
  {
    return "Error Page::GetText wrong Index!"; //TODO Handle this!
  }
  //Get the right bin index..
  FLOBIndex getThisBin;
  binIDs.Get(i, getThisBin);
  //..and get the bin data..
  const size_t len =
      (getThisBin.len > 0) ? static_cast<size_t>(getThisBin.len) : 0;
  vector<char> buffer(len + 1, '\0');
  binFiles.read(&buffer[0], len, getThisBin.offset);
  // the string ends at the first zero byte, which was stored with the data
  string result(&buffer[0]);
  return result;
}
// Returns the mime type of the i-th embedded object. For an index
// outside [0, numOfEmbeddedObjects) an error text is returned (negative
// indices were formerly passed on to the index array).
// The string is read into a heap buffer with one additional zero byte,
// replacing the non-standard variable-length stack array and guaranteeing
// termination.
string Page::getMime( int i) const
{
  __TRACE__
  if (i < 0 || i >= numOfEmbeddedObjects)
  {
    return "Error Page::GetMime wrong Index!"; //TODO Handle this!
  }
  //Get the right mime index..
  FLOBIndex getThisMime;
  mimeIDs.Get(i, getThisMime);
  //..and get the mime string..
  const size_t len =
      (getThisMime.len > 0) ? static_cast<size_t>(getThisMime.len) : 0;
  vector<char> buffer(len + 1, '\0');
  mimeTypes.read(&buffer[0], len, getThisMime.offset);
  // the string ends at the first zero byte, which was stored with the data
  string result(&buffer[0]);
  return result;
}
//Stores embedded object, containing an url, the binaries and the mime-type
void Page::addEmbObject(const URL &u, const string &mime, const string &s)
{
__TRACE__
//If the new object is valdid..
if (u.IsDefined() && (s.size() > 0) && (mime.size() > 0))
{
//Create an easy to use string represantation of the url
string s_url = u.getProtocol() + "://" + u.getHost() + u.getPath();
if (allocateOneElem(s.size() +1, s_url.size()+1, mime.size()+1))
{
/******************URL**********************/
FLOBIndex insertUrlHere;
embUrlIds.Get(numOfEmbeddedObjects - 1, insertUrlHere);
embUrls.write(s_url.c_str(),
insertUrlHere.len + 1,
insertUrlHere.offset);
/******************MIME**********************/
FLOBIndex insertMimeHere;
mimeIDs.Get(numOfEmbeddedObjects - 1, insertMimeHere);
mimeTypes.write(mime.c_str(),
insertMimeHere.len + 1,
insertMimeHere.offset);
/******************BINARY**********************/
FLOBIndex insertBinHere;
binIDs.Get(numOfEmbeddedObjects - 1, insertBinHere);
binFiles.write(s.c_str(),
insertBinHere.len + 1,
insertBinHere.offset);
}
}
}
bool Page::allocateOneElem(int BytesOfData, int BytesOfURL, int BytesOfMime)
{
//Inc the number of embedded objects
__TRACE__
++numOfEmbeddedObjects;
//Prepare the bin and url DBArrays to take the new object..
__TRACE__
if (allocateSpaceInArray(&binIDs, BytesOfData)
&& allocateSpaceInArray(&embUrlIds, BytesOfURL)
&& allocateSpaceInArray(&mimeIDs, BytesOfMime))
{
//.. and allocate the right amount of memory in the flobs!
FLOBIndex resizeUrlIndex;
embUrlIds.Get(numOfEmbeddedObjects - 1, resizeUrlIndex);
embUrls.resize(embUrls.getSize() + resizeUrlIndex.len + 1);
FLOBIndex resizeBinIndex;
binIDs.Get(numOfEmbeddedObjects - 1, resizeBinIndex);
binFiles.resize(binFiles.getSize() + resizeBinIndex.len + 1);
FLOBIndex resizeMimeIndex;
mimeIDs.Get(numOfEmbeddedObjects - 1, resizeMimeIndex);
mimeTypes.resize(mimeTypes.getSize() + resizeMimeIndex.len + 1);
return true;
}
//Something went wrong - no element can be added (should not occur)!
--numOfEmbeddedObjects;
return false;
}
// Appends the index entry for the newest embedded object to ~dba~. The
// entry starts directly behind the previous entry (offset + length of
// entry numOfEmbeddedObjects - 2) or at offset 0 for the first object,
// and has the length ~numOfBytes~. Always returns true.
bool Page::allocateSpaceInArray(DbArray<FLOBIndex> *dba, int numOfBytes)
{
  __TRACE__
  int startOffset = 0;
  if (numOfEmbeddedObjects > 1)
  {
    __TRACE__
    FLOBIndex previous;
    dba->Get(numOfEmbeddedObjects - 2, previous);
    startOffset = previous.offset + previous.len;
  }
  else
  {
    __TRACE__
  }
  __TRACE__
  FLOBIndex entry;
  entry.offset = startOffset;
  entry.len = numOfBytes;
  dba->Append(entry);
  __TRACE__
  return true;
}
// Splits a string of the form <protocol>://<host>/<path> into its parts
// and returns the corresponding URL. Without a '/' behind the host the
// path is empty. If "://" is not found (searching from position 1) a
// default-constructed URL is returned.
// The fallback now returns a temporary; the former
// "return *(new URL())" leaked one heap object per call.
URL Page::getURLFromString(string &s) const
{
  int pos1 = s.find("://", 1);
  if (pos1 == (int)string::npos)
  {
    return URL();
  }
  string s_prot(""), s_myHost(""), s_path("");
  s_prot.append(s, 0, pos1);
  int pos2 = s.find("/", pos1 + 3);
  if (pos2 != (int)string::npos)
  {
    s_myHost.append(s, pos1+3, pos2 - (pos1 + 3));
    s_path.append(s, pos2, s.size());
  }
  else
  {
    s_myHost.append(s, pos1+3, s.size());
  }
  return URL(s_prot, s_myHost, s_path);
}
/*
3.2.1.1 If the Page as HTML instance is not defined and the content type is text/html,
the data will be used to fill the instance as an html object. Otherwise everything is interpreted as an embedded object of the page instance itself and is therefore added as an embedded object.
TODO: The return type must be defined - it will not be a string!
*/
// Requests ~url~ from its host on port 80 using HTTP/1.1 and returns the
// received body.
//   mime        : in: if not empty, the expected content type;
//                 out: content type reported by the server, or "error".
//   MimeIsEqual : in: if true, a content type that differs from a
//                 non-empty ~mime~ aborts the request with an empty result;
//                 out: true iff the final ~mime~ contains
//                 HTML::BasicType().
//   dt          : out: last-modified date from the response header.
//   onlyHtml    : if true, a response whose content type does not contain
//                 HTML::BasicType() is aborted with an empty result.
// If MimeIsEqual is true on return the raw body is returned, otherwise
// the body (or the text "not found" for an empty body) is returned base64
// encoded.
// Changes compared to the former version: a null socket is not
// dereferenced, and a failed read on the socket stream ends the transfer.
// Formerly the read results were ignored ("if (true)"), so the loops kept
// running on a dead stream as long as the socket reported IsOk(). The
// socket is closed once behind the loop.
// NOTE(review): the Socket obtained by HTTPSocket is closed but never
// deleted here - confirm who owns it.
string Page::getFromWeb(URL url, string &mime, bool &MimeIsEqual,
                        DateTime &dt, bool onlyHtml)
{
  __TRACE__
  //Set the HTTP Protocol
  HTTPSocket::HTTPProtocol httpProt;
  httpProt = HTTPSocket::HTTP_11;
  //Get an Instance of the HTTPSocket class..
  HTTPSocket httpSock(url.getHost(), url.getPath(), httpProt, "80");
  //TODO: only http supported!
  //..and use the os independent socket!
  Socket *s = httpSock.getSocket();
  //Get the corresponding http GET request as a string..
  string req = httpSock.getGetRequest();
  string result("");
  if (s != 0 && s->IsOk())
  {
    //..and write it to the socket!
    iostream& io = s->GetSocketStream();
    io << req << endl;
    string line("");
    bool readyForBinData = false;
    vector<string> serverResponse;
    int size = 0;
    int packetsize = 0;
    char byte = 0x00;
    while(s->IsOk())
    {
      if (!readyForBinData) //Server http response not completely received yet..
      {
        if (!getline(io,line))
        {
          break; //stream failed before the header was complete
        }
        //..response finalized (a line consisting of "\r" only)..
        if (line.find("\r") == 0) //..parse it!
        {
          readyForBinData = httpSock.parseHTTPResponse(serverResponse);
          if (!readyForBinData)
          {
            result = "not ready for response";
            mime = "error";
            Base64 b;
            string binBytes;
            b.encode( result.c_str(), result.size(), binBytes );
            httpSock.Close();
            return binBytes;
          }
          if (mime.size() > 0) //stops and returns if different mime types
          {
            if((mime.find(httpSock.getContentType(), 0) == string::npos))
            {
              if (MimeIsEqual)
              {
                MimeIsEqual = false;
                httpSock.Close();
                return "";
              }
              MimeIsEqual = false;
            }
          }
          if( onlyHtml )
          {
            mime = httpSock.getContentType();
            if((mime.find(HTML::BasicType()) == string::npos)){
              MimeIsEqual = false;
              httpSock.Close();
              return "";
            }
            onlyHtml = false;
          }
          if( !httpSock.getChunked())
          {
            result.reserve(httpSock.getContentLength()+1);
          }
        }
        else //..append the line to the server's response!
        {
          serverResponse.push_back(line);
        }
      }
      else //..receive the binary data!
      {
        if(httpSock.getChunked() && packetsize<=0)
        {
          //next line holds the size of the following chunk (hexadecimal)
          if (!getline(io,line))
          {
            break; //stream failed between two chunks
          }
          if(line.length()>1) //perhaps empty line
          {
            //files come in packets of n-bytes
            packetsize = (int)strtol(line.c_str(),NULL,16);
            if(!packetsize){break;} //chunk of size 0 ends the body
            result.reserve(result.size() + packetsize);
          }
        }
        else
        {
          if (io.get(byte))
          {
            result += byte;
            size++;
            if(httpSock.getChunked()) --packetsize;
          }
          else
          {
            //no more data available; the socket is closed behind the loop
            break;
          }
          if ((httpSock.getContentLength() > 0) &&
              (size >= httpSock.getContentLength())) {break;}
        }
      }
    }
    mime = httpSock.getContentType();
    dt = httpSock.getLastModified();
    httpSock.Close();
    __TRACE__
  }
  MimeIsEqual = false;
  if( mime.find(HTML::BasicType()) != string::npos)
  {
    MimeIsEqual = true;
  }
  if( !MimeIsEqual )
  {
    //binary data encode base64
    if( !result.size() )
    {
      result = "not found";
      mime = "error";
    }
    Base64 b;
    string binBytes;
    b.encode( result.c_str(), result.size(), binBytes );
    return binBytes;
  }
  else { return result;}
}
/*
3.2.1 Implementation of Class-Operations of ~HTTPSocket~ - private inner class of Page
*/
/*
3.2.1.1 Allocates an OS-dependent socket and offers an instance of the
abstract Socket type, hiding the OS dependency.
*/
// Stores the request parameters, resets the response fields (no content
// type, content length -1, no success, not chunked), types both dates as
// instants and connects to webAddr:port.
Page::HTTPSocket::HTTPSocket(string webAddr, string filePath,
                             HTTPProtocol proto, string port)
  : WebAddr(webAddr),
    FilePath(filePath),
    Protocol(proto),
    Port(port),
    contentType(""),
    contentLength(-1),
    successResponded(false),
    isChunked(false)
{
  lastModified.SetType(instanttype);
  responseDate.SetType(instanttype);
  this->s = Socket::Connect(webAddr , port);
}
/*
3.2.1.2 Returns the http get request as const string.
*/
// Builds the request text: the GET line for FilePath with the selected
// protocol version, followed by the Host line (address and port), each
// ended by "\r\n".
const string Page::HTTPSocket::getGetRequest()
{
  string request("GET ");
  request += FilePath;
  if (Protocol == HTTP_10)
  {
    request += " HTTP/1.0";
  }
  else
  {
    request += " HTTP/1.1";
  }
  request += "\r\nHost: ";
  request += WebAddr;
  request += ":";
  request += Port;
  request += "\r\n";
  return request;
}
/*
3.2.1.3 Extracts the relevant items out of the strings given
by the vector. Will return true if there is no error transmitted
by the server.
Example:
HTTP/1.1 200 OK
Server: Apache/1.3.29 (Unix) PHP/4.3.4
Content-Length: (Größe von infotext.html in Byte)
Last-Modified: Sat, 28 Oct 2006 18:40:44 GMT
Content-Language: de
Content-Type: text/html
Connection: close
*/
// Evaluates the header lines of a server response and stores status,
// content length, transfer encoding, content type, last-modified date and
// response date in this object.
// Returns true if a line containing both "200" and "OK" was seen, a
// content type is known or the transfer is chunked, and a Date header was
// read. If no Last-Modified header was present the response date is used
// instead.
// Changes compared to the former version:
//  - Content-Length and Content-Type values are extracted without
//    string::assign at a fixed position, which threw std::out_of_range
//    for lines ending directly behind the colon;
//  - "Content-Length:123" (no blank) is parsed completely;
//  - the stored content type no longer carries trailing white space; the
//    lines passed in by getFromWeb still end with '\r'.
bool Page::HTTPSocket::parseHTTPResponse(vector<string> serverResponse)
{
  bool gotLastMod = false;
  bool gotDate = false;
  for (vector<string>::iterator iter = serverResponse.begin();
       iter != serverResponse.end(); iter++)
  {
    const string& header = *iter;
    //Protocol and error code..
    if (header.find("HTTP/1.0", 0) != string::npos)
    {
#ifdef _DEBUG_JPS
      cout << "found HTTP/1.0 " << endl;
#endif
    }
    else if (header.find("HTTP/1.1", 0) != string::npos)
    {
#ifdef _DEBUG_JPS
      cout << "found HTTP/1.1 " << endl;
#endif
    }
    if ((header.find("200", 0) != string::npos) &&
        (header.find("OK", 0) != string::npos))
    {
      successResponded = true;
#ifdef _DEBUG_JPS
      cout << "success " << endl;
#endif
    }
    else if (header.find("Content-Length:", 0) != string::npos)
    {
      const string::size_type colon = header.find(":", 14);
      if (colon != string::npos)
      {
        // c_str() + colon + 1 is at most the terminating zero byte;
        // strtol skips leading blanks and stops at the first non-digit
        contentLength = strtol(header.c_str() + colon + 1, 0, 10);
#ifdef _DEBUG_JPS
        cout << "contentLength: " << contentLength << endl;
#endif
      }
    }
    else if (header.find("Transfer-Encoding: chunked", 0) != string::npos)
    {
      isChunked = true;
      contentLength = -1;
#ifdef _DEBUG_JPS
      cout << "CHUNKED: contentLength: " << contentLength << endl;
#endif
    }
    else if (header.find("Content-Type:", 0) != string::npos)
    {
      if (header.find("text/html", 13) != string::npos)
      {
        contentType = "text/html";
#ifdef _DEBUG_JPS
        cout << "contentType = text/html" << endl;
#endif
      }
      else //save the Content Type without deeper interpretation!
      {
        // value starts behind "Content-Type:" and optional blanks
        const string::size_type namePos = header.find("Content-Type:", 0);
        const string::size_type first =
            header.find_first_not_of(" \t", namePos + 13);
        const string::size_type last = header.find_last_not_of(" \t\r\n");
        if (first != string::npos && last != string::npos && first <= last)
        {
          contentType = header.substr(first, last - first + 1);
        }
        else
        {
          contentType = "";
        }
      }
    }
    else if (header.find("Connection:", 0) != string::npos)
    { //TODO!
      if (header.find("close", 11) != string::npos)
      {}
      else if (header.find("keep-alive", 11) != string::npos)
      {}
    }
    else if (header.find("Last-Modified: ", 0) != string::npos)
    {
      gotLastMod = setLastModified(header);
    }
    else if (header.find("Date: ", 0) != string::npos)
    {
      gotDate = setResponseDate(header);
    }
  }
  if (successResponded && ((contentType.size() > 0) || isChunked) && gotDate)
  {
    if (!gotLastMod) lastModified = responseDate;
    __TRACE__
#ifdef _DEBUG_JPS
    cout << "parseHTTPResponse E N D E true!" << endl;
#endif
    return true;
  }
#ifdef _DEBUG_JPS
  cout << "parseHTTPResponse E N D E false!" << endl;
#endif
  return false;
}
// Converts the date of a "Date:" header line and stores it as response
// date. Always reports true.
bool Page::HTTPSocket::setResponseDate(string s)
{
  this->responseDate = setDateTime(s);
#ifdef _DEBUG_JPS
  cout << "responseDate: " << responseDate.ToString() << endl;
#endif
  const bool stored = true;
  return stored;
}
// Converts a header line containing a date of the form
//   <DayName>, <day> <MonthName> <year> hh:mm:ss GMT
// into an instant. The text between ", " and " GMT" is cut out, its
// parts are rearranged to <year>-<month>-<day>-hh:mm:ss and the result is
// handed to DateTime::ReadFrom.
// NOTE(review): the parts are taken from fixed positions; a line that is
// too short makes substr throw std::out_of_range.
DateTime Page::HTTPSocket::setDateTime(string s)
{
  DateTime result;
  result.SetType(instanttype);
  int commaPos = s.find(",", 0);
  int gmtPos = s.find("GMT", 0);
  int dateLength = gmtPos - commaPos - 3;
  // "<day> <MonthName> <year> hh:mm:ss"
  string dtStr = s.substr(commaPos + 2, dateLength);
#ifdef _DEBUG_JPS_4
  cout << "dtStr.assign: |" << dtStr << "|" << endl;
#endif
  //will be used to create a DateTime string!
  string dtFormattedString("");
  //year: four characters starting at position 7
  string dtElem = dtStr.substr(7, 4);
  dtFormattedString += dtElem + "-";
#ifdef _DEBUG_JPS_4
  cout << "year: |" << dtElem << "|" << endl;
#endif
  //month: number belonging to the month name contained in the text
  dtElem = getMonthNumFromName(dtStr);
  dtFormattedString += dtElem + "-";
#ifdef _DEBUG_JPS_4
  cout << "month: |" << dtElem << "|" << endl;
#endif
  //day: the first two characters
  dtElem = dtStr.substr(0, 2);
  dtFormattedString += dtElem + "-";
#ifdef _DEBUG_JPS_4
  cout << "day: |" << dtElem << "|" << endl;
#endif
  //hour:minutes:seconds: eight characters starting at position 12
  dtElem = dtStr.substr(12, 8);
  dtFormattedString += dtElem;
  result.ReadFrom(dtFormattedString);
#ifdef _DEBUG_JPS_4
  cout << "h:m:s: |" << dtElem << "|" << endl;
  cout << "secondo datetime: |" << dtFormattedString << "|" << endl;
  cout << "dateTime: " << result.ToString() << endl;
#endif
  return result;
}
// Converts the date of a "Last-Modified:" header line and stores it as
// modification date. Always reports true.
bool Page::HTTPSocket::setLastModified(string s)
{
  this->lastModified = setDateTime(s);
#ifdef _DEBUG_JPS
  cout << "lastModified: " << lastModified.ToString() << endl;
#endif
  const bool stored = true;
  return stored;
}
// Returns the number ("1" .. "12", without leading zero) of the first
// month, in calendar order, whose English three-letter abbreviation occurs
// anywhere in ~monthName~; the empty string if none occurs.
string Page::HTTPSocket::getMonthNumFromName(string monthName)
{
  static const char* const abbreviation[12] =
      { "Jan", "Feb", "Mar", "Apr", "May", "Jun",
        "Jul", "Aug", "Sep", "Oct", "Nov", "Dec" };
  static const char* const number[12] =
      { "1", "2", "3", "4", "5", "6",
        "7", "8", "9", "10", "11", "12" };
  for (int m = 0; m < 12; ++m)
  {
    if (monthName.find(abbreviation[m], 0) != std::string::npos)
    {
      return number[m];
    }
  }
  return "";
}
/*
4 In/Out, Checking Functions and Type Construction of URL
4.1 List Representation and In/Out Functions of ~URL~
Example: The list representation of a URL is
STRING First, text Second, text Third
where First is the protocol, e.g. http or ftp
Second is the host, e.g. "//www.google.de"
Third is the path, e.g. /
*/
// Out function of type url: returns the nested list produced by
// URL::ToListExpr(false) for the object stored in ~value~.
ListExpr
OutURL( ListExpr typeInfo, Word value )
{
  __TRACE__
  URL* urlObject = (URL*)(value.addr);
  return urlObject->ToListExpr(false);
}
// In function of type url. ~instance~ has to be a list of three atoms:
// a string (protocol), a text (host) and a text (path). A leading "//"
// of the host is removed. On success ~correct~ is set to true and a new
// URL is returned; otherwise the problems are reported through
// ErrorReporter, ~correct~ is set to false and a null address is
// returned.
Word
InURL( const ListExpr typeInfo, const ListExpr instance,
       const int errorPos, ListExpr& errorInfo, bool& correct )
{
  __TRACE__
  if ( nl->ListLength( instance ) != 3 )
  {
    ErrorReporter::ReportError("Wrong number of"
    " params, expecting protocol,host,path");
    correct = false;
    return SetWord(Address(0));
  }

  ListExpr First = nl->First(instance);
  ListExpr Second = nl->Second(instance);
  ListExpr Third = nl->Third(instance);
  const bool wellTyped =
         nl->IsAtom(First) && nl->AtomType(First) == StringType
      && nl->IsAtom(Second) && nl->AtomType(Second) == TextType
      && nl->IsAtom(Third) && nl->AtomType(Third) == TextType;

  if ( !wellTyped )
  {
    // report every violated condition separately
    if( !nl->IsAtom(First)) ErrorReporter::ReportError("First not an atom");
    if( !nl->IsAtom(Second)) ErrorReporter::ReportError("Second not an atom");
    if( !nl->IsAtom(Third)) ErrorReporter::ReportError("Third not an atom");
    if (!(nl->AtomType(First) == StringType))
      ErrorReporter::ReportError("First not a StringType");
    if (!(nl->AtomType(Second) == TextType))
      ErrorReporter::ReportError("Second not a TextType");
    if (!(nl->AtomType(Third) == TextType))
      ErrorReporter::ReportError("Third not a TextType");
    correct = false;
    return SetWord(Address(0));
  }

  string prot = nl->StringValue(First);
  string host = nl->Text2String(Second);
  string path = nl->Text2String(Third);
  if( host.length() >= 2 && host[0] == '/' && host[1] == '/')
  {
    // drop the leading "//"
    host = host.c_str() + 2;
  }
  correct = true;
  URL* newUrl = new URL(prot, host, path);
  return SetWord(newUrl);
}
// Create function of type url: a new URL built from the string "http://".
Word
CreateURL( const ListExpr typeInfo )
{
  __TRACE__
  URL* fresh = new URL( "http://" );
  return (SetWord( fresh ));
}
void
DeleteURL( const ListExpr typeInfo, Word& w )
{
__TRACE__
// ((URL*)w.addr)->destroy();
delete (URL *)w.addr;
w.addr = 0;
}
void
CloseURL( const ListExpr typeInfo, Word& w )
{
__TRACE__
delete (URL *)w.addr;
// w.addr = 0;
}
// Clone function of type url: returns the result of URL::Clone for the
// object stored in ~w~.
Word
CloneURL( const ListExpr typeInfo, const Word& w )
{
  __TRACE__
  URL* original = (URL *)w.addr;
  return SetWord( original->Clone() );
}
int
SizeOfURL()
{
__TRACE__
return sizeof(URL);
}
/*
4.2 Kind Checking Function and Property of ~URL~
This function checks whether the type constructor is applied correctly.
*/
// Kind check of type url: the type expression has to be the symbol
// returned by URL::BasicType().
bool
CheckURL( ListExpr type, ListExpr& errorInfo )
{
  __TRACE__
  const bool isUrlSymbol = nl->IsEqual( type, URL::BasicType() );
  return isUrlSymbol;
}
// Property function of type url: a list of two five-element lists, the
// first holding the column titles, the second the values describing the
// type.
// The remark text is assembled from two string literals; the blank
// between "path" and "type" was missing, so the text read "pathtype".
ListExpr
URLProperty()
{
  __TRACE__
  return (nl->TwoElemList(
      nl->FiveElemList(nl->StringAtom("Signature"),
                       nl->StringAtom("Example Type List"),
                       nl->StringAtom("List Rep"),
                       nl->StringAtom("Example List"),
                       nl->StringAtom("Remarks")),
      nl->FiveElemList(nl->StringAtom("-> DATA"),
                       nl->StringAtom(URL::BasicType()),
                       nl->StringAtom("(<protocol> <host> <path>)"),
                       nl->StringAtom("(http //dict.leo.org /)"),
                       nl->StringAtom("prot.: STRING<46 bytes, host, path "
                                      "type text."))));
}
void* CastURL( void* addr ) {return (new (addr) URL);}
/*
4.3 Creation of the Type Constructor Instance of ~URL~
*/
// Type constructor object for type url; it bundles the support functions
// defined above.
TypeConstructor url( URL::BasicType(), // name of the type
URLProperty, // property function
OutURL, InURL, // out and in function
0, 0, // two slots left empty (presumably optional list save/restore hooks - confirm against TypeConstructor)
CreateURL, DeleteURL, // create and delete function
OpenAttribute<URL>, SaveAttribute<URL>, // generic open and save function
CloseURL, CloneURL, // close and clone function
CastURL, SizeOfURL, // cast and size function
CheckURL ); // kind checking function
/*
5 In/Out, Checking Functions and Type Construction of HTML
5.1 List Representation and In/Out Functions of ~HTML~
Example: The list representation of a HTML is
List format: ( datetime text url )
Attributes: LastChange, source, sourceURL
Example:
----
let html1 = [const html value ((instant (10 10 2006 10 27 18)) <text>test</text---> (url ("http" <text>www.xx.de</text---> <text>/</text---> )))]
----
*/
// Out function of html: returns the nested list produced by
// HTML::ToListExpr(false) for the given value.
ListExpr
OutHTML( ListExpr typeInfo, Word value )
{
  __TRACE__
  HTML* doc = static_cast<HTML*>( value.addr );
  return doc->ToListExpr( false );
}
// In function of html.
//
// Expects instance to be a three-element list
//   ( (instant <date>)  <text atom, base64 encoded>  (url <url value>) )
// and builds a new HTML object from it.
//
// On success, correct is set to true and the new HTML is returned in the
// Word. On any structural error, an error is reported through
// ErrorReporter and a Word holding address 0 is returned; correct is set
// to false (in the url case by InURL).
//
// Changes compared with the former version:
//  - the URL object delivered by InURL is released after the HTML has been
//    built from it (it used to be leaked);
//    NOTE(review): this assumes the HTML constructor copies the URL, as the
//    sibling code in InPage assumes for addEmbObject - confirm.
//  - the decode buffer is a std::vector instead of an unchecked malloc;
//  - a negative decode result is caught by the assertion instead of being
//    used as an array index.
Word
InHTML( const ListExpr typeInfo, const ListExpr instance,
        const int errorPos, ListExpr& errorInfo, bool& correct )
{
  __TRACE__
  if ( nl->ListLength( instance ) != 3 )
  {
    __TRACE__
    ErrorReporter::ReportError("Wrong number of params, expecting"
      " lastModified,source,sourceUrl");
    correct = false;
    return SetWord(Address(0));
  }

  ListExpr First = nl->First(instance);    //DateTime
  ListExpr Second = nl->Second(instance);  //Text (FLOB)
  ListExpr Third = nl->Third(instance);    //URL

  const bool wellFormed =
       nl->ListLength( First ) == 2
    && nl->IsEqual(nl->First(First), Instant::BasicType())
    && nl->IsAtom(Second) && nl->AtomType(Second) == TextType
    && nl->ListLength( Third ) == 2
    && nl->IsEqual(nl->First(Third), URL::BasicType());

  if ( !wellFormed )
  {
    __TRACE__
    // report the first violated condition
    if( nl->ListLength( First ) != 2 )
      ErrorReporter::ReportError("First not an list of length 2");
    else if( !nl->IsAtom(Second))
      ErrorReporter::ReportError("Second not an atom");
    else if( nl->ListLength( Third ) != 2)
      ErrorReporter::ReportError("Third not a list of length 2");
    else if (!(nl->IsEqual(nl->First(First), Instant::BasicType())))
      ErrorReporter::ReportError("First not an instant");
    else if (!(nl->AtomType(Second) == TextType))
      ErrorReporter::ReportError("Second not a TextType");
    else //if (!(nl->IsEqual(nl->First(Third), URL::BasicType())))
      ErrorReporter::ReportError("Third not a url");
    correct = false;
    return SetWord(Address(0));
  }

  DateTime date(instanttype);
  date.ReadFrom(First,true);

  // the text atom carries the page source base64 encoded
  string text = nl->Text2String(Second);
  __TRACE__
  Base64 b;
  int sizeDecoded = b.sizeDecoded( text.size() );
  // one extra byte for the terminating 0; never allocate an empty buffer
  const std::size_t bufferSize =
      sizeDecoded > 0 ? static_cast<std::size_t>(sizeDecoded) + 1 : 1;
  std::vector<char> bytes( bufferSize, '\0' );
  int result = b.decode( text, &bytes[0] );
  assert( result >= 0 && result <= sizeDecoded );
  bytes[result] = 0;
  // as before, the decoded data is taken as a C string (up to the first 0)
  text = &bytes[0];
  __TRACE__

  correct = true;
  Word u = InURL( Third, nl->Second(Third),errorPos,errorInfo, correct );
  if( !correct )
  {
    ErrorReporter::ReportError("Error in reading url in InHTML");
    return SetWord(Address(0));
  }

  URL *parsedUrl = (URL*)u.addr;
  HTML* newHtml = new HTML(date, text, *parsedUrl);
  // the temporary URL created by InURL is no longer needed
  delete parsedUrl;
  return SetWord(newHtml);
}
// Create function of the html type constructor: allocates a new HTML
// constructed from the empty string and returns it wrapped in a Word.
Word
CreateHTML( const ListExpr typeInfo )
{
  __TRACE__
  HTML* fresh = new HTML( "" );
  return SetWord( fresh );
}
void
DeleteHTML( const ListExpr typeInfo, Word& w )
{
__TRACE__
delete (HTML *)w.addr;
w.addr = 0;
}
void
CloseHTML( const ListExpr typeInfo, Word& w )
{
__TRACE__
delete (HTML *)w.addr;
w.addr = 0;
}
// Clone function of the html type constructor: returns the result of
// HTML::Clone() for the HTML stored in w, wrapped in a Word.
Word
CloneHTML( const ListExpr typeInfo, const Word& w )
{
  __TRACE__
  HTML* original = static_cast<HTML*>( w.addr );
  return SetWord( original->Clone() );
}
int
SizeOfHTML()
{
__TRACE__
return sizeof(HTML);
}
/*
5.2 Kind Checking Function and Property of ~HTML~
This function checks whether the type constructor is applied correctly.
*/
// Kind check for html: the type expression is accepted only if it equals
// the basic type name of HTML.
bool
CheckHTML( ListExpr type, ListExpr& errorInfo )
{
  __TRACE__
  const bool isHtmlType = nl->IsEqual( type, HTML::BasicType() );
  return isHtmlType;
}
// Property function of html: returns a two-element list consisting of the
// five column headings and the five matching descriptions of the type.
ListExpr
HTMLProperty()
{
  __TRACE__
  // first row: the headings
  ListExpr headings = nl->FiveElemList(
      nl->StringAtom("Signature"),
      nl->StringAtom("Example Type List"),
      nl->StringAtom("List Rep"),
      nl->StringAtom("Example List"),
      nl->StringAtom("Remarks"));

  // second row: the values describing html
  ListExpr details = nl->FiveElemList(
      nl->StringAtom("-> DATA"),
      nl->StringAtom(HTML::BasicType()),
      nl->StringAtom("(<datetime: lastchange><text source> <url>)"),
      nl->StringAtom("(list representation)"),
      nl->StringAtom("url has the type url"));

  return nl->TwoElemList( headings, details );
}
void* CastHTML( void* addr ) {return (new (addr) HTML);}
/*
5.3 Creation of the Type Constructor Instance of ~HTML~
*/
// Type constructor instance for html, wiring together the functions
// defined above.
TypeConstructor html(
  HTML::BasicType(),                        // name of the type
  HTMLProperty,                             // property function
  OutHTML,                                  // out function
  InHTML,                                   // in function
  0,                                        // slot left empty
  0,                                        // slot left empty
  CreateHTML,                               // create
  DeleteHTML,                               // delete
  OpenAttribute<HTML>,                      // open
  SaveAttribute<HTML>,                      // save
  CloseHTML,                                // close
  CloneHTML,                                // clone
  CastHTML,                                 // cast
  SizeOfHTML,                               // sizeof
  CheckHTML );                              // kind check
/*
6 In/Out, Checking Functions and Type Construction of Page
5.1 List Representation and In/Out Functions of ~Page~
Example: The list representation of a Page is
List format: (html (url text string)*)
Attributes: html is inherited, (EmbeddedURL binFile mime)*
Example:
----
see at the top of the class Page
----
*/
// Out function of page: the resulting list starts with the list form of
// the HTML part (ToListExpr(true)); for every embedded file a
// three-element list (url text mime) is appended.
ListExpr
OutPage( ListExpr typeInfo, Word value )
{
  __TRACE__
  Page* thePage = static_cast<Page*>( value.addr );
  const int fileCount = thePage->numOfFiles();

  // head of the result; 'tail' always refers to the last list cell
  ListExpr head = nl->OneElemList(((HTML*)thePage)->ToListExpr(true));
  ListExpr tail = head;

  int idx = 0;
  while ( idx < fileCount )
  {
    __TRACE__
    ListExpr embedded = nl->ThreeElemList(
        thePage->getUrl(idx).ToListExpr(true),
        nl->TextAtom(thePage->getText(idx)),
        nl->StringAtom(thePage->getMime(idx)));
    tail = nl->Append( tail, embedded );
    ++idx;
  }
  __TRACE__
  return head;
}
// In function of page.
//
// Expects instance to be a list whose first element is an html value
// (html <html value>), followed by any number of embedded objects of the
// form ( (url <url value>) <text atom> <string atom> ).
//
// On success, correct is true and the new Page is returned in the Word.
// On any error, an error is reported through ErrorReporter and a Word
// holding address 0 is returned.
//
// Changes compared with the former version:
//  - the temporary HTML produced by InHTML is released once the Page has
//    been constructed from it (it used to be leaked);
//    NOTE(review): assumes Page(const HTML&) copies its argument - confirm.
//  - the partially built Page is released on every error path;
//  - the url element of an embedded object is verified to be a list with
//    at least two elements before nl->First / nl->Second are applied to it;
//  - a malformed embedded object now reports an error and every error path
//    sets correct to false.
Word
InPage( const ListExpr typeInfo, const ListExpr instance,
        const int errorPos, ListExpr& errorInfo, bool& correct )
{
  __TRACE__
  if ( nl->ListLength( instance ) < 1
       || nl->ListLength( nl->First(instance) ) != 2
       || !nl->IsEqual(nl->First(nl->First(instance)), HTML::BasicType()))
  {
    __TRACE__
    ErrorReporter::ReportError("Wrong number of params or not a html"
      " as first, expecting html,(url,text, string)*");
    correct = false;
    return SetWord(Address(0));
  }

  ListExpr htmlList = nl->First(instance);  //html
  const int nrOfEmb = nl->ListLength(instance) - 1;
  correct = true;
  Word h = InHTML( htmlList, nl->Second(htmlList),
                   errorPos, errorInfo, correct );
  if( !correct )
  {
    __TRACE__
    ErrorReporter::ReportError("page has no correct html as first element");
    return SetWord(Address(0));
  }

  HTML *parsedHtml = (HTML*)h.addr;
  Page *newpage = new Page(*parsedHtml);
  // the temporary HTML created by InHTML is no longer needed
  delete parsedHtml;
  parsedHtml = 0;

  //now lists of (url text string)
  ListExpr remaining = nl->Rest(instance);
  for( int ii=0; ii < nrOfEmb; ii++)
  {
    ListExpr emblist = nl->First(remaining);
    remaining = nl->Rest(remaining);

    const bool wellFormed =
         nl->ListLength( emblist ) == 3
      && nl->ListLength( nl->First(emblist) ) >= 2
      && nl->IsEqual(nl->First(nl->First(emblist)), URL::BasicType())
      && nl->IsAtom(nl->Second(emblist))
      && nl->AtomType(nl->Second(emblist)) == TextType
      && nl->IsAtom(nl->Third(emblist))
      && nl->AtomType(nl->Third(emblist)) == StringType;

    if ( !wellFormed )
    {
      __TRACE__
      ErrorReporter::ReportError("emb obj has not"
        " the right list structure");
      correct = false;
      delete newpage;
      return SetWord(Address(0));
    }

    Word u = InURL( nl->First(emblist),
                    nl->Second(nl->First(emblist)),
                    errorPos, errorInfo, correct );
    if( !correct )
    {
      __TRACE__
      ErrorReporter::ReportError("emb obj has not"
        " the right list structure");
      correct = false;
      delete newpage;
      return SetWord(Address(0));
    }

    URL *embUrl = (URL*)u.addr;
    string text = nl->Text2String(nl->Second(emblist));
    string mime = nl->StringValue(nl->Third(emblist));
    newpage->addEmbObject(*embUrl,mime,text);
    delete embUrl;
    embUrl = NULL;
  }
  return SetWord(newpage);
}
// Create function of the page type constructor: allocates a new Page
// constructed from the empty string and returns it wrapped in a Word.
Word
CreatePage( const ListExpr typeInfo )
{
  __TRACE__
  Page* fresh = new Page( "" );
  return SetWord( fresh );
}
void
DeletePage( const ListExpr typeInfo, Word& w )
{
__TRACE__
delete (Page *)w.addr;
w.addr = 0;
}
void
ClosePage( const ListExpr typeInfo, Word& w )
{
__TRACE__
delete (Page *)w.addr;
w.addr = 0;
}
// Clone function of the page type constructor: returns the result of
// Page::Clone() for the Page stored in w, wrapped in a Word.
Word
ClonePage( const ListExpr typeInfo, const Word& w )
{
  __TRACE__
  Page* original = static_cast<Page*>( w.addr );
  return SetWord( original->Clone() );
}
int
SizeOfPage()
{
__TRACE__
return sizeof(Page);
}
/*
5.2 Kind Checking Function and Property of ~Page~
This function checks whether the type constructor is applied correctly.
*/
// Kind check for page: the type expression is accepted only if it equals
// the basic type name of Page.
bool
CheckPage( ListExpr type, ListExpr& errorInfo )
{
  __TRACE__
  const bool isPageType = nl->IsEqual( type, Page::BasicType() );
  return isPageType;
}
// Property function of page: returns a two-element list consisting of the
// five column headings and the five matching descriptions of the type.
ListExpr
PageProperty()
{
  __TRACE__
  // first row: the headings
  ListExpr headings = nl->FiveElemList(
      nl->StringAtom("Signature"),
      nl->StringAtom("Example Type List"),
      nl->StringAtom("List Rep"),
      nl->StringAtom("Example List"),
      nl->StringAtom("Remarks"));

  // second row: the values describing page
  ListExpr details = nl->FiveElemList(
      nl->StringAtom("-> DATA"),
      nl->StringAtom(Page::BasicType()),
      nl->StringAtom("(<html>(<url text string>)*)"),
      nl->StringAtom("(list representation)"),
      nl->StringAtom("<url text mimetype> are the embedded objects"));

  return nl->TwoElemList( headings, details );
}
void* CastPage( void* addr ) {return (new (addr) Page);}
/*
5.3 Creation of the Type Constructor Instance of ~Page~
*/
// Type constructor instance for page, wiring together the functions
// defined above.
TypeConstructor page(
  Page::BasicType(),                        // name of the type
  PageProperty,                             // property function
  OutPage,                                  // out function
  InPage,                                   // in function
  0,                                        // slot left empty
  0,                                        // slot left empty
  CreatePage,                               // create
  DeletePage,                               // delete
  OpenAttribute<Page>,                      // open
  SaveAttribute<Page>,                      // save
  ClosePage,                                // close
  ClonePage,                                // clone
  CastPage,                                 // cast
  SizeOfPage,                               // sizeof
  CheckPage );                              // kind check
/*
6 Creating Operators
6.1.1 Type Mapping of Operator ~protocol,host,filename~
*/
// Type mapping shared by protocol, host and filename:
//   url -> text
// Any other argument list yields the type error symbol.
ListExpr
protocolHostFilenameTypeMap( ListExpr args)
{
  __TRACE__
  const bool accepted =
       nl->ListLength(args) == 1
    && nl->IsEqual(nl->First(args), URL::BasicType());
  if ( !accepted )
    return nl->SymbolAtom(Symbol::TYPEERROR());
  return nl->SymbolAtom(FText::BasicType());
}
/*
6.1.1 Type Mapping of Operator ~source~
*/
// Type mapping of source:
//   html -> url,  page -> url
// Any other argument list yields the type error symbol.
ListExpr
sourceTypeMap( ListExpr args)
{
  __TRACE__
  if ( nl->ListLength(args) != 1 )
    return nl->SymbolAtom(Symbol::TYPEERROR());

  ListExpr only = nl->First(args);
  if ( nl->IsEqual(only, HTML::BasicType()) )
    return nl->SymbolAtom(URL::BasicType());
  if ( nl->IsEqual(only, Page::BasicType()) )
    return nl->SymbolAtom(URL::BasicType());
  return nl->SymbolAtom(Symbol::TYPEERROR());
}
/*
6.1.2 Type Mapping of Operator ~createurl~
*/
// Type mapping of createurl:
//   text -> url
// Any other argument list yields the type error symbol.
ListExpr
createurlTypeMap( ListExpr args)
{
  __TRACE__
  const bool accepted =
       nl->ListLength(args) == 1
    && nl->IsEqual(nl->First(args), FText::BasicType());
  if ( !accepted )
    return nl->SymbolAtom(Symbol::TYPEERROR());
  return nl->SymbolAtom(URL::BasicType());
}
/*
6.1.3 Type Mapping of Operator ~content~
*/
// Type mapping of content:
//   html -> text
// Any other argument list yields the type error symbol.
ListExpr
contentTypeMap( ListExpr args)
{
  __TRACE__
  const bool accepted =
       nl->ListLength(args) == 1
    && nl->IsEqual(nl->First(args), HTML::BasicType());
  if ( !accepted )
    return nl->SymbolAtom(Symbol::TYPEERROR());
  return nl->SymbolAtom(FText::BasicType());
}
/*
6.1.4 Type Mapping of Operator ~urls~
*/
// Type mapping of urls:
//   html -> (stream url),  page -> (stream url)
// Any other argument list yields the type error symbol.
ListExpr
urlsTypeMap( ListExpr args)
{
  __TRACE__
  if ( nl->ListLength(args) != 1 )
    return nl->SymbolAtom(Symbol::TYPEERROR());

  ListExpr only = nl->First(args);
  const bool htmlOrPage = nl->IsEqual(only, HTML::BasicType())
                       || nl->IsEqual(only, Page::BasicType());
  if ( !htmlOrPage )
    return nl->SymbolAtom(Symbol::TYPEERROR());

  return nl->TwoElemList(nl->SymbolAtom(Symbol::STREAM()),
                         nl->SymbolAtom(URL::BasicType()));
}
/*
6.1.5 Type Mapping of Operator ~containsurl~
*/
// Type mapping of containsurl:
//   html x url -> bool,  page x url -> bool
// Any other argument list yields the type error symbol.
ListExpr
containsurlTypeMap( ListExpr args)
{
  __TRACE__
  if ( nl->ListLength(args) != 2 )
    return nl->SymbolAtom(Symbol::TYPEERROR());

  ListExpr container = nl->First(args);
  ListExpr wanted = nl->Second(args);
  const bool accepted =
       ( nl->IsEqual(container, HTML::BasicType())
         || nl->IsEqual(container, Page::BasicType()) )
    && nl->IsEqual(wanted, URL::BasicType());

  if ( accepted )
    return nl->SymbolAtom(CcBool::BasicType());
  return nl->SymbolAtom(Symbol::TYPEERROR());
}
/*
6.1.6 Type Mapping of Operator ~last_modified~
----
----
*/
// Type mapping of lastmodified:
//   html -> instant
// Any other argument list yields the type error symbol.
ListExpr
lastmodifiedTypeMap( ListExpr args)
{
  __TRACE__
  const bool accepted =
       nl->ListLength(args) == 1
    && nl->IsEqual(nl->First(args), HTML::BasicType());
  if ( !accepted )
    return nl->SymbolAtom(Symbol::TYPEERROR());
  return nl->SymbolAtom(Instant::BasicType());
}
/*
6.1.7 Type Mapping of Operator ~metainfo~
*/
// Type mapping of metainfo:
//   html x string -> text
// Any other argument list yields the type error symbol.
ListExpr
metainfoTypeMap( ListExpr args)
{
  __TRACE__
  if ( nl->ListLength(args) != 2 )
    return nl->SymbolAtom(Symbol::TYPEERROR());

  ListExpr doc = nl->First(args);
  ListExpr key = nl->Second(args);
  const bool accepted = nl->IsEqual(doc, HTML::BasicType())
                     && nl->IsEqual(key, CcString::BasicType());
  if ( accepted )
    return nl->SymbolAtom(FText::BasicType());
  return nl->SymbolAtom(Symbol::TYPEERROR());
}
/*
6.1.8 Type Mapping of Operator ~metainfos~
*/
// Type mapping of metainfos:
//   html -> (stream (tuple ((Key string) (Content text))))
// Any other argument list yields the type error symbol.
ListExpr
metainfosTypeMap( ListExpr args)
{
  __TRACE__
  const bool accepted =
       nl->ListLength(args) == 1
    && nl->IsEqual(nl->First(args), HTML::BasicType());
  if ( !accepted )
    return nl->SymbolAtom(Symbol::TYPEERROR());

  // attribute list: (Key string) followed by (Content text)
  ListExpr keyAttr = nl->TwoElemList(
      nl->SymbolAtom("Key"),
      nl->SymbolAtom(CcString::BasicType()));
  ListExpr attrList = nl->OneElemList(keyAttr);
  ListExpr contentAttr = nl->TwoElemList(
      nl->SymbolAtom("Content"),
      nl->SymbolAtom(FText::BasicType()));
  nl->Append(attrList, contentAttr);

  ListExpr tupleType = nl->TwoElemList(
      nl->SymbolAtom(Tuple::BasicType()), attrList);
  return nl->TwoElemList(nl->SymbolAtom(Symbol::STREAM()), tupleType);
}
/*
6.1.9 Type Mapping of Operator ~number_of~
*/
// Type mapping of numberof:
//   html x string -> int
// Any other argument list yields the type error symbol.
ListExpr
numberofTypeMap( ListExpr args)
{
  __TRACE__
  if ( nl->ListLength(args) != 2 )
    return nl->SymbolAtom(Symbol::TYPEERROR());

  ListExpr doc = nl->First(args);
  ListExpr what = nl->Second(args);
  const bool accepted = nl->IsEqual(doc, HTML::BasicType())
                     && nl->IsEqual(what, CcString::BasicType());
  if ( accepted )
    return nl->SymbolAtom(CcInt::BasicType());
  return nl->SymbolAtom(Symbol::TYPEERROR());
}
/*
6.1.10 Type Mapping of Operator ~similar~
*/
// Type mapping of similar:
//   html x html x int x bool -> real
// Any other argument list yields the type error symbol.
ListExpr
similarTypeMap( ListExpr args)
{
  __TRACE__
  if ( nl->ListLength(args) != 4 )
    return nl->SymbolAtom(Symbol::TYPEERROR());

  ListExpr left = nl->First(args);
  ListExpr right = nl->Second(args);
  ListExpr depth = nl->Third(args);
  ListExpr ordered = nl->Fourth(args);

  const bool accepted =
       nl->IsEqual(left, HTML::BasicType())
    && nl->IsEqual(right, HTML::BasicType())
    && nl->IsEqual(depth, CcInt::BasicType())
    && nl->IsEqual(ordered, CcBool::BasicType());

  if ( accepted )
    return nl->SymbolAtom(CcReal::BasicType());
  return nl->SymbolAtom(Symbol::TYPEERROR());
}
/*
6.1.11 Type Mapping of Operator ~extracthtml~
*/
// Type mapping of extracthtml:
//   page -> html
// Any other argument list yields the type error symbol.
ListExpr
extracthtmlTypeMap( ListExpr args)
{
  __TRACE__
  const bool accepted =
       nl->ListLength(args) == 1
    && nl->IsEqual(nl->First(args), Page::BasicType());
  if ( !accepted )
    return nl->SymbolAtom(Symbol::TYPEERROR());
  return nl->SymbolAtom(HTML::BasicType());
}
/*
6.1.12 Type Mapping of Operator ~numoffiles~
*/
// Type mapping of numoffiles:
//   page -> int
// Any other argument list yields the type error symbol.
ListExpr
numoffilesTypeMap( ListExpr args)
{
  __TRACE__
  const bool accepted =
       nl->ListLength(args) == 1
    && nl->IsEqual(nl->First(args), Page::BasicType());
  if ( !accepted )
    return nl->SymbolAtom(Symbol::TYPEERROR());
  return nl->SymbolAtom(CcInt::BasicType());
}
/*
6.1.13 Type Mapping of Operator ~getfiles~
*/
// Type mapping of getfiles:
//   page -> (stream (tuple ((Source url) (Type string) (File <BinaryFile>))))
// Any other argument list yields the type error symbol.
ListExpr
getfilesTypeMap( ListExpr args)
{
  __TRACE__
  const bool accepted =
       nl->ListLength(args) == 1
    && nl->IsEqual(nl->First(args), Page::BasicType());
  if ( !accepted )
    return nl->SymbolAtom(Symbol::TYPEERROR());

  // attribute list, built front to back; 'tail' is the last list cell
  ListExpr sourceAttr = nl->TwoElemList(
      nl->SymbolAtom("Source"),
      nl->SymbolAtom(URL::BasicType()));
  ListExpr attrList = nl->OneElemList(sourceAttr);
  ListExpr tail = attrList;

  ListExpr typeAttr = nl->TwoElemList(
      nl->SymbolAtom("Type"),
      nl->SymbolAtom(CcString::BasicType()));
  tail = nl->Append(tail, typeAttr);

  ListExpr fileAttr = nl->TwoElemList(
      nl->SymbolAtom("File"),
      nl->SymbolAtom(BinaryFile::BasicType()));
  nl->Append(tail, fileAttr);

  ListExpr tupleType = nl->TwoElemList(
      nl->SymbolAtom(Tuple::BasicType()), attrList);
  return nl->TwoElemList(nl->SymbolAtom(Symbol::STREAM()), tupleType);
}
/*
6.1.14 Type Mapping of Operator ~wget~
*/
// Type mapping of wget:
//   url x bool x int x text [x (map url bool)]
//     -> (stream (tuple ((Source url) (Type string) (File <BinaryFile>))))
// The fifth argument is optional. If it is present but is not a
// (map url bool), an error naming the offending type is reported.
// Any other mismatch yields the type error symbol.
//
// Fix: the error message used to speak of "the second argument" and of a
// "(map -> bool)" although the fifth argument is checked against
// (map url bool).
ListExpr
wgetTypeMap( ListExpr args)
{
  __TRACE__
  const int argCount = nl->ListLength(args);
  if ( argCount == 4 || argCount == 5 )
  {
    ListExpr arg1 = nl->First(args);
    ListExpr arg2 = nl->Second(args);
    ListExpr arg3 = nl->Third(args);
    ListExpr arg4 = nl->Fourth(args);
    if( argCount == 5 )
    {
      ListExpr arg5 = nl->Fifth(args);
      if (nl->IsAtom(arg5)
          || nl->ListLength(arg5) != 3
          || !nl->IsEqual(nl->First(arg5), Symbol::MAP())
          || !nl->IsEqual(nl->Second(arg5), URL::BasicType())
          || !nl->IsEqual(nl->Third(arg5), CcBool::BasicType()) )
      {
        string out;
        nl->WriteToString(out, arg5);
        ErrorReporter::ReportError("Operator wget expects a "
          "(map url bool) as its fifth argument. "
          "The fifth argument provided "
          "has type '" + out + "' instead.");
        return nl->SymbolAtom(Symbol::TYPEERROR());
      }
    }
    __TRACE__
    if ( nl->IsEqual(arg1, URL::BasicType()) &&
         nl->IsEqual(arg2, CcBool::BasicType())
         && nl->IsEqual(arg3, CcInt::BasicType()) &&
         nl->IsEqual(arg4, FText::BasicType()))
    {
      __TRACE__
      // attribute list: (Source url) (Type string) (File <BinaryFile>)
      ListExpr attrList =
        nl->OneElemList(nl->TwoElemList(nl->SymbolAtom("Source"),
                        nl->SymbolAtom(URL::BasicType())));
      ListExpr lastAttrList = attrList;
      lastAttrList =
        nl->Append(lastAttrList,nl->TwoElemList(nl->SymbolAtom("Type"),
                   nl->SymbolAtom(CcString::BasicType())));
      nl->Append(lastAttrList,nl->TwoElemList(nl->SymbolAtom("File"),
                 nl->SymbolAtom(BinaryFile::BasicType())));
      return nl->TwoElemList(nl->SymbolAtom(Symbol::STREAM()),
        nl->TwoElemList(nl->SymbolAtom(Tuple::BasicType()),attrList));
    }
  }
  __TRACE__
  return nl->SymbolAtom(Symbol::TYPEERROR());
}
/*
6.1.15 Type Mapping of Operator ~pageget~
*/
// Type mapping of pageget:
//   url x bool x int x text [x (map url bool)]
//     -> (stream (tuple ((Source url) (Page page))))
// The fifth argument is optional. If it is present but is not a
// (map url bool), an error naming the offending type is reported.
// Any other mismatch yields the type error symbol.
//
// Fix: the error message used to speak of "the second argument" and of a
// "(map -> bool)" although the fifth argument is checked against
// (map url bool).
ListExpr
pagegetTypeMap( ListExpr args)
{
  __TRACE__
  const int argCount = nl->ListLength(args);
  if ( argCount == 4 || argCount == 5 )
  {
    ListExpr arg1 = nl->First(args);
    ListExpr arg2 = nl->Second(args);
    ListExpr arg3 = nl->Third(args);
    ListExpr arg4 = nl->Fourth(args);
    if( argCount == 5 )
    {
      ListExpr arg5 = nl->Fifth(args);
      if (nl->IsAtom(arg5)
          || nl->ListLength(arg5) != 3
          || !nl->IsEqual(nl->First(arg5), Symbol::MAP())
          || !nl->IsEqual(nl->Second(arg5), URL::BasicType())
          || !nl->IsEqual(nl->Third(arg5), CcBool::BasicType()) )
      {
        string out;
        nl->WriteToString(out, arg5);
        ErrorReporter::ReportError("Operator pageget expects a "
          "(map url bool) as its fifth argument. "
          "The fifth argument provided "
          "has type '" + out + "' instead.");
        return nl->SymbolAtom(Symbol::TYPEERROR());
      }
    }
    __TRACE__
    if ( nl->IsEqual(arg1, URL::BasicType()) &&
         nl->IsEqual(arg2, CcBool::BasicType())
         && nl->IsEqual(arg3, CcInt::BasicType()) &&
         nl->IsEqual(arg4, FText::BasicType()))
    {
      __TRACE__
      // attribute list: (Source url) (Page page)
      ListExpr attrList =
        nl->OneElemList(nl->TwoElemList(nl->SymbolAtom("Source"),
                        nl->SymbolAtom(URL::BasicType())));
      nl->Append(attrList,nl->TwoElemList(nl->SymbolAtom("Page"),
                 nl->SymbolAtom(Page::BasicType())));
      return nl->TwoElemList(nl->SymbolAtom(Symbol::STREAM()),
        nl->TwoElemList(nl->SymbolAtom(Tuple::BasicType()),attrList));
    }
  }
  __TRACE__
  return nl->SymbolAtom(Symbol::TYPEERROR());
}
/*
6.1.15 Type Mapping of Operator ~htmlget~
*/
// Type mapping of htmlget:
//   url x bool x int x text [x (map url bool)]
//     -> (stream (tuple ((Source url) (Html html))))
// The fifth argument is optional. If it is present but is not a
// (map url bool), an error naming the offending type is reported.
// Any other mismatch yields the type error symbol.
//
// Fix: the error message used to speak of "the second argument" and of a
// "(map -> bool)" although the fifth argument is checked against
// (map url bool).
ListExpr
htmlgetTypeMap( ListExpr args)
{
  __TRACE__
  const int argCount = nl->ListLength(args);
  if ( argCount == 4 || argCount == 5 )
  {
    ListExpr arg1 = nl->First(args);
    ListExpr arg2 = nl->Second(args);
    ListExpr arg3 = nl->Third(args);
    ListExpr arg4 = nl->Fourth(args);
    if( argCount == 5 )
    {
      ListExpr arg5 = nl->Fifth(args);
      if (nl->IsAtom(arg5)
          || nl->ListLength(arg5) != 3
          || !nl->IsEqual(nl->First(arg5), Symbol::MAP())
          || !nl->IsEqual(nl->Second(arg5), URL::BasicType())
          || !nl->IsEqual(nl->Third(arg5), CcBool::BasicType()) )
      {
        string out;
        nl->WriteToString(out, arg5);
        ErrorReporter::ReportError("Operator htmlget expects a "
          "(map url bool) as its fifth argument. "
          "The fifth argument provided "
          "has type '" + out + "' instead.");
        return nl->SymbolAtom(Symbol::TYPEERROR());
      }
    }
    __TRACE__
    if ( nl->IsEqual(arg1, URL::BasicType()) &&
         nl->IsEqual(arg2, CcBool::BasicType())
         && nl->IsEqual(arg3, CcInt::BasicType()) &&
         nl->IsEqual(arg4, FText::BasicType()))
    {
      __TRACE__
      // attribute list: (Source url) (Html html)
      ListExpr attrList =
        nl->OneElemList(nl->TwoElemList(nl->SymbolAtom("Source"),
                        nl->SymbolAtom(URL::BasicType())));
      nl->Append(attrList,nl->TwoElemList(nl->SymbolAtom("Html"),
                 nl->SymbolAtom(HTML::BasicType())));
      return nl->TwoElemList(nl->SymbolAtom(Symbol::STREAM()),
        nl->TwoElemList(nl->SymbolAtom(Tuple::BasicType()),attrList));
    }
  }
  __TRACE__
  return nl->SymbolAtom(Symbol::TYPEERROR());
}
/*
6.1.16 Type Mapping of Operator ~webequal =:~
*/
// Type mapping of webequal:
//   url x url -> bool,  html x html -> bool,  page x page -> bool
// Any other argument list yields the type error symbol.
ListExpr
webequalTypeMap( ListExpr args)
{
  __TRACE__
  if ( nl->ListLength(args) != 2 )
    return nl->SymbolAtom(Symbol::TYPEERROR());

  ListExpr lhs = nl->First(args);
  ListExpr rhs = nl->Second(args);

  // both operands have to be of the same one of the three web types
  if ( nl->IsEqual(lhs, URL::BasicType())
       && nl->IsEqual(rhs, URL::BasicType()) )
    return nl->SymbolAtom(CcBool::BasicType());
  if ( nl->IsEqual(lhs, HTML::BasicType())
       && nl->IsEqual(rhs, HTML::BasicType()) )
    return nl->SymbolAtom(CcBool::BasicType());
  if ( nl->IsEqual(lhs, Page::BasicType())
       && nl->IsEqual(rhs, Page::BasicType()) )
    return nl->SymbolAtom(CcBool::BasicType());

  return nl->SymbolAtom(Symbol::TYPEERROR());
}
/*
6.2 Value Mapping and Selection Functions
6.2.1 Value Mapping Function for Operator ~protocol~
*/
// Value mapping of protocol: writes the protocol part of the url argument
// as a defined text into the result object of the query processor.
int
protocolFun (Word* args, Word& result, int message, Word& local, Supplier s)
{
  __TRACE__
  URL* source = static_cast<URL*>( args[0].addr );
  // the query processor provides the object that takes the result
  result = qp->ResultStorage(s);
  FText* target = static_cast<FText*>( result.addr );
  // first parameter: the value is defined; second: the value itself
  target->Set( true, source->getProtocol().c_str() );
  return 0;
}
/*
6.2.2 Value Mapping Function for Operator ~host~
*/
// Value mapping of host: writes the host part of the url argument as a
// defined text into the result object of the query processor.
int
hostFun (Word* args, Word& result, int message, Word& local, Supplier s)
{
  __TRACE__
  URL* source = static_cast<URL*>( args[0].addr );
  // the query processor provides the object that takes the result
  result = qp->ResultStorage(s);
  FText* target = static_cast<FText*>( result.addr );
  // first parameter: the value is defined; second: the value itself
  target->Set( true, source->getHost().c_str() );
  return 0;
}
/*
6.2.3 Value Mapping Function for Operator ~filename~
*/
// Value mapping of filename: writes the path part of the url argument as
// a defined text into the result object of the query processor.
int
filenameFun (Word* args, Word& result, int message, Word& local, Supplier s)
{
  __TRACE__
  URL* source = static_cast<URL*>( args[0].addr );
  // the query processor provides the object that takes the result
  result = qp->ResultStorage(s);
  FText* target = static_cast<FText*>( result.addr );
  // first parameter: the value is defined; second: the value itself
  target->Set( true, source->getPath().c_str() );
  return 0;
}
/*
6.2.4 Value Mapping Function for Operator ~source~
*/
// Value mapping of source: copies the source URL of the html/page
// argument into the result object and marks it as defined.
int
sourceFun (Word* args, Word& result, int message, Word& local, Supplier s)
{
  __TRACE__
  HTML* doc = static_cast<HTML*>( args[0].addr );
  // the query processor provides the object that takes the result
  result = qp->ResultStorage(s);
  // local copy of the source URL; released automatically on return
  URL sourceCopy( doc->getSource() );
  __TRACE__
  // first parameter: the value is defined; second: the value itself
  static_cast<URL*>( result.addr )->Set( true, sourceCopy );
  __TRACE__
  return 0;
}
/*
6.2.5 Value Mapping Function for Operator ~createurl~
*/
// Value mapping of createurl: parses the text argument with
// URL::urlFromString and stores the outcome in the result object. The
// boolean returned by urlFromString becomes the defined flag of the result.
int
createurlFun (Word* args, Word& result, int message, Word& local, Supplier s)
{
  __TRACE__
  FText* input = static_cast<FText*>( args[0].addr );
  // the query processor provides the object that takes the result
  result = qp->ResultStorage(s);

  const char *raw = input->Get();
  string asString = raw;
  URL parsed("");
  // a url is returned for every input; 'ok' tells whether it is defined
  const bool ok = URL::urlFromString(asString, parsed);

  static_cast<URL*>( result.addr )->Set( ok, parsed );
  return 0;
}
/*
6.2.6 Value Mapping Function for Operator ~content~
*/
// Value mapping of content: writes the text of the html argument as a
// defined text into the result object of the query processor.
int
contentFun (Word* args, Word& result, int message, Word& local, Supplier s)
{
  __TRACE__
  HTML* doc = static_cast<HTML*>( args[0].addr );
  // the query processor provides the object that takes the result
  result = qp->ResultStorage(s);
  FText* target = static_cast<FText*>( result.addr );
  // first parameter: the value is defined; second: the value itself
  target->Set( true, doc->getText().c_str() );
  return 0;
}
/*
6.2.7 Value Mapping Function for Operator ~urls~
*/
// Value mapping of urls (stream operator).
//
//  OPEN:    allocates the iteration state (number of urls of the html
//           argument and the current position) and stores it in local.
//  REQUEST: yields a newly allocated copy of the next url, or CANCEL when
//           all urls have been delivered.
//  CLOSE:   releases the iteration state.
//
// Fix: CLOSE now resets local.addr after deleting the state and tolerates
// an empty local; REQUEST answers CANCEL if there is no state. Previously a
// second CLOSE deleted the same state twice and a REQUEST after CLOSE read
// freed memory.
int
urlsFun (Word* args, Word& result, int message, Word& local, Supplier s)
{
  __TRACE__
  HTML* h = ((HTML*)args[0].addr);
  struct UrlAdvance {int numberOf, current;};
  UrlAdvance* urladvance = 0;
  switch( message )
  {
    case OPEN:
      urladvance = new UrlAdvance;
      urladvance->current = 0;
      urladvance->numberOf = h->getNumberOfUrls();
      local.addr = urladvance;
      return 0;
    case REQUEST:
      urladvance = static_cast<UrlAdvance*>( local.addr );
      if ( urladvance && urladvance->current < urladvance->numberOf )
      {
        URL *elem = new URL((h->getUrl(urladvance->current++)));
        result.addr = elem;
        return YIELD;
      }
      else return CANCEL;
    case CLOSE:
      urladvance = static_cast<UrlAdvance*>( local.addr );
      if ( urladvance )
      {
        delete urladvance;
        local.addr = 0;   // guards against a repeated CLOSE
      }
      return 0;
  }
  /* should not happen */
  return -1;
}
/*
6.2.8 Value Mapping Function for Operator ~containsurl~
*/
// Value mapping of containsurl: stores the outcome of
// HTML::containsURL for the two arguments as a defined bool in the result.
int
containsurlFun (Word* args, Word& result, int message,
                Word& local, Supplier s)
{
  __TRACE__
  HTML* doc = static_cast<HTML*>( args[0].addr );
  URL* wanted = static_cast<URL*>( args[1].addr );
  // the query processor provides the object that takes the result
  result = qp->ResultStorage(s);
  CcBool* target = static_cast<CcBool*>( result.addr );
  // first parameter: the value is defined; second: the value itself
  target->Set( true, doc->containsURL(wanted) );
  return 0;
}
/*
6.2.9 Value Mapping Function for Operator ~lastmodified~
*/
// Value mapping of lastmodified: transfers the date and time components
// of the html argument's last-modified instant into the result object.
int
lastmodifiedFun (Word* args, Word& result, int message,
                 Word& local, Supplier s)
{
  __TRACE__
  HTML* doc = static_cast<HTML*>( args[0].addr );
  // the query processor provides the object that takes the result
  result = qp->ResultStorage(s);
  DateTime* target = static_cast<DateTime*>( result.addr );

  DateTime stamp = doc->getLastModified();
  target->Set( stamp.GetYear(),
               stamp.GetMonth(),
               stamp.GetGregDay(),
               stamp.GetHour(),
               stamp.GetMinute(),
               stamp.GetSecond(),
               stamp.GetMillisecond() );
  return 0;
}
/*
6.2.10 Value Mapping Function for Operator ~metainfo~
*/
// Value mapping of metainfo: looks up the meta information stored under
// the given key in the html argument and returns it as a defined text.
int
metainfoFun (Word* args, Word& result, int message, Word& local, Supplier s)
{
  __TRACE__
  HTML* doc = static_cast<HTML*>( args[0].addr );
  string key = StdTypes::GetString(args[1]);
  // the query processor provides the object that takes the result
  result = qp->ResultStorage(s);
  FText* target = static_cast<FText*>( result.addr );
  target->Set( true, doc->getMetaInfo(key).c_str() );
  return 0;
}
/*
6.2.11 Value Mapping Function for Operator ~metainfos~
*/
// Value mapping of metainfos (stream operator).
//
//  OPEN:    allocates the iteration state (number of meta infos, current
//           position, tuple type of the result) and stores it in local.
//  REQUEST: yields the next tuple [Key: string, Content: text], or CANCEL
//           when all meta infos have been delivered.
//  CLOSE:   releases the tuple type and the iteration state.
//
// Fixes:
//  - the key is copied into the fixed-size STRING_T buffer with a length
//    limit; keys longer than the buffer are truncated instead of
//    overflowing it (the former strcpy was unbounded);
//  - CLOSE resets local.addr and tolerates an empty local, REQUEST answers
//    CANCEL if there is no state, so a repeated CLOSE no longer deletes
//    the state twice.
int
metainfosFun (Word* args, Word& result, int message, Word& local, Supplier s)
{
  __TRACE__
  HTML* h = ((HTML*)args[0].addr);
  struct MiAdvance {int numberOf, current;
                    TupleType *resultTupleType;};
  MiAdvance* miAdvance = 0;
  switch( message )
  {
    case OPEN:
    {
      miAdvance = new MiAdvance;
      miAdvance->current = 0;
      miAdvance->numberOf = h->getNumberOfMetainfos();
      ListExpr resultType = GetTupleResultType( s );
      miAdvance->resultTupleType = new TupleType( nl->Second( resultType ));
      local.addr = miAdvance;
      return 0;
    }
    case REQUEST:
    {
      miAdvance = static_cast<MiAdvance*>( local.addr );
      if ( !miAdvance || miAdvance->current >= miAdvance->numberOf )
        return CANCEL;

      string content;
      string key = h->getMetainfo(miAdvance->current++,content);
      //make tuple [Key: string, Content: text]
      Tuple *elem = new Tuple( miAdvance->resultTupleType );
      STRING_T skey;
      // bounded copy; always terminated
      strncpy(skey, key.c_str(), sizeof(skey) - 1);
      skey[sizeof(skey) - 1] = '\0';
      CcString* cckey = new CcString(true,&skey);
      elem->PutAttribute(0,cckey);
      FText *t = new FText(true,content.c_str());
      elem->PutAttribute(1,t);
      result.addr = elem;
      return YIELD;
    }
    case CLOSE:
    {
      miAdvance = static_cast<MiAdvance*>( local.addr );
      if ( miAdvance )
      {
        miAdvance->resultTupleType->DeleteIfAllowed();
        delete miAdvance;
        local.addr = 0;   // guards against a repeated CLOSE
      }
      return 0;
    }
  }
  /* should not happen */
  return -1;
}
/*
6.2.12 Value Mapping Function for Operator ~numberof~
*/
// Value mapping of numberof: stores the result of HTML::getNumberOf for
// the given string as a defined int in the result object.
int
numberofFun (Word* args, Word& result, int message, Word& local, Supplier s)
{
  __TRACE__
  HTML* doc = static_cast<HTML*>( args[0].addr );
  string key = StdTypes::GetString(args[1]);
  // the query processor provides the object that takes the result
  result = qp->ResultStorage(s);
  CcInt* target = static_cast<CcInt*>( result.addr );
  target->Set( true, doc->getNumberOf(key) );
  return 0;
}
/*
6.2.13 Value Mapping Function for Operator ~similar~
*/
// Value mapping of similar: compares the two html arguments with
// HTML::similar, passing on the int and bool arguments, and stores the
// outcome as a defined real in the result object.
int
similarFun (Word* args, Word& result, int message, Word& local, Supplier s)
{
  __TRACE__
  HTML* left = static_cast<HTML*>( args[0].addr );
  HTML* right = static_cast<HTML*>( args[1].addr );
  int depth = StdTypes::GetInt(args[2]);
  bool followOrder = StdTypes::GetBool(args[3]);
  // the query processor provides the object that takes the result
  result = qp->ResultStorage(s);
  CcReal* target = static_cast<CcReal*>( result.addr );
  target->Set( true, left->similar(right, depth, followOrder) );
  __TRACE__
  return 0;
}
/*
6.2.14 Value Mapping Function for Operator ~extracthtml~
*/
// Value mapping of extracthtml: copies the HTML obtained from
// Page::extractHTML into the result object.
int
extracthtmlFun (Word* args, Word& result, int message,
                Word& local, Supplier s)
{
  __TRACE__
  Page* source = static_cast<Page*>( args[0].addr );
  // the query processor provides the object that takes the result
  result = qp->ResultStorage(s);
  HTML* target = static_cast<HTML*>( result.addr );
  HTML extracted( source->extractHTML() );
  target->Set( extracted );
  return 0;
}
/*
6.2.15 Value Mapping Function for Operator ~numoffiles~
*/
// Value mapping of numoffiles: stores the number of files of the page
// argument as a defined int in the result object.
int
numoffilesFun (Word* args, Word& result, int message,
               Word& local, Supplier s)
{
  __TRACE__
  Page* source = static_cast<Page*>( args[0].addr );
  // the query processor provides the object that takes the result
  result = qp->ResultStorage(s);
  CcInt* target = static_cast<CcInt*>( result.addr );
  target->Set( true, source->numOfFiles() );
  return 0;
}
/*
6.2.16 Value Mapping Function for Operator ~getfiles~
*/
// Value mapping of getfiles (stream operator).
//
//  OPEN:    allocates the iteration state (number of files of the page,
//           current position, tuple type of the result) and stores it in
//           local.
//  REQUEST: yields the next tuple [Source: url, Type: string, File: binfile]
//           or CANCEL when all files have been delivered. The file content
//           is obtained by BinaryFile::Decode from the stored text.
//  CLOSE:   releases the tuple type and the iteration state.
//
// Fixes:
//  - the MIME type is copied into the fixed-size STRING_T buffer with a
//    length limit; longer values are truncated instead of overflowing the
//    buffer (the former strcpy was unbounded);
//  - CLOSE resets local.addr and tolerates an empty local, REQUEST answers
//    CANCEL if there is no state, so a repeated CLOSE no longer deletes
//    the state twice.
int
getfilesFun (Word* args, Word& result, int message,
             Word& local, Supplier s)
{
  __TRACE__
  Page* p = ((Page*)args[0].addr);
  struct EmbAdvance {int numberOf, current;
                     TupleType *resultTupleType;};
  EmbAdvance* embAdvance = 0;
  switch( message )
  {
    case OPEN:
    {
      embAdvance = new EmbAdvance;
      embAdvance->current = 0;
      embAdvance->numberOf = p->numOfFiles();
      ListExpr resultType = GetTupleResultType( s );
      embAdvance->resultTupleType =
        new TupleType( nl->Second( resultType ) );
      local.addr = embAdvance;
      return 0;
    }
    case REQUEST:
    {
      embAdvance = static_cast<EmbAdvance*>( local.addr );
      if ( !embAdvance || embAdvance->current >= embAdvance->numberOf )
        return CANCEL;

      URL *u = new URL((p->getUrl(embAdvance->current)));
      string type = p->getMime( embAdvance->current);
      string src = p->getText( embAdvance->current++);
      //make tuple [Source: url, Type: string, File: binfile]
      Tuple *elem = new Tuple( embAdvance->resultTupleType );
      elem->PutAttribute(0,u);
      STRING_T stype;
      // bounded copy; always terminated
      strncpy(stype, type.c_str(), sizeof(stype) - 1);
      stype[sizeof(stype) - 1] = '\0';
      CcString* cctype = new CcString(true,&stype);
      elem->PutAttribute(1,cctype);
      BinaryFile *file = new BinaryFile( 0 );
      file->Decode(src);
      elem->PutAttribute(2,file);
      result.addr = elem;
      return YIELD;
    }
    case CLOSE:
    {
      embAdvance = static_cast<EmbAdvance*>( local.addr );
      if ( embAdvance )
      {
        embAdvance->resultTupleType->DeleteIfAllowed();
        delete embAdvance;
        local.addr = 0;   // guards against a repeated CLOSE
      }
      return 0;
    }
  }
  /* should not happen */
  return -1;
}
/*
6.2.17.1 class definitions for hashtable for operators wget and pageget
*/
// Case-insensitive set of strings, organised as a fixed number of hash
// buckets. Used to remember which URLs have been seen already.
//
// Changes compared with the former version:
//  - the buckets and the stored strings are held by value, so nothing is
//    leaked any more (the bucket vector was never deleted, and the
//    temporary string was leaked whenever a duplicate was found);
//  - copying a HashUrl is safe (the owning raw pointer is gone);
//  - characters are converted to unsigned char before toupper and before
//    being summed up for the hash value.
class HashUrl
{
  private:
    static const size_t NO_BUCKETS = 50;//255;

    // one list of stored (upper-case) strings per bucket
    std::vector< std::vector< std::string > > buckets;

    // Hash value: sum of all character codes modulo the number of buckets.
    size_t GetHashVal( const std::string& s ) const
    {
      size_t sum = 0;
      for( std::string::size_type i = 0; i < s.length(); i++ )
      {
        sum += static_cast<unsigned char>( s[i] );
      }
      return sum % buckets.size();
    }

  public:
    HashUrl() : buckets( NO_BUCKETS )
    {
    }

    // Checks whether s (compared case-insensitively) is already stored.
    // Returns true if it is; otherwise the upper-case form of s is
    // inserted and false is returned.
    bool IsDuplicate( std::string &s )
    {
      // as before, only the part of s up to a first 0 character counts
      std::string key( s.c_str() );
      for( std::string::size_type i = 0; i < key.length(); i++ )
      {
        key[i] = static_cast<char>(
                   toupper( static_cast<unsigned char>( key[i] ) ) );
      }

      std::vector< std::string >& bucket = buckets[ GetHashVal( key ) ];
      for( std::vector< std::string >::size_type i = 0;
           i < bucket.size(); i++ )
      {
        if( bucket[i] == key )
        {
          return true; // the strings are equal
        }
      }
      // nothing equal found, so remember the new string
      bucket.push_back( key );
      return false;
    }
};
/*
6.2.19 Selection functions for Operator ~wget, pageget, htmlget~
*/
// Selection function for wget, pageget and htmlget: index 0 for the
// variant with four arguments, index 1 for the variant with five
// arguments, -1 for any other number of arguments.
int webwget_pagegetSelect( ListExpr args)
{
  switch ( nl->ListLength(args) )
  {
    case 4:
      return 0;
    case 5:
      return 1;
    default:
      return -1; //This point should never be reached
  }
}
/*
6.2.17 Value Mapping Functions for Operator ~wget~
*/
// Progress record for one page: counters and current positions for the
// embedded objects and for the links of the page, plus the page itself.
struct PageAdvance
{
  int numberOfEmb;     // number of embedded objects
  int currentEmb;      // index of the next embedded object
  int numberOfLinks;   // number of links
  int currentLink;     // index of the next link
  Page *p;             // the page the counters refer to
};
int
wgetFun (Word* args, Word& result, int message, Word& local, Supplier s,
bool hasFunction)
{
ListExpr resultType;
struct GetAdvance {stack<PageAdvance*>* myDepthStack;
HashUrl *myHash; TupleType *resultTupleType;
int depth; bool isnew;; string *host;}* getAdvance;
__TRACE__
switch( message )
{
case OPEN:
{
__TRACE__
getAdvance = new GetAdvance;
getAdvance->myHash = new HashUrl;
getAdvance->myDepthStack = new stack<PageAdvance*>;
resultType = GetTupleResultType( s );
getAdvance->resultTupleType = new TupleType( nl->Second( resultType ) );
getAdvance->depth = 0;
getAdvance->isnew = true;
FText* t = ((FText*)args[3].addr);
URL* u = ((URL*)args[0].addr);
string s = t->Get();
if( s.length() > 0)
{
getAdvance->host = new string(u->getHost() + "," + t->Get());
}
else
{
getAdvance->host = new string(u->getHost());
}
local.addr = getAdvance;
}
return 0;
case REQUEST:
//cout << "In wget Request" << endl;
__TRACE__
{
getAdvance = ((GetAdvance*) local.addr);
PageAdvance *pa = NULL;
bool extLinks = StdTypes::GetBool(args[1]);
int depth = StdTypes::GetInt(args[2]);
bool isUnlimited = (depth < 0);
URL *exturl = NULL;
if( !getAdvance->myDepthStack->empty() )
{
pa = getAdvance->myDepthStack->top();
}
while( !exturl && pa)
{
__TRACE__
while ( pa && pa->currentEmb < pa->numberOfEmb )
{
__TRACE__
URL *u = new URL((pa->p->getUrl(pa->currentEmb)));
string type = pa->p->getMime( pa->currentEmb);
string src = pa->p->getText( pa->currentEmb++);
string hashstring = u->getProtocol() + ":"
+ u->getHost() + u->getPath();
if( !getAdvance->myHash->IsDuplicate(hashstring) )
{
cout << *u << endl;
//make tuple [Source: url, Type: string, File: binfile]
Tuple *elem = new Tuple( getAdvance->resultTupleType );
elem->PutAttribute(0,u);
STRING_T stype;
strcpy(stype, type.c_str());
CcString* cctype = new CcString(true,&stype);
elem->PutAttribute(1,cctype);
BinaryFile *file = new BinaryFile( 0 );
if( src.length() )
file->Decode(src);
elem->PutAttribute(2,file);
result.addr = elem;
return YIELD;
}
else
{
delete u;
u = 0;
}
}
//check if there is a link (a href) to load
//after the emb obj. are handelt
while( !exturl && pa && pa->currentLink < pa->numberOfLinks )
{
//check if the right host und check if the
//url is not loaded before with the hash.
//Also check of the function
bool hostOk = true;
URL *checkUrl = new URL((pa->p->getUrlHosts(pa->currentLink++,
*getAdvance->host,hostOk)));
cout << *checkUrl << endl;
if( checkUrl->IsDefined() && hostOk)
{
__TRACE__
string hashstring = checkUrl->getProtocol() + "://"
+ checkUrl->getHost() + checkUrl->getPath();
if(!getAdvance->myHash->IsDuplicate(hashstring))
{
cout << "Defined and host o.k. and not duplicate" << endl;
if( hasFunction )
{
ArgVectorPointer funargs = qp->Argument(args[4].addr);
(*funargs)[0] = SetWord(checkUrl);
Word funresult;
qp->Request(args[4].addr, funresult);
bool funerg;
if (((Attribute*)funresult.addr)->IsDefined())
{
funerg = ((CcBool*)funresult.addr)->GetBoolval();
}
else
funerg = false;
if( funerg)
{
exturl = checkUrl;
}
else
{
delete checkUrl;
checkUrl = NULL;
}
}
else
exturl = checkUrl;
}
else
{
delete checkUrl;
checkUrl = NULL;
}
}
else {
delete checkUrl;
checkUrl = NULL;
}
}
if( !exturl )
{
delete pa->p;
delete pa;
pa = 0;
getAdvance->myDepthStack->pop();
--getAdvance->depth;
if( !getAdvance->myDepthStack->empty() )
{
pa = getAdvance->myDepthStack->top();
}
}
}
if(getAdvance->isnew || exturl)
{
__TRACE__
//load the URL und make Page-Objekt if is HTML
//else return the loaded file
URL* u;
if(getAdvance->isnew)
{
__TRACE__
u = ((URL*)args[0].addr);
/*if( hasFunction )
{
ArgVectorPointer funargs = qp->Argument(args[4].addr);
(*funargs)[0] = args[0];
Word funresult;
qp->Request(args[4].addr, funresult);
bool funerg;
if (((Attribute*)funresult.addr)->IsDefined())
{
funerg = ((CcBool*)funresult.addr)->GetBoolval();
}
else
funerg = false;
if( !funerg)
{
return CANCEL;
}
}*/
string hashstring = u->getProtocol() + "://"
+ u->getHost() + u->getPath();
getAdvance->myHash->IsDuplicate(hashstring);
getAdvance->isnew = false;
exturl = new URL(*u);
}
u = exturl;
string type;// = "text/html";
bool isHtml = false;
DateTime dt;
cout << "load url from web" << endl;
string src = Page::getFromWeb(*u, type, isHtml, dt);
//cout << "ready loading url" << endl;
#ifdef _DEBUG_JPS_2
cout << "DEBUG_JPS_2" << src << "DEBUG_JPS_2 ends"<< endl;
#endif
Tuple *elem = new Tuple( getAdvance->resultTupleType );
elem->PutAttribute(0,u);
STRING_T stype;
strcpy(stype, type.c_str());
CcString* cctype = new CcString(true,&stype);
elem->PutAttribute(1,cctype);
if( !isHtml && (int)type.find(HTML::BasicType()) != -1)
isHtml = true;
cout << "isHTML: " << isHtml << ", " << type << endl;
BinaryFile *file;
if( isHtml )
{
file = new BinaryFile( src.length()+1 );
file->Put(0,src.length()+1,src.c_str());
}
else
{
file = new BinaryFile( 0 );
if( src.length() )
file->Decode(src);
}
elem->PutAttribute(2,file);
result.addr = elem;
if( isHtml)
{
__TRACE__
//make page object of the html data
//const char* s = 0;
//file->Get(0, &s);
//string str = s;
DateTime dt;
Page *p = new Page(*u, type, src, dt);
PageAdvance *pa = new PageAdvance();
pa->numberOfEmb = p->numOfFiles();
if( extLinks && (isUnlimited || getAdvance->depth < depth ))
pa->numberOfLinks = p->getNumberOfUrls();
else
pa->numberOfLinks = 0;
cout << "Links: " << pa->numberOfLinks << endl;
pa->currentEmb = 0;
pa->currentLink = 0;
pa->p = p;
++getAdvance->depth;
getAdvance->myDepthStack->push(pa);
}
return YIELD;
}
else
return CANCEL;
}
case CLOSE:
__TRACE__
{
getAdvance = ((GetAdvance*) local.addr);
delete getAdvance->myHash;
getAdvance->myHash = 0;
delete getAdvance->host;
getAdvance->host = 0;
while( !getAdvance->myDepthStack->empty())
{
PageAdvance *pa = getAdvance->myDepthStack->top();
if( pa->p)
delete pa->p;
delete pa;
pa = 0;
getAdvance->myDepthStack->pop();
}
delete getAdvance->myDepthStack;
getAdvance->myDepthStack = 0;
getAdvance->resultTupleType->DeleteIfAllowed();
delete getAdvance;
getAdvance = 0;
return 0;
}
}
/* should not happen */
return -1;
}
// Value mapping of ~wget~ for the four-argument signature: delegates to
// wgetFun with the filter function switched off.
int
ISWebWgetFourParam (Word* args, Word& result, int message,
Word& local, Supplier s)
{
  const bool withFilterFunction = false;
  return wgetFun(args, result, message, local, s, withFilterFunction);
}
// Value mapping of ~wget~ for the five-argument signature: delegates to
// wgetFun with the filter function (args[4]) switched on.
int
ISWebWgetFiveParam (Word* args, Word& result, int message,
Word& local, Supplier s)
{
  const bool withFilterFunction = true;
  return wgetFun(args, result, message, local, s, withFilterFunction);
}
/*
6.2.18 Value Mapping Function for Operator ~pageget, htmlget~
*/
// Stream value mapping shared by the operators ~pageget~ and ~htmlget~.
//
// Arguments as read by this function:
//   args[0]  url   start url
//   args[1]  bool  extLinks: follow the links found in loaded pages
//   args[2]  int   maximal depth, a negative value means unlimited
//   args[3]  text  further hosts, appended comma separated to the host
//                  of the start url
//   args[4]  map   filter function url -> bool, only used if hasFunction
//   onlyhtml       true: the tuple carries an html value and embedded
//                  objects are ignored; false: it carries a page value
//
// OPEN creates the crawl state and stores it in local.addr, REQUEST
// delivers one tuple (source url, page or html) per loaded HTML document
// and CLOSE releases the state. Pages are kept on a stack, so the links
// of the most recently loaded page are followed first.
int
pagegetFun (Word* args, Word& result, int message, Word& local, Supplier s,
bool hasFunction, bool onlyhtml)
{
  // crawl state living from OPEN to CLOSE
  struct GetAdvance {stack<PageAdvance*>* myDepthStack;
    HashUrl *myHash; TupleType *resultTupleType;
    int depth; bool isnew; string *host;}* getAdvance;
  __TRACE__
  switch( message )
  {
    case OPEN:
    {
      __TRACE__
      getAdvance = new GetAdvance;
      getAdvance->myHash = new HashUrl;
      getAdvance->myDepthStack = new stack<PageAdvance*>;
      ListExpr resultType = GetTupleResultType( s );
      getAdvance->resultTupleType = new TupleType( nl->Second( resultType ) );
      getAdvance->depth = 0;
      getAdvance->isnew = true;
      FText* t = ((FText*)args[3].addr);
      URL* u = ((URL*)args[0].addr);
      // host list handed to getUrlHosts: start host plus optional extras
      string extraHosts = t->Get();
      if( extraHosts.length() > 0)
        getAdvance->host = new string(u->getHost() + "," + extraHosts);
      else
        getAdvance->host = new string(u->getHost());
      local.addr = getAdvance;
    }
    return 0;
    case REQUEST:
    __TRACE__
    {
      getAdvance = ((GetAdvance*) local.addr);
      if( !getAdvance )
      {
        // no state available (REQUEST without OPEN or after CLOSE)
        return CANCEL;
      }
      PageAdvance *pa = NULL;
      bool extLinks = StdTypes::GetBool(args[1]);
      int depth = StdTypes::GetInt(args[2]);
      bool isUnlimited = (depth < 0);
      URL *exturl = NULL;
      while( getAdvance->isnew || !getAdvance->myDepthStack->empty() )
      {
        __TRACE__
        if( !getAdvance->myDepthStack->empty() )
          pa = getAdvance->myDepthStack->top();
        while( !exturl && pa)
        {
          __TRACE__
          // look for the next link (a href) of the top page that has an
          // accepted host, was not seen before and passes the filter
          while( !exturl && pa->currentLink < pa->numberOfLinks)
          {
            bool hostOk = true;
            URL *checkUrl = new URL((pa->p->getUrlHosts(pa->currentLink++,
                                     *getAdvance->host,hostOk)));
            cout << ".";
            bool accept = false;
            if( checkUrl->IsDefined() && hostOk)
            {
              string hashstring = checkUrl->getProtocol() + "://"
                            + checkUrl->getHost() + checkUrl->getPath();
              // NOTE(review): IsDuplicate presumably also records the key,
              // so a url rejected by the filter is not offered again
              if(!getAdvance->myHash->IsDuplicate(hashstring))
              {
                cout << hashstring << endl;
                accept = true;
                if( hasFunction )
                {
                  // an undefined filter result counts as rejection
                  ArgVectorPointer funargs = qp->Argument(args[4].addr);
                  (*funargs)[0] = SetWord(checkUrl);
                  Word funresult;
                  qp->Request(args[4].addr, funresult);
                  accept = ((Attribute*)funresult.addr)->IsDefined()
                        && ((CcBool*)funresult.addr)->GetBoolval();
                }
              }
            }
            if( accept )
              exturl = checkUrl;
            else
              delete checkUrl;
          }
          if( !exturl )
          {
            // all links of the top page are consumed: drop the page and
            // continue with the page below it
            delete pa->p;
            delete pa;
            pa = 0;
            getAdvance->myDepthStack->pop();
            --getAdvance->depth;
            if( !getAdvance->myDepthStack->empty() )
            {
              pa = getAdvance->myDepthStack->top();
            }
          }
        }
        if(getAdvance->isnew || exturl)
        {
          __TRACE__
          URL* u;
          if(getAdvance->isnew)
          {
            __TRACE__
            // first request: load the start url itself; the filter
            // function is not applied to it
            u = ((URL*)args[0].addr);
            string hashstring = u->getProtocol() + "://"
                          + u->getHost() + u->getPath();
            getAdvance->myHash->IsDuplicate(hashstring);
            getAdvance->isnew = false;
            exturl = new URL(*u);
            cout << *u << endl;
          }
          u = exturl;
          string type;
          bool isHtml = false;
          DateTime dt(instanttype);
          __TRACE__
          cout << "load url from web" << endl;
          string src = Page::getFromWeb(*u, type, isHtml, dt, true);
          if( !isHtml && type.find(HTML::BasicType()) != string::npos)
            isHtml = true;
          cout << "isHTML: " << isHtml << ", " << type << endl;
          if( isHtml)
          {
            __TRACE__
            // build the page and push it, so that its links are visited
            // by the following requests
            Page *p;
            PageAdvance *next = new PageAdvance();
            if( onlyhtml )
            {
              HTML h(dt, src, *u);
              p = new Page( h );
              next->numberOfEmb = 0;
            }
            else
            {
              p = new Page(*u, type, src, dt);
              next->numberOfEmb = p->numOfFiles();
            }
            if( extLinks && (isUnlimited || getAdvance->depth < depth ))
              next->numberOfLinks = p->getNumberOfUrls();
            else
              next->numberOfLinks = 0;
            cout << "Links: " << next->numberOfLinks << endl << endl;
            next->currentEmb = 0;
            next->currentLink = 0;
            next->p = p;
            ++getAdvance->depth;
            getAdvance->myDepthStack->push(next);
            // the tuple receives the url object itself and a copy of the
            // page (or of its html part)
            Tuple *elem = new Tuple( getAdvance->resultTupleType );
            elem->PutAttribute(0,u);
            if( onlyhtml )
            {
              HTML *hh = (HTML*)p;
              elem->PutAttribute(1,new HTML(*hh));
            }
            else
              elem->PutAttribute(1,new Page(*p));
            result.addr = elem;
            return YIELD;
          }
          else
          {
            // not an HTML document: nothing is delivered for this url
            pa = NULL;
            delete exturl;
            exturl = NULL;
          }
        }
      }
      return CANCEL;
    }
    case CLOSE:
    __TRACE__
    {
      getAdvance = ((GetAdvance*) local.addr);
      if( getAdvance )
      {
        delete getAdvance->myHash;
        delete getAdvance->host;
        while( !getAdvance->myDepthStack->empty())
        {
          PageAdvance *pa = getAdvance->myDepthStack->top();
          if( pa->p)
            pa->p->DeleteIfAllowed();
          delete pa;
          getAdvance->myDepthStack->pop();
        }
        delete getAdvance->myDepthStack;
        getAdvance->resultTupleType->DeleteIfAllowed();
        delete getAdvance;
        // prevent a second CLOSE from releasing the state again
        local.addr = 0;
      }
      return 0;
    }
  }
  // should not happen
  __TRACE__
  return -1;
}
// Value mapping of ~pageget~ without filter function: page tuples.
int
ISWebPagegetFourParam (Word* args, Word& result, int message,
Word& local, Supplier s)
{
  const bool withFilterFunction = false;
  const bool htmlOnly = false;
  return pagegetFun(args, result, message, local, s,
                    withFilterFunction, htmlOnly);
}
// Value mapping of ~pageget~ with filter function: page tuples.
int
ISWebPagegetFiveParam (Word* args, Word& result, int message,
Word& local, Supplier s)
{
  const bool withFilterFunction = true;
  const bool htmlOnly = false;
  return pagegetFun(args, result, message, local, s,
                    withFilterFunction, htmlOnly);
}
// Value mapping of ~htmlget~ without filter function: html tuples.
int
ISWebHtmlgetFourParam (Word* args, Word& result, int message,
Word& local, Supplier s)
{
  const bool withFilterFunction = false;
  const bool htmlOnly = true;
  return pagegetFun(args, result, message, local, s,
                    withFilterFunction, htmlOnly);
}
// Value mapping of ~htmlget~ with filter function: html tuples.
int
ISWebHtmlgetFiveParam (Word* args, Word& result, int message,
Word& local, Supplier s)
{
  const bool withFilterFunction = true;
  const bool htmlOnly = true;
  return pagegetFun(args, result, message, local, s,
                    withFilterFunction, htmlOnly);
}
/*
6.2.19 Selection functions for Operator ~webequal~
*/
// Selection function of ~webequal~: yields 0 for (url, url), 1 for
// (html, html), 2 for (page, page) and -1 for anything else.
int webequalSelect( ListExpr args)
{
  const ListExpr lhs = nl->First( args);
  const ListExpr rhs = nl->Second( args);
  int index = -1; // this value should never be returned
  if( nl->IsEqual(lhs, URL::BasicType())
      && nl->IsEqual(rhs, URL::BasicType()) )
  {
    index = 0;
  }
  else if( nl->IsEqual(lhs, HTML::BasicType())
           && nl->IsEqual(rhs, HTML::BasicType()) )
  {
    index = 1;
  }
  else if( nl->IsEqual(lhs, Page::BasicType())
           && nl->IsEqual(rhs, Page::BasicType()) )
  {
    index = 2;
  }
  return index;
}
/*
6.2.20 Value Mapping Functions for Operators ~webequal~
*/
// Value mapping of ~webequal~ for two url arguments: stores the outcome
// of operator== as a defined bool in the result storage.
int
ISWebequalUrlFun (Word* args, Word& result, int message,
Word& local, Supplier s)
{
  __TRACE__
  // the query processor provides the result instance to be filled
  result = qp->ResultStorage(s);
  CcBool* outcome = (CcBool*)result.addr;
  URL* lhs = (URL*)args[0].addr;
  URL* rhs = (URL*)args[1].addr;
  outcome->Set(true, *lhs == *rhs);
  return 0;
}
// Value mapping of ~webequal~ for two html arguments: stores the outcome
// of operator== as a defined bool in the result storage.
int
ISWebequalHtmlFun (Word* args, Word& result, int message,
Word& local, Supplier s)
{
  __TRACE__
  // the query processor provides the result instance to be filled
  result = qp->ResultStorage(s);
  CcBool* outcome = (CcBool*)result.addr;
  HTML* lhs = (HTML*)args[0].addr;
  HTML* rhs = (HTML*)args[1].addr;
  outcome->Set(true, *lhs == *rhs);
  return 0;
}
// Value mapping of ~webequal~ for two page arguments: stores the outcome
// of operator== as a defined bool in the result storage.
int
ISWebequalPageFun (Word* args, Word& result, int message,
Word& local, Supplier s)
{
  __TRACE__
  // the query processor provides the result instance to be filled
  result = qp->ResultStorage(s);
  CcBool* outcome = (CcBool*)result.addr;
  Page* lhs = (Page*)args[0].addr;
  Page* rhs = (Page*)args[1].addr;
  outcome->Set(true, *lhs == *rhs);
  return 0;
}
/*
6.2.21 Value Mapping Array for Operators ~webequal, wget, pageget,htmlget~
*/
// Value mapping arrays of the overloaded operators.
ValueMapping webequalMap[] = {
  ISWebequalUrlFun,   // url x url
  ISWebequalHtmlFun,  // html x html
  ISWebequalPageFun   // page x page
};
ValueMapping webwgetMap[] = {
  ISWebWgetFourParam, // without filter function
  ISWebWgetFiveParam  // with filter function
};
ValueMapping webpagegetMap[] = {
  ISWebPagegetFourParam, // without filter function
  ISWebPagegetFiveParam  // with filter function
};
ValueMapping webhtmlgetMap[] = {
  ISWebHtmlgetFourParam, // without filter function
  ISWebHtmlgetFiveParam  // with filter function
};
/*
6.3 Specifications
6.3.1 Specification of Operator ~protocol~
*/
// Specification of ~protocol~: signature, syntax, meaning and example.
const string protocolSpec =
  "( ( \"Signature\" \"Syntax\" \"Meaning\" \"Example\" ) ( "
  "<text>(url) -> text</text--->"
  "<text>protocol( url )</text--->"
  "<text>Returns the protocol of the url</text--->"
  "<text>protocol( url1 )</text--->"
  ") )";
/*
6.3.2 Specification of Operator ~host~
*/
// Specification of ~host~: signature, syntax, meaning and example.
const string hostSpec =
  "( ( \"Signature\" \"Syntax\" \"Meaning\" \"Example\" ) ( "
  "<text>(url) -> text</text--->"
  "<text>host( url )</text--->"
  "<text>Returns the host of the url</text--->"
  "<text>host( url1 )</text--->"
  ") )";
/*
6.3.3 Specification of Operator ~filename~
*/
// Specification of the operator registered under the name "webfilename".
// Syntax and example use that registered name, so that the documented
// call matches the operator that actually exists.
const string filenameSpec = "( ( \"Signature\" \"Syntax\" \"Meaning\" "
"\"Example\" ) "
"( <text>(url) -> text</text--->"
"<text>webfilename( url )</text--->"
"<text>Returns the filename with path</text--->"
"<text>webfilename( url1 )</text--->"
") )";
/*
6.3.4 Specification of Operator ~source~
*/
// Specification of ~source~: signature, syntax, meaning and example.
const string sourceSpec =
  "( ( \"Signature\" \"Syntax\" \"Meaning\" \"Example\" ) ( "
  "<text>(html or page) -> url</text--->"
  "<text>source( html/page )</text--->"
  "<text>Returns the url of the html/page</text--->"
  "<text>source( html1 )</text--->"
  ") )";
/*
6.3.5 Specification of Operator ~createurl~
*/
// Specification of ~createurl~: signature, syntax, meaning and example.
const string createurlSpec =
  "( ( \"Signature\" \"Syntax\" \"Meaning\" \"Example\" ) ( "
  "<text>(text) -> url</text--->"
  "<text>createurl( text )</text--->"
  "<text>Creates an url of the given text</text--->"
  "<text>createurl(text.../text--- )</text--->"
  ") )";
/*
6.3.6 Specification of Operator ~content~
*/
// Specification of ~content~: signature, syntax, meaning and example.
const string contentSpec =
  "( ( \"Signature\" \"Syntax\" \"Meaning\" \"Example\" ) ( "
  "<text>(html) -> text</text--->"
  "<text>content( html )</text--->"
  "<text>Returns the content without tags</text--->"
  "<text>content(html1)</text--->"
  ") )";
/*
6.3.7 Specification of Operator ~urls~
*/
// Specification of ~urls~: signature, syntax, meaning and example.
const string urlsSpec =
  "( ( \"Signature\" \"Syntax\" \"Meaning\" \"Example\" ) ( "
  "<text>(html or page) -> stream(url)</text--->"
  "<text>urls( html/page )</text--->"
  "<text>Returns all urls of the given object</text--->"
  "<text>urls(html1)</text--->"
  ") )";
/*
6.3.8 Specification of Operator ~containsurl~
*/
// Specification of ~containsurl~: signature, syntax, meaning, example.
const string containsurlSpec =
  "( ( \"Signature\" \"Syntax\" \"Meaning\" \"Example\" ) ( "
  "<text>(html or page x url) -> bool</text--->"
  "<text>containsurl( html/page, url )</text--->"
  "<text>Checks if the given html contains the given url</text--->"
  "<text>containsurl(html1,url1)</text--->"
  ") )";
/*
6.3.9 Specification of Operator ~lastmodified~
*/
// Specification of ~lastmodified~: signature, syntax, meaning, example.
const string lastmodifiedSpec =
  "( ( \"Signature\" \"Syntax\" \"Meaning\" \"Example\" ) ( "
  "<text>(html) -> instant</text--->"
  "<text>lastmodified( html )</text--->"
  "<text>Returns the last modified date of the given html</text--->"
  "<text>lastmodified(html1)</text--->"
  ") )";
/*
6.3.10 Specification of Operator ~metainfo~
*/
// Specification of ~metainfo~: signature, syntax, meaning and example.
const string metainfoSpec =
  "( ( \"Signature\" \"Syntax\" \"Meaning\" \"Example\" ) ( "
  "<text>(html x string) -> text</text--->"
  "<text>metainfo( html, string )</text--->"
  "<text>Returns the metainfo for the key or an empty string</text--->"
  "<text>metainfo(html1, \"content\")</text--->"
  ") )";
/*
6.3.11 Specification of Operator ~metainfos~
*/
// Specification of ~metainfos~: signature, syntax, meaning and example.
const string metainfosSpec =
  "( ( \"Signature\" \"Syntax\" \"Meaning\" \"Example\" ) ( "
  "<text>(html) -> stream(tuple([Key:string,Content:text]))</text--->"
  "<text>metainfos( html )</text--->"
  "<text>Returns all metainfos of the given html with key</text--->"
  "<text>metainfos(html1)</text--->"
  ") )";
/*
6.3.12 Specification of Operator ~numberof~
*/
// Specification of ~numberof~: signature, syntax, meaning and example.
const string numberofSpec =
  "( ( \"Signature\" \"Syntax\" \"Meaning\" \"Example\" ) ( "
  "<text>(html x string)-> int</text--->"
  "<text>numberof( html, string )</text--->"
  "<text>counts the given string in the html</text--->"
  "<text>numberof(html1,\"test\")</text--->"
  ") )";
/*
6.3.13 Specification of Operator ~similar~
*/
// Specification of ~similar~: signature, syntax, meaning and example.
const string similarSpec =
  "( ( \"Signature\" \"Syntax\" \"Meaning\" \"Example\" ) ( "
  "<text>(html x html x int x bool) -> real</text--->"
  "<text>similar( html,html,depth,follow order )</text--->"
  "<text>calc.how similar the two htmls are to the given depth</text--->"
  "<text>similar(html1,html2,0,true)</text--->"
  ") )";
/*
6.3.14 Specification of Operator ~extracthtml~
*/
// Specification of ~extracthtml~: signature, syntax, meaning, example.
const string extracthtmlSpec =
  "( ( \"Signature\" \"Syntax\" \"Meaning\" \"Example\" ) ( "
  "<text>page -> html</text--->"
  "<text>extracthtml( page )</text--->"
  "<text>returns the html file of the given page</text--->"
  "<text>extracthtml(page1)</text--->"
  ") )";
/*
6.3.15 Specification of Operator ~numoffiles~
*/
// Specification of ~numoffiles~: signature, syntax, meaning, example.
const string numoffilesSpec =
  "( ( \"Signature\" \"Syntax\" \"Meaning\" \"Example\" ) ( "
  "<text>page -> int</text--->"
  "<text>numoffiles( page )</text--->"
  "<text>returns the number of the embedded objects</text--->"
  "<text>numoffiles(page1)</text--->"
  ") )";
/*
6.3.16 Specification of Operator ~getfiles~
*/
// Specification of ~getfiles~: signature, syntax, meaning and example.
const string getfilesSpec =
  "( ( \"Signature\" \"Syntax\" \"Meaning\" \"Example\" ) ( "
  "<text>page -> stream(tuple([Source:url, Type:string, "
  "File:binfile]))</text--->"
  "<text>getfiles( page1 )</text--->"
  "<text>returns a stream of tuples with all embedded files</text--->"
  "<text>getfiles(page1)</text--->"
  ") )";
/*
6.3.17 Specification of Operator ~wget~
*/
// Specification of ~wget~: signature, syntax, meaning and example.
const string wgetSpec =
  "( ( \"Signature\" \"Syntax\" \"Meaning\" \"Example\" ) ( "
  "<text>(url x bool x int x text x map:url->bool) -> "
  "stream(tuple([Source:url, Type:string, File:binfile]))</text--->"
  "<text>wget( url,extLinks,depth,hosts[,filterFkt] )</text--->"
  "<text>loads the given url and dependent files to depth d</text--->"
  "<text>wget(url1,TRUE,2, <text...</text...,\n"
  "fun(u:url) host(u) contains \"www\") consume</text--->"
  ") )";
/*
6.3.18 Specification of Operator ~pageget~
*/
// Specification of ~pageget~: signature, syntax, meaning and example.
const string pagegetSpec =
  "( ( \"Signature\" \"Syntax\" \"Meaning\" \"Example\" ) ( "
  "<text>(url x bool x int x text x map:url->bool) -> "
  "stream(tuple([Source:url, Page:page]))</text--->"
  "<text>pageget( url,extLinks,depth,hosts[,filterFkt] )</text--->"
  "<text>loads the given html-url and dependent html pages</text--->"
  "<text>pageget(url1,TRUE,2, <text...</text...) consume</text--->"
  ") )";
/*
6.3.19 Specification of Operator ~htmlget~
*/
// Specification of ~htmlget~: signature, syntax, meaning and example.
const string htmlgetSpec =
  "( ( \"Signature\" \"Syntax\" \"Meaning\" \"Example\" ) ( "
  "<text>(url x bool x int x text x map:url->bool) -> "
  "stream(tuple([Source:url, Html:html]))</text--->"
  "<text>htmlget( url,extLinks,depth,hosts[,filterFkt] )</text--->"
  "<text>loads the given html-url and dependent html pages</text--->"
  "<text>htmlget(url1,TRUE,2, <text...</text...) consume</text--->"
  ") )";
/*
6.3.20 Specification of Operator ~webequal~
*/
// Specification of ~webequal~. The operator compares two arguments of
// the same type (url, html or page) and delivers a bool, so the
// signature names both arguments and the bool result.
const string webequalSpec = "( ( \"Signature\" \"Syntax\" \"Meaning\" "
"\"Example\" ) "
"( <text>(t x t) -> bool, t element of {url,html,page}</text--->"
"<text>webequal( html1, html )</text--->"
"<text>returns true if the params equal else false</text--->"
"<text>webequal(html1, html2)</text--->"
") )";
/*
6.4 Definition of Operators
6.4.1 Definition of Operator ~protocol~
*/
// Operator "protocol": one value mapping, chosen by SimpleSelect.
Operator webprotocol( "protocol", protocolSpec, protocolFun,
                      Operator::SimpleSelect,
                      protocolHostFilenameTypeMap );
/*
6.4.2 Definition of Operator ~host~
*/
// Operator "host": one value mapping, chosen by SimpleSelect.
Operator webhost( "host", hostSpec, hostFun,
                  Operator::SimpleSelect,
                  protocolHostFilenameTypeMap );
/*
6.4.3 Definition of Operator ~filename~
*/
// Operator "webfilename": one value mapping, chosen by SimpleSelect.
Operator webfilename( "webfilename", filenameSpec, filenameFun,
                      Operator::SimpleSelect,
                      protocolHostFilenameTypeMap );
/*
6.4.4 Definition of Operator ~source~
*/
// Operator "source": one value mapping, chosen by SimpleSelect.
Operator websource( "source", sourceSpec, sourceFun,
                    Operator::SimpleSelect,
                    sourceTypeMap );
/*
6.4.5 Definition of Operator ~createurl~
*/
// Operator "createurl": one value mapping, chosen by SimpleSelect.
Operator webcreateurl( "createurl", createurlSpec, createurlFun,
                       Operator::SimpleSelect,
                       createurlTypeMap );
/*
6.4.6 Definition of Operator ~content~
*/
// Operator "content": one value mapping, chosen by SimpleSelect.
Operator webcontent( "content", contentSpec, contentFun,
                     Operator::SimpleSelect,
                     contentTypeMap );
/*
6.4.7 Definition of Operator ~urls~
*/
// Operator "urls": one value mapping, chosen by SimpleSelect.
Operator weburls( "urls", urlsSpec, urlsFun,
                  Operator::SimpleSelect,
                  urlsTypeMap );
/*
6.4.8 Definition of Operator ~containsurl~
*/
// Operator "containsurl": one value mapping, chosen by SimpleSelect.
Operator webcontainsurl( "containsurl", containsurlSpec, containsurlFun,
                         Operator::SimpleSelect,
                         containsurlTypeMap );
/*
6.4.9 Definition of Operator ~lastmodified~
*/
// Operator "lastmodified": one value mapping, chosen by SimpleSelect.
Operator weblastmodified( "lastmodified", lastmodifiedSpec,
                          lastmodifiedFun,
                          Operator::SimpleSelect,
                          lastmodifiedTypeMap );
/*
6.4.10 Definition of Operator ~metainfo~
*/
// Operator "metainfo": one value mapping, chosen by SimpleSelect.
Operator webmetainfo( "metainfo", metainfoSpec, metainfoFun,
                      Operator::SimpleSelect,
                      metainfoTypeMap );
/*
6.4.11 Definition of Operator ~metainfos~
*/
// Operator "metainfos": one value mapping, chosen by SimpleSelect.
Operator webmetainfos( "metainfos", metainfosSpec, metainfosFun,
                       Operator::SimpleSelect,
                       metainfosTypeMap );
/*
6.4.12 Definition of Operator ~numberof~
*/
// Operator "numberof": one value mapping, chosen by SimpleSelect.
Operator webnumberof( "numberof", numberofSpec, numberofFun,
                      Operator::SimpleSelect,
                      numberofTypeMap );
/*
6.4.13 Definition of Operator ~similar~
*/
// Operator "similar": one value mapping, chosen by SimpleSelect.
Operator websimilar( "similar", similarSpec, similarFun,
                     Operator::SimpleSelect,
                     similarTypeMap );
/*
6.4.14 Definition of Operator ~extracthtml~
*/
// Operator "extracthtml": one value mapping, chosen by SimpleSelect.
Operator webextracthtml( "extracthtml", extracthtmlSpec, extracthtmlFun,
                         Operator::SimpleSelect,
                         extracthtmlTypeMap );
/*
6.4.15 Definition of Operator ~numoffiles~
*/
// Operator "numoffiles": one value mapping, chosen by SimpleSelect.
Operator webnumoffiles( "numoffiles", numoffilesSpec, numoffilesFun,
                        Operator::SimpleSelect,
                        numoffilesTypeMap );
/*
6.4.16 Definition of Operator ~getfiles~
*/
// Operator "getfiles": one value mapping, chosen by SimpleSelect.
Operator webgetfiles( "getfiles", getfilesSpec, getfilesFun,
                      Operator::SimpleSelect,
                      getfilesTypeMap );
/*
6.4.17 Definition of Operator ~wget~
*/
// Operator "wget": overloaded with the two value mappings of webwgetMap;
// webwget_pagegetSelect picks the one to use.
Operator webwget( "wget", wgetSpec,
                  2, webwgetMap,
                  webwget_pagegetSelect,
                  wgetTypeMap );
/*
6.4.18 Definition of Operator ~pageget~
*/
// Operator "pageget": overloaded with the two value mappings of
// webpagegetMap; webwget_pagegetSelect picks the one to use.
Operator webpageget( "pageget", pagegetSpec,
                     2, webpagegetMap,
                     webwget_pagegetSelect,
                     pagegetTypeMap );
/*
6.4.19 Definition of Operator ~htmlget~
*/
// Operator "htmlget": overloaded with the two value mappings of
// webhtmlgetMap; webwget_pagegetSelect picks the one to use.
Operator webhtmlget( "htmlget", htmlgetSpec,
                     2, webhtmlgetMap,
                     webwget_pagegetSelect,
                     htmlgetTypeMap );
/*
6.4.20 Definition of Operator ~webequal~
*/
// Operator "webequal": overloaded with the three value mappings of
// webequalMap; webequalSelect picks the one to use.
Operator webequal( "webequal", webequalSpec,
                   3, webequalMap,
                   webequalSelect,
                   webequalTypeMap );
/*
7. Algebra
*/
class WebAlgebra : public Algebra
{
public:
WebAlgebra() : Algebra()
{
AddTypeConstructor( &url );
url.AssociateKind(Kind::DATA());
AddTypeConstructor( &html );
html.AssociateKind(Kind::DATA());
AddTypeConstructor( &page );
page.AssociateKind(Kind::DATA());
AddOperator( &webprotocol );
AddOperator( &webhost );
AddOperator( &webfilename );
AddOperator( &websource );
AddOperator( &webcreateurl );
AddOperator( &webcontent );
AddOperator( &weburls );
AddOperator( &webcontainsurl );
AddOperator( &weblastmodified );
AddOperator( &webmetainfo );
AddOperator( &webmetainfos );
AddOperator( &webnumberof );
AddOperator( &websimilar );
AddOperator( &webextracthtml );
AddOperator( &webnumoffiles );
AddOperator( &webgetfiles );
AddOperator( &webwget );
AddOperator( &webpageget );
AddOperator( &webhtmlget );
AddOperator( &webequal );
}
~WebAlgebra() {};
};
/*
8. Initialization
Each algebra module needs an initialization function. The algebra manager
has a reference to this function if this algebra is included in the list
of required algebras, thus forcing the linker to include this module.
The algebra manager invokes this function to get a reference to the instance
of the algebra class and to provide references to the global nested list
container (used to store constructor, type, operator and object information)
and to the query processor.
The function has a C interface to make it possible to load the algebra
dynamically at runtime.
*/
// Initialization function of the algebra module (C linkage): stores the
// given nested list container and query processor in the globals nl and
// qp and returns a newly created WebAlgebra instance.
extern "C"
Algebra*
InitializeWebAlgebra( NestedList* nlRef, QueryProcessor* qpRef )
{
  qp = qpRef;
  nl = nlRef;
  Algebra* webAlgebra = new WebAlgebra();
  return webAlgebra;
}