202 lines
6.4 KiB
Plaintext
202 lines
6.4 KiB
Plaintext
|
|
# Script to create a distributed property graph for the dblp dataset.
|
||
|
|
|
||
|
|
# Get the dblp data
|
||
|
|
|
||
|
|
# Within a directory, say /home/ralf/Daten, execute:
|
||
|
|
|
||
|
|
# wget https://dblp.uni-trier.de/xml/dblp.xml.gz
|
||
|
|
# gunzip dblp.xml.gz
|
||
|
|
|
||
|
|
|
||
|
|
# (Start monitors)
|
||
|
|
|
||
|
|
# Set up a database
|
||
|
|
|
||
|
|
# Create the database dblpgraph0620 and then open it; the commands that
# follow in this script are issued after the database has been opened.
create database dblpgraph0620;
|
||
|
|
|
||
|
|
open database dblpgraph0620;
|
||
|
|
|
||
|
|
# Apply dblpimport to the unzipped dblp.xml and store the result as
# Document_raw. The path must match the directory in which dblp.xml was
# unpacked (see the wget/gunzip steps at the top of this script).
# The bracketed argument is the path of a stop-word file
# (presumably used while extracting keywords - TODO confirm).
let Document_raw = '/home/ralf/Daten/dblp.xml' dblpimport['/home/ralf/Daten/Stopwords.txt']
|
||
|
|
|
||
|
|
# Sample used for the small template graph on the master: head[1000] limits
# the stream from Document_raw to 1000 tuples, and addcounter adds the
# attribute Docid (second argument 1 - presumably the start value, confirm).
let Document_raw2 = Document_raw feed head[1000] addcounter[Docid, 1] consume
|
||
|
|
|
||
|
|
# create small template graph on master
|
||
|
|
|
||
|
|
# Command group (commands separated by "|") that derives the relations of
# the template graph from Document_raw2:
#   - Document: Document_raw2 without the attributes AuthorsList and Keywords.
#   - Keyword, Author, Journals, Conference, Publisher: duplicate-free value
#     lists (rduph), each with its own counter attribute (Wordid, Authorid,
#     Journalid, Conferenceid, Publisherid).
#   - HAS_KEYWORD, WROTE, AT_JOURNAL, AT_CONF, PUBLISHED_BY: documents joined
#     (itHashJoin) with the matching relation above, keeping Docid together
#     with the respective counter id.
# The same sequence of commands is repeated further down for the workers,
# reading from DocumentB2 instead of Document_raw2; keep both copies in sync.
{
|
||
|
|
let Document = Document_raw2 feed remove[AuthorsList, Keywords] consume
|
||
|
|
| let Keyword = Document_raw2 feed project[Keywords] unnest[Keywords] rduph[]
|
||
|
|
renameAttr[Word: Keyword]
|
||
|
|
addcounter[Wordid, 1] consume
|
||
|
|
| let HAS_KEYWORD = Document_raw2 feed project[Docid, Keywords] unnest[Keywords] {d}
|
||
|
|
Keyword feed itHashJoin[Keyword_d, Word]
|
||
|
|
projectextend[Wordid; Docid: .Docid_d]
|
||
|
|
project[Docid, Wordid]
|
||
|
|
consume
|
||
|
|
| let Author = Document_raw2 feed project[AuthorsList] unnest[AuthorsList] rduph[]
|
||
|
|
addcounter[Authorid, 1] consume
|
||
|
|
| let WROTE = Document_raw2 feed project[Docid, AuthorsList] unnest[AuthorsList] {d}
|
||
|
|
Author feed itHashJoin[Name_d, Name]
|
||
|
|
projectextend[Authorid; Docid: .Docid_d]
|
||
|
|
consume
|
||
|
|
| let Journals = Document_raw2 feed project[Journal] filter[.Journal # ""]
|
||
|
|
renameAttr[Name: Journal]
|
||
|
|
rduph[]
|
||
|
|
addcounter[Journalid, 1] consume
|
||
|
|
| let AT_JOURNAL = Document_raw2 feed filter[.Type = "article"] {d} project[Docid_d, Journal_d]
|
||
|
|
Journals feed itHashJoin[Journal_d, Name]
|
||
|
|
renameAttr[Docid: Docid_d]
|
||
|
|
project[Docid, Journalid]
|
||
|
|
consume
|
||
|
|
| let Conference = Document_raw2 feed filter[.Type = "inproceedings"] project[Booktitle]
|
||
|
|
renameAttr[Name: Booktitle]
|
||
|
|
rduph[]
|
||
|
|
addcounter[Conferenceid, 1] consume
|
||
|
|
| let AT_CONF = Document_raw2 feed {d} project[Docid_d, Booktitle_d]
|
||
|
|
Conference feed itHashJoin[Booktitle_d, Name]
|
||
|
|
renameAttr[Docid: Docid_d]
|
||
|
|
project[Docid, Conferenceid]
|
||
|
|
consume
|
||
|
|
| let Publisher = Document_raw2 feed filter[isdefined(.Publisher)] project[Publisher]
|
||
|
|
renameAttr[Name: Publisher]
|
||
|
|
rduph[]
|
||
|
|
addcounter[Publisherid, 1] consume
|
||
|
|
| let PUBLISHED_BY = Document_raw2 feed {d} project[Docid_d, Publisher_d]
|
||
|
|
Publisher feed itHashJoin[Publisher_d, Name]
|
||
|
|
renameAttr[Docid: Docid_d]
|
||
|
|
project[Docid, Publisherid]
|
||
|
|
consume
|
||
|
|
}
|
||
|
|
|
||
|
|
|
||
|
|
# Command group that defines the property graph object dblp:
#   - createpgraph("dblp") creates it; two cfg options (log, dotquery) are set.
#   - addnodesrel registers Document, Author, Keyword, Journals, Conference
#     and Publisher, each with the name of its id attribute.
#   - addedgesrel registers AT_CONF, AT_JOURNAL, HAS_KEYWORD, PUBLISHED_BY and
#     WROTE, mapping each id attribute of the edge relation to an attribute
#     of a registered node relation.
#     NOTE(review): WROTE lists Authorid first and Docid second, the other
#     edges list Docid first - presumably the order gives the edge direction
#     (Author -> Document); confirm.
#   - addindex is called for Author.Name, Keyword.Word, Journals.Name and
#     Conference.Name (no index is added for Publisher).
#   - "query dblp info" is the last command of the group.
{
|
||
|
|
let dblp =createpgraph("dblp")
|
||
|
|
|
|
||
|
|
query dblp cfg["log","10"]|
|
||
|
|
query dblp cfg["dotquery","1"]
|
||
|
|
|
|
||
|
|
query dblp addnodesrel["Document", "Docid"]|
|
||
|
|
query dblp addnodesrel["Author", "Authorid"]|
|
||
|
|
query dblp addnodesrel["Keyword", "Wordid"]|
|
||
|
|
query dblp addnodesrel["Journals", "Journalid"]|
|
||
|
|
query dblp addnodesrel["Conference", "Conferenceid"]|
|
||
|
|
query dblp addnodesrel["Publisher", "Publisherid"]
|
||
|
|
|
|
||
|
|
query dblp addedgesrel["AT_CONF", "Docid=Document.Docid", "Conferenceid=Conference.Conferenceid"] |
|
||
|
|
query dblp addedgesrel["AT_JOURNAL", "Docid=Document.Docid", "Journalid=Journals.Journalid"] |
|
||
|
|
query dblp addedgesrel["HAS_KEYWORD", "Docid=Document.Docid", "Wordid=Keyword.Wordid"] |
|
||
|
|
query dblp addedgesrel["PUBLISHED_BY", "Docid=Document.Docid", "Publisherid=Publisher.Publisherid"] |
|
||
|
|
query dblp addedgesrel["WROTE","Authorid=Author.Authorid","Docid=Document.Docid"]
|
||
|
|
|
|
||
|
|
query dblp addindex["Author","Name"] |
|
||
|
|
query dblp addindex["Keyword","Word"] |
|
||
|
|
query dblp addindex["Journals","Name"] |
|
||
|
|
query dblp addindex["Conference","Name"]
|
||
|
|
|
|
||
|
|
query dblp info
|
||
|
|
}
|
||
|
|
|
||
|
|
|
||
|
|
# create distributed database
|
||
|
|
|
||
|
|
# Restore the object Workers from WorkersNewton. The name WorkersNewton is
# site-specific (presumably a file describing the worker instances of one
# particular cluster - adapt it to your installation).
restore Workers from WorkersNewton
|
||
|
|
|
||
|
|
# NWorkers = number of tuples in Workers; used below as the size argument of
# ddistribute2 and createintdarray.
let NWorkers = Workers count
|
||
|
|
|
||
|
|
# Share the graph object dblp, passing the Workers relation as target list.
# NOTE(review): the meaning of the TRUE flag is not visible here
# (presumably "overwrite if it already exists") - confirm.
query share("dblp", TRUE, Workers)
|
||
|
|
|
||
|
|
# Distribute data to the workers by Docid. Then convert darray to a simple darray (sdarray). In a simple darray, all fields on workers have the same names, for example, "Document" instead of "Document_28".
|
||
|
|
|
||
|
|
# Unlike the template graph, this uses the complete Document_raw (no head[]).
# Docid is added by addcounter and is also the attribute passed to
# ddistribute2 together with NWorkers and Workers.
let DocumentB1 = Document_raw feed addcounter[Docid, 1] ddistribute2["DocumentB1", Docid, NWorkers, Workers]
|
||
|
|
|
||
|
|
# Convert to the simple darray DocumentB2.
# NOTE(review): the meaning of the FALSE flag is not visible here - confirm.
let DocumentB2 = DocumentB1 makeSimple[FALSE, "DocumentB2"];
|
||
|
|
# DocumentB1 was only the intermediate result; delete it on the master.
delete DocumentB1
|
||
|
|
|
||
|
|
|
||
|
|
# Create relations
|
||
|
|
|
||
|
|
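# In such a command, do not use single quotes: the whole command text is itself enclosed in single quotes. Double quotes (as in "article") are fine.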
|
||
|
|
|
||
|
|
# Pass one command group to dcommand on DocumentB2. The quoted text repeats
# the relation-building commands of the master's template group, with
# DocumentB2 in place of Document_raw2, so each worker derives Document,
# Keyword, Author, Journals, Conference, Publisher and the five edge relations
# from its own part of the data (assumes dcommand runs the text on the workers
# holding DocumentB2 - confirm). Counter ids such as Wordid or Authorid are
# therefore assigned independently on every worker.
query DocumentB2 dcommand['{
|
||
|
|
let Document = DocumentB2 feed remove[AuthorsList, Keywords] consume
|
||
|
|
| let Keyword = DocumentB2 feed project[Keywords] unnest[Keywords] rduph[]
|
||
|
|
renameAttr[Word: Keyword]
|
||
|
|
addcounter[Wordid, 1] consume
|
||
|
|
| let HAS_KEYWORD = DocumentB2 feed project[Docid, Keywords] unnest[Keywords] {d}
|
||
|
|
Keyword feed itHashJoin[Keyword_d, Word]
|
||
|
|
projectextend[Wordid; Docid: .Docid_d]
|
||
|
|
project[Docid, Wordid]
|
||
|
|
consume
|
||
|
|
| let Author = DocumentB2 feed project[AuthorsList] unnest[AuthorsList] rduph[]
|
||
|
|
addcounter[Authorid, 1] consume
|
||
|
|
| let WROTE = DocumentB2 feed project[Docid, AuthorsList] unnest[AuthorsList] {d}
|
||
|
|
Author feed itHashJoin[Name_d, Name]
|
||
|
|
projectextend[Authorid; Docid: .Docid_d]
|
||
|
|
consume
|
||
|
|
| let Journals = DocumentB2 feed project[Journal] filter[.Journal # ""]
|
||
|
|
renameAttr[Name: Journal]
|
||
|
|
rduph[]
|
||
|
|
addcounter[Journalid, 1] consume
|
||
|
|
| let AT_JOURNAL = DocumentB2 feed filter[.Type = "article"] {d} project[Docid_d, Journal_d]
|
||
|
|
Journals feed itHashJoin[Journal_d, Name]
|
||
|
|
renameAttr[Docid: Docid_d]
|
||
|
|
project[Docid, Journalid]
|
||
|
|
consume
|
||
|
|
| let Conference = DocumentB2 feed filter[.Type = "inproceedings"] project[Booktitle]
|
||
|
|
renameAttr[Name: Booktitle]
|
||
|
|
rduph[]
|
||
|
|
addcounter[Conferenceid, 1] consume
|
||
|
|
| let AT_CONF = DocumentB2 feed {d} project[Docid_d, Booktitle_d]
|
||
|
|
Conference feed itHashJoin[Booktitle_d, Name]
|
||
|
|
renameAttr[Docid: Docid_d]
|
||
|
|
project[Docid, Conferenceid]
|
||
|
|
consume
|
||
|
|
| let Publisher = DocumentB2 feed filter[isdefined(.Publisher)] project[Publisher]
|
||
|
|
renameAttr[Name: Publisher]
|
||
|
|
rduph[]
|
||
|
|
addcounter[Publisherid, 1] consume
|
||
|
|
| let PUBLISHED_BY = DocumentB2 feed {d} project[Docid_d, Publisher_d]
|
||
|
|
Publisher feed itHashJoin[Publisher_d, Name]
|
||
|
|
renameAttr[Docid: Docid_d]
|
||
|
|
project[Docid, Publisherid]
|
||
|
|
consume
|
||
|
|
}']
|
||
|
|
consume
|
||
|
|
|
||
|
|
|
||
|
|
|
||
|
|
# Control is created by createintdarray from Workers and NWorkers. In this
# script it is only used as the target of dcommand (memory setup, loadgraph)
# and dmap (distributed example query) below.
let Control = createintdarray("Control", Workers, NWorkers)
|
||
|
|
|
||
|
|
# Before querying, the graph must be loaded into memory on master and workers
|
||
|
|
|
||
|
|
# Load the graph on the master.
query dblp loadgraph
|
||
|
|
|
||
|
|
# The next three commands are sent through Control with dcommand; each one
# keeps the result tuples whose Ok attribute is true and counts them, so the
# printed number can be compared with NWorkers to spot failures.
# Step 1: memclear() on the workers.
query Control dcommand['query memclear()'] filter[.Ok] count;
|
||
|
|
# Step 2: meminit(3600) on the workers
# (presumably a memory limit in MB - TODO confirm the unit).
query Control dcommand['query meminit(3600)'] filter[.Ok] count;
|
||
|
|
# Step 3: load the graph dblp on the workers.
query Control dcommand['query dblp loadgraph'] filter[.Ok] count
|
||
|
|
|
||
|
|
# 59 seconds
|
||
|
|
|
||
|
|
# Example query
|
||
|
|
|
||
|
|
# Example on the master's graph object dblp: match documents that have a
# HAS_KEYWORD edge to a node whose Word is "Indoor" and return the Authors,
# Title and Year attributes of each matching document.
query dblp match3['
|
||
|
|
MATCH
|
||
|
|
(doc)-[HAS_KEYWORD]->(w {Word: "Indoor"})
|
||
|
|
RETURN
|
||
|
|
doc.Authors, doc.Title, doc.Year
|
||
|
|
'] consume
|
||
|
|
|
||
|
|
|
||
|
|
# Distributed example: the match3 query is wrapped in dmap over Control
# (with an empty result name ""). It matches documents that have HAS_KEYWORD
# edges to both "Symbolic" and "Trajectory" and returns Authors, Title and
# Year. dsummarize followed by consume then yields the result on the master
# (presumably by concatenating the per-worker results - confirm).
query Control dmap["", dblp match3['
|
||
|
|
MATCH
|
||
|
|
(doc)-[HAS_KEYWORD]->(w {Word: "Symbolic"}), (doc)-[HAS_KEYWORD]->(w2 {Word: "Trajectory"})
|
||
|
|
RETURN
|
||
|
|
doc.Authors, doc.Title, doc.Year
|
||
|
|
']]
|
||
|
|
dsummarize consume
|
||
|
|
|
||
|
|
|
||
|
|
|
||
|
|
|
||
|
|
|
||
|
|
|
||
|
|
|