Files
secondo/bin/Scripts/CreateDistributedDblpPGraph.sec
2026-01-23 17:03:45 +08:00

202 lines
6.4 KiB
Plaintext

# Script to create a distributed property graph for the dblp dataset.
# Get the dblp data
# Within a directory, say /home/ralf/Daten, execute:
# wget https://dblp.uni-trier.de/xml/dblp.xml.gz
# gunzip dblp.xml.gz
# (Start monitors)
# Set up a database
# Every command below ends with an explicit ';' so that the script does not
# depend on blank lines to separate commands.
create database dblpgraph0620;
open database dblpgraph0620;
# Import the dblp XML file; the argument of dblpimport is the path of the
# Stopwords.txt file.
let Document_raw =
    '/home/ralf/Daten/dblp.xml' dblpimport['/home/ralf/Daten/Stopwords.txt'];
# Small sample: the first 1000 documents, numbered by a counter attribute
# Docid that starts at 1.
let Document_raw2 =
    Document_raw feed head[1000] addcounter[Docid, 1] consume;
# create small template graph on master
# Command sequence deriving all relations of the template graph from the
# sample relation Document_raw2:
#   Document     - the documents without the AuthorsList and Keywords attributes
#   Keyword      - duplicate-free keywords (attribute Word) with a counter Wordid
#   HAS_KEYWORD  - (Docid, Wordid) pairs
#   Author       - duplicate-free authors with a counter Authorid
#   WROTE        - (Authorid, Docid) pairs
#   Journals     - duplicate-free non-empty journal names with a counter Journalid
#   AT_JOURNAL   - (Docid, Journalid) pairs for documents of type "article"
#   Conference   - duplicate-free Booktitle values of "inproceedings" documents
#                  with a counter Conferenceid
#   AT_CONF      - (Docid, Conferenceid) pairs
#   Publisher    - duplicate-free defined publishers with a counter Publisherid
#   PUBLISHED_BY - (Docid, Publisherid) pairs
# The {d} renaming gives the document-side attributes the suffix _d, so that
# they can be told apart from the attributes of the joined relation.
# The closing brace is followed by ';' to terminate the whole sequence
# explicitly instead of relying on a following blank line.
{
let Document =
    Document_raw2 feed remove[AuthorsList, Keywords] consume
| let Keyword =
    Document_raw2 feed project[Keywords] unnest[Keywords] rduph[]
    renameAttr[Word: Keyword]
    addcounter[Wordid, 1]
    consume
| let HAS_KEYWORD =
    Document_raw2 feed project[Docid, Keywords] unnest[Keywords] {d}
    Keyword feed itHashJoin[Keyword_d, Word]
    projectextend[Wordid; Docid: .Docid_d]
    project[Docid, Wordid]
    consume
| let Author =
    Document_raw2 feed project[AuthorsList] unnest[AuthorsList] rduph[]
    addcounter[Authorid, 1]
    consume
| let WROTE =
    Document_raw2 feed project[Docid, AuthorsList] unnest[AuthorsList] {d}
    Author feed itHashJoin[Name_d, Name]
    projectextend[Authorid; Docid: .Docid_d]
    consume
| let Journals =
    Document_raw2 feed project[Journal] filter[.Journal # ""]
    renameAttr[Name: Journal]
    rduph[]
    addcounter[Journalid, 1]
    consume
| let AT_JOURNAL =
    Document_raw2 feed filter[.Type = "article"] {d} project[Docid_d, Journal_d]
    Journals feed itHashJoin[Journal_d, Name]
    renameAttr[Docid: Docid_d]
    project[Docid, Journalid]
    consume
| let Conference =
    Document_raw2 feed filter[.Type = "inproceedings"] project[Booktitle]
    renameAttr[Name: Booktitle]
    rduph[]
    addcounter[Conferenceid, 1]
    consume
| let AT_CONF =
    Document_raw2 feed {d} project[Docid_d, Booktitle_d]
    Conference feed itHashJoin[Booktitle_d, Name]
    renameAttr[Docid: Docid_d]
    project[Docid, Conferenceid]
    consume
| let Publisher =
    Document_raw2 feed filter[isdefined(.Publisher)] project[Publisher]
    renameAttr[Name: Publisher]
    rduph[]
    addcounter[Publisherid, 1]
    consume
| let PUBLISHED_BY =
    Document_raw2 feed {d} project[Docid_d, Publisher_d]
    Publisher feed itHashJoin[Publisher_d, Name]
    renameAttr[Docid: Docid_d]
    project[Docid, Publisherid]
    consume
};
# Command sequence defining the property graph object dblp:
#   1. create the graph and set the configuration keys "log" and "dotquery",
#   2. register the six node relations, each with its id attribute,
#   3. register the five edge relations; the two mapping strings tie an edge
#      attribute to a node relation's id attribute (presumably source first,
#      then target - note that WROTE lists Author before Document; confirm
#      against the pgraph operator documentation),
#   4. add indexes on the name/word attributes of four node relations,
#   5. print information about the resulting graph.
# The sequence is terminated with ';' rather than by a following blank line.
{
  let dblp = createpgraph("dblp")
| query dblp cfg["log", "10"]
| query dblp cfg["dotquery", "1"]
| query dblp addnodesrel["Document", "Docid"]
| query dblp addnodesrel["Author", "Authorid"]
| query dblp addnodesrel["Keyword", "Wordid"]
| query dblp addnodesrel["Journals", "Journalid"]
| query dblp addnodesrel["Conference", "Conferenceid"]
| query dblp addnodesrel["Publisher", "Publisherid"]
| query dblp addedgesrel["AT_CONF", "Docid=Document.Docid", "Conferenceid=Conference.Conferenceid"]
| query dblp addedgesrel["AT_JOURNAL", "Docid=Document.Docid", "Journalid=Journals.Journalid"]
| query dblp addedgesrel["HAS_KEYWORD", "Docid=Document.Docid", "Wordid=Keyword.Wordid"]
| query dblp addedgesrel["PUBLISHED_BY", "Docid=Document.Docid", "Publisherid=Publisher.Publisherid"]
| query dblp addedgesrel["WROTE", "Authorid=Author.Authorid", "Docid=Document.Docid"]
| query dblp addindex["Author", "Name"]
| query dblp addindex["Keyword", "Word"]
| query dblp addindex["Journals", "Name"]
| query dblp addindex["Conference", "Name"]
| query dblp info
};
# create distributed database
# Load the worker relation from the file WorkersNewton and remember how many
# workers it lists.
restore Workers from WorkersNewton;
let NWorkers = Workers count;
# Make the graph object dblp available on all workers.
# NOTE(review): the second argument (TRUE) presumably allows overwriting an
# existing object of that name on a worker - confirm against the share
# operator description.
query share("dblp", TRUE, Workers);
# Distribute data to the workers by Docid. Then convert darray to a simple darray (sdarray). In a simple darray, all fields on workers have the same names, for example, "Document" instead of "Document_28".
# The full import Document_raw (not the 1000-document sample) is numbered by
# Docid and distributed by that attribute into NWorkers slots.
let DocumentB1 =
    Document_raw feed addcounter[Docid, 1]
    ddistribute2["DocumentB1", Docid, NWorkers, Workers];
# Convert to a simple darray named DocumentB2, then drop the intermediate
# darray object on the master.
let DocumentB2 = DocumentB1 makeSimple[FALSE, "DocumentB2"];
delete DocumentB1;
# Create relations
# In such a command, do not use single quotes: the command text itself is delimited by single quotes. Double quotes are fine.
# Run, via dcommand on the darray DocumentB2, the same relation-building
# command sequence that was used for the template graph on the master, but
# reading from the local relation DocumentB2 instead of Document_raw2.
# The counters (Wordid, Authorid, ...) are created by the command as it runs
# on a worker, so the ids are local to that worker's data.
# The command text is a text constant and is left exactly as it was; only the
# final consume is now terminated with ';'.
query DocumentB2 dcommand['{
let Document = DocumentB2 feed remove[AuthorsList, Keywords] consume
| let Keyword = DocumentB2 feed project[Keywords] unnest[Keywords] rduph[]
renameAttr[Word: Keyword]
addcounter[Wordid, 1] consume
| let HAS_KEYWORD = DocumentB2 feed project[Docid, Keywords] unnest[Keywords] {d}
Keyword feed itHashJoin[Keyword_d, Word]
projectextend[Wordid; Docid: .Docid_d]
project[Docid, Wordid]
consume
| let Author = DocumentB2 feed project[AuthorsList] unnest[AuthorsList] rduph[]
addcounter[Authorid, 1] consume
| let WROTE = DocumentB2 feed project[Docid, AuthorsList] unnest[AuthorsList] {d}
Author feed itHashJoin[Name_d, Name]
projectextend[Authorid; Docid: .Docid_d]
consume
| let Journals = DocumentB2 feed project[Journal] filter[.Journal # ""]
renameAttr[Name: Journal]
rduph[]
addcounter[Journalid, 1] consume
| let AT_JOURNAL = DocumentB2 feed filter[.Type = "article"] {d} project[Docid_d, Journal_d]
Journals feed itHashJoin[Journal_d, Name]
renameAttr[Docid: Docid_d]
project[Docid, Journalid]
consume
| let Conference = DocumentB2 feed filter[.Type = "inproceedings"] project[Booktitle]
renameAttr[Name: Booktitle]
rduph[]
addcounter[Conferenceid, 1] consume
| let AT_CONF = DocumentB2 feed {d} project[Docid_d, Booktitle_d]
Conference feed itHashJoin[Booktitle_d, Name]
renameAttr[Docid: Docid_d]
project[Docid, Conferenceid]
consume
| let Publisher = DocumentB2 feed filter[isdefined(.Publisher)] project[Publisher]
renameAttr[Name: Publisher]
rduph[]
addcounter[Publisherid, 1] consume
| let PUBLISHED_BY = DocumentB2 feed {d} project[Docid_d, Publisher_d]
Publisher feed itHashJoin[Publisher_d, Name]
renameAttr[Docid: Docid_d]
project[Docid, Publisherid]
consume
}']
consume;
# Control is an int darray over the workers; it is used below only as a
# vehicle for sending commands to the workers.
let Control = createintdarray("Control", Workers, NWorkers);
# Before querying, the graph must be loaded into memory on master and workers
query dblp loadgraph;
# On the workers: clear and re-initialise main memory, then load the graph.
# Each query counts the dcommand results whose Ok attribute is true.
query Control dcommand['query memclear()'] filter[.Ok] count;
query Control dcommand['query meminit(3600)'] filter[.Ok] count;
query Control dcommand['query dblp loadgraph'] filter[.Ok] count;
# 59 seconds
# Example query
# Evaluated on the master only, i.e. against the template graph built from
# the 1000-document sample: documents that have the keyword "Indoor".
# The query text is unchanged; the command is now terminated with ';'.
query dblp match3['
MATCH
(doc)-[HAS_KEYWORD]->(w {Word: "Indoor"})
RETURN
doc.Authors, doc.Title, doc.Year
'] consume;
# Distributed variant: dmap evaluates the match3 query over the Control
# darray (on the workers) and dsummarize collects the partial results into
# one stream on the master. The pattern asks for documents that have both
# the keyword "Symbolic" and the keyword "Trajectory".
query Control dmap["", dblp match3['
MATCH
(doc)-[HAS_KEYWORD]->(w {Word: "Symbolic"}), (doc)-[HAS_KEYWORD]->(w2 {Word: "Trajectory"})
RETURN
doc.Authors, doc.Title, doc.Year
']]
dsummarize consume;