Files
secondo/bin/Scripts/Pregel-kNN-GraphPart1.psec

221 lines
7.8 KiB
Plaintext
Raw Normal View History

2026-01-23 17:03:45 +08:00
/*
//paragraph [10] Title: [{\Large \bf ] [}]
//[@] [\verb+@+]
//[%] [\%]
//[&] [\&]
//[ue] [\"{u}]
//[=~] [$\approx$]
//[->] [$\rightarrow$]
[10] Approximate kNN-Graph Construction in Secondo-Pregel
Ralf Hartmut G[ue]ting, September 2020.
Part 1
This script is based on ideas from the paper:
Wei Dong, Moses Charikar, Kai Li: Efficient k-nearest neighbor graph construction for generic similarity measures. WWW 2011: 577-586
1 Development on the Master
1.1 Database and Workers
In this example, we use the database germany defined in the module secondo-data at secondo-data/Databases/germany
We use the ~Stadt~ relation with 1456 tuples and compute an approximate nearest neighbor graph.
*/
# Uncomment to restore the database from a local dump first (the path is machine-specific):
# restore database germany from '/home/ralf/secondo-data/Databases/germany'
# Every command is closed with ';' so that the script does not depend on
# empty lines separating the commands.
open database germany;
# Restore the Workers object from Workers6Pregel.
restore Workers from Workers6Pregel;
# Number of tuples in Workers; used as the number of slots when distributing data.
let NWorkers = Workers count;
/*
1.2 Spatial Partitioning
We partition cities spatially, imposing a 2 x 3 grid. The grid coordinates are read from the Hoese viewer after displaying the ~Stadt~ relation.
*/
# Rectangle constant; not referenced elsewhere in the visible part of this script.
# NOTE(review): presumably given as (xmin xmax ymin ymax) - confirm.
let Box = [const rect value (5.4 15.5 47.0 55.0)];
# Grid used to assign cities to partitions. The same five numbers are passed to
# gridcell2rect in the visualization query.
# NOTE(review): presumably (x0 y0 cell-width cell-height cells-per-row) - confirm.
let grid = [const cellgrid2d value (5.4 47.0 5.0 2.7 2)];
/*
The grid partitioning can be visualized by the following queries:
*/
# One rectangle for each of the cell numbers 1..6, computed from the same
# parameters that define grid.
query intstream(1, 6) transformstream extend[Cell:
  gridcell2rect(.Elem, 5.4, 47.0, 5.0, 2.7, 2)] consume;
# The Stadt relation itself, as a separate query.
query Stadt;
/*
The result is shown in Figure 1.
Figure 1: Partitioning of relation ~Stadt~ [StadtPartition6b.eps]
Assign partitions to cities by the ~grid~:
*/
# Graph nodes: one tuple per city with
#   Id        - the city's SNr,
#   Partition - first cell number reported by cellnumber for the city's
#               bounding box, minus 1,
#   Value     - the city's Ort attribute.
let NodesP = Stadt feed projectextend[; Id: .SNr,
  Partition: cellnumber(bbox(.Ort), grid) transformstream head[1]
    extract[Elem] - 1, Value: .Ort]
  consume;
# Largest node Id plus one; shared with the workers and later passed to
# createmgraph3 and randint.
let Size = NodesP feed max[Id] + 1;
query share("Size", TRUE, Workers);
/*
1.3 Creating a Random Initial Graph
From each node, we create ~k~ random edges.
*/
# Number of random edges created per node; also the number of edges kept per
# node in Compute (head[k]).
let k = 5;
query share("k", TRUE, Workers);
# Geoid passed to all distance computations.
let wgs84 = create_geoid("WGS1984");
query share("wgs84", TRUE, Workers);
# For every node, draw k random tuples of NodesP (attributes renamed with
# suffix _t) and compute the distance between the two Value attributes.
# NOTE(review): the random number is used as a tuple id of NodesP, not as a
# node Id. This presumably relies on the tuple ids of NodesP lying within the
# range returned by randint(Size) - confirm, in particular whether randint
# can return 0.
let EdgesP0 = NodesP feed
  loopjoin[intstream(1, k) transformstream
    projectextend[; Tid: int2tid(randint(Size))]
    transformstream NodesP gettuples {t}]
  extend[Distance: distance(.Value, .Value_t, wgs84)]
  consume;
# Add the reverse of every edge (source and target attributes swapped, same
# Distance), so that each random edge is present in both directions.
let EdgesP = EdgesP0 feed
  EdgesP0 feed projectextend[; Id: .Id_t, Partition: .Partition_t, Value: .Value_t,
    Id_t: .Id, Partition_t: .Partition, Value_t: .Value, Distance: .Distance]
  concat
  consume;
/*
1.4 Creating Main Memory Structures on the Master
We use a main memory graph of type ~mgraph3~ and a memory relation. The latter is indexed by a memory AVL-tree to be able to access nodes by identifiers.
*/
# Main memory graph built from EdgesP: Id is the source, Id_t the target and
# Distance the cost attribute; Size is passed as the last argument.
let G = EdgesP feed createmgraph3[Id, Id_t, Distance, Size];
# Main memory relation of nodes, extended by the per-node state that Compute
# reads and updates: Active, Iterations, SumD and SumDNew. SumDNew starts
# below SumD.
let Nodes = NodesP feed extend[Active: TRUE, Iterations: 0, SumD: k * 2000000.0,
  SumDNew: k * 1800000.0] mconsume;
# Main memory AVL-tree on Nodes by Id; Compute uses it for mexactmatch lookups.
let Nodes_Id = Nodes mcreateAVLtree[Id];
/*
1.5 Defining Initial Messages
*/
# One "m3" message per edge of EdgesP, addressed to the edge's target node
# (Id_t), with the source node as Sender and the source's Value as Value1.
# Value2 is unused here and set to 0.0.
let InitialMessages = EdgesP feed
  projectextend[; NodeId: .Id_t, NodePartition: .Partition_t,
    Sender: .Id, SenderPartition: .Partition, Message: "m3", Value1: .Value,
    Value2: 0.0]
  consume;
query share("InitialMessages", TRUE, Workers);
# Empty stream with the message schema; Compute returns it for message types
# it does not handle.
let NoMessages = fun() InitialMessages feed head[0];
query share("NoMessages", TRUE, Workers);
/*
Schemas are:
----
NodesP(Id: int, Partition: int, Value: point)
EdgesP(Id: int, Partition: int, Value: point, Id_t: int, Partition_t: int,
Value_t: point, Distance: real)
Messages(NodeId: int, NodePartition: int, Sender: int, SenderPartition: int,
Message: string, Value1: point, Value2: real)
----
1.6 The Compute Function
*/
# Compute: maps the stream of messages received in one round to the stream of
# messages to be sent. It uses the objects G, Nodes, Nodes_Id, k, wgs84 and
# NoMessages.
#
# The incoming stream is stored in a main memory relation (mconsume, bound to
# msgs by within), so that it can be read twice: once to take the message type
# from the first message (extract[Message]) and once to process all messages.
#
# Message "m3": for every message m, the successors of the receiving node in G
# are read twice and the two results are concatenated:
#   - an "m5" message to the sender of m for each successor, carrying the
#     successor's id, partition and position, and in Value2 the distance
#     between Value1 of m and the successor's position;
#   - an "m5" message to each successor, carrying the sender and Value1 of m
#     and the same distance.
#
# Message "m5": messages are joined with Nodes via Nodes_Id, and only messages
# for nodes with Active = TRUE are kept. They are grouped by NodeId. Per group,
# the edges proposed by the messages (without those where Id = Id_t) are
# concatenated with the node's current successors in G, sorted by Distance and
# passed through rdup, and the first k are inserted into G.
# NOTE(review): the TRUE argument of mg3successors presumably removes the
# returned edges from G - confirm.
# Then the node's tuple in Nodes is updated: Iterations is incremented, SumDNew
# becomes the sum of distances to the node's successors, Active becomes
# (stored SumDNew < stored SumD), and SumD becomes the stored SumDNew. For
# every updated node, an "m3" message is created for each of its successors.
#
# Any other message type: NoMessages() is returned.
let Compute = fun (messages: stream(tuple([NodeId: int, NodePartition: int,
    Sender: int, SenderPartition: int, Message: string, Value1: point,
    Value2: real])))
  messages mconsume
  within[fun(msgs: ANY)
    msgs mfeed extract[Message]
    switch[
      "m3",
        msgs mfeed
        loopsel[fun(m: TUPLE)
          G mg3successors[attr(m, NodeId)]
          projectextend[; NodeId: attr(m, Sender), NodePartition: attr(m,
              SenderPartition),
            Sender: .Id_t, SenderPartition: .Partition_t,
            Message: "m5", Value1: .Value_t, Value2:
              distance(attr(m, Value1), .Value_t, wgs84)]
          G mg3successors[attr(m, NodeId)]
          projectextend[; NodeId: .Id_t, NodePartition: .Partition_t,
            Sender: attr(m, Sender), SenderPartition: attr(m,
              SenderPartition),
            Message: "m5", Value1: attr(m, Value1), Value2:
              distance(attr(m, Value1), .Value_t, wgs84)]
          concat
        ]
    ; "m5",
        msgs mfeed
        loopjoin[Nodes_Id Nodes mexactmatch[.NodeId] addid {n}]
        filter[.Active_n]
        sortby[NodeId]
        groupby[NodeId, TID_n
        ; NInserted: group feed
          projectextend[; Id: .NodeId, Partition: .NodePartition,
            Value: .Value_n, Id_t: .Sender, Partition_t: .SenderPartition,
            Value_t: .Value1, Distance: .Value2]
          filter[.Id # .Id_t]
          G mg3successors[group feed extract[NodeId], TRUE]
          concat
          sortby[Distance] rdup head[k] mg3insert[G] count]
        Nodes mupdatedirect2[TID_n; Iterations: ..Iterations + 1,
          SumDNew: G mg3successors[.NodeId] sum[Distance],
          Active: ..SumDNew < ..SumD, SumD: ..SumDNew]
        loopsel[fun(m2: TUPLE)
          G mg3successors[attr(m2, Id)]
          projectextend[; NodeId: .Id_t, NodePartition: .Partition_t,
            Sender: .Id, SenderPartition: .Partition, Message: "m3",
            Value1: .Value, Value2: 0.0]
        ]
    ; NoMessages()
    ]];
query share("Compute", TRUE, Workers);
/*
The initial messages are set up so that each node sends a message ~m3~ to its successors in the random graph.
On receiving ~m3~ messages, each node determines its own edges to successors. From successor positions it computes the distance to the node from which it received the ~m3~ message and sends it back to that node as an ~m5~ message. It also sends back successor positions so that the sender node can construct an edge from itself to the successor. Further, it sends a message to successors to create the corresponding directed edge from the successor to the sender (to have bi-directional or undirected edges).
On receiving a set of ~m5~ messages, each node determines the ~k~ edges with the smallest distance among the existing edges and those suggested by the ~m5~ messages. The ~k~ edges are replaced by this set.
In addition, a node observes the sum of distances to its successors. If this distance changed in this round (i.e., the set of edges changed), then the node is set to be further active, else not active. Only active nodes process ~m5~ messages.
In processing ~m5~ messages, nodes also create new ~m3~ messages for the next step.
Some technical comments:
* The function needs to read the stream of incoming messages twice, once to determine the type of messages in this round (*extract*), and then to process this set of messages. The repeated reading of the stream does not work correctly in Secondo-Pregel at the moment, because read messages are deleted and will not appear again. This behaviour may be corrected in the near future.
At the moment, the problem is circumvented by storing the stream within the function in a main memory relation, using *mconsume* and the *within* operator. In this way the argument stream can be read repeatedly.
* The *mupdatedirect2* operator allows one to update tuples in a main memory relation using attribute values from the incoming tuple stream as well as from the stored tuple. The latter attributes are accessed by the ``..'' notation, as usual in Secondo for functions with two arguments (such as *symmjoin*, for example). The output from the operator is the stream of updated tuples.
2 Data Distribution
*/
# Distribute nodes and edges over the workers by their Partition attribute,
# using NWorkers slots. The results are passed through makeSimple with the
# names "NodesPersistent" and "EdgesPersistent".
# NOTE(review): presumably these are the names of the relations created on
# the workers - confirm.
let NodesSD = NodesP feed ddistribute2["", Partition, NWorkers, Workers]
  makeSimple[FALSE, "NodesPersistent"];
let EdgesSD = EdgesP feed ddistribute2["", Partition, NWorkers, Workers]
  makeSimple[FALSE, "EdgesPersistent"];