# secondo/bin/Scripts/Pregel-kNN-GraphPart1.psec

/*
//paragraph [10] Title: [{\Large \bf ]  [}]
//[@] [\verb+@+]
//[%] [\%]
//[&] [\&]
//[ue] [\"{u}]
//[=~] [$\approx$]
//[->] [$\rightarrow$]

[10] Approximate kNN-Graph Construction in Secondo-Pregel

Ralf Hartmut G[ue]ting, September 2020.

Part 1

This script is based on ideas from the paper:

Wei Dong, Moses Charikar, Kai Li: Efficient k-nearest neighbor graph construction for generic similarity measures. WWW 2011: 577-586

1 Development on the Master

1.1 Database and Workers

In this example, we use the database germany defined in the module secondo-data at secondo-data/Databases/germany

We use the ~Stadt~ relation with 1456 tuples and compute an approximate nearest neighbor graph.

*/
# restore database germany from '/home/ralf/secondo-data/Databases/germany'

# Open the germany database, restore the Workers relation from Workers6Pregel,
# and keep the number of tuples in Workers as NWorkers.
open database germany

restore Workers from Workers6Pregel

let NWorkers = Workers count

/*
1.2 Spatial Partitioning

We partition cities spatially, imposing a 2 x 3 grid. The grid coordinates are read from the Hoese viewer after displaying the ~Stadt~ relation.

*/

# Box: rectangle constant; presumably given as (xmin xmax ymin ymax) - confirm
# against the rect type documentation.
let Box = [const rect value (5.4 15.5 47.0 55.0)]

# grid: cellgrid2d constant used further down to assign a partition to each city.
# The values 5.4, 47.0, 5.0, 2.7, 2 are the same ones passed to gridcell2rect in
# the visualization query that follows.
let grid = [const cellgrid2d value (5.4 47.0 5.0 2.7 2)]

/*
The grid partitioning can be visualized by the following queries:

*/
# One tuple per cell number 1..6, extended by the attribute Cell computed by
# gridcell2rect with the same parameters as the grid constant.
query intstream(1, 6) transformstream extend[Cell: 
  gridcell2rect(.Elem, 5.4, 47.0, 5.0, 2.7, 2)] consume

# Output the Stadt relation itself.
query Stadt

/*
The result is shown in Figure 1.

		Figure 1: Partitioning of relation ~Stadt~ [StadtPartition6b.eps]

Assign partitions to cities by the ~grid~:

*/
# NodesP: one tuple per Stadt tuple with
#   Id        = SNr,
#   Partition = first cell number returned by cellnumber for the bounding box
#               of Ort, minus 1,
#   Value     = Ort.
let NodesP = Stadt feed projectextend[; Id: .SNr, 
  Partition: cellnumber(bbox(.Ort), grid) transformstream head[1]
    extract[Elem] - 1, Value: .Ort]
  consume

# Size is the largest node Id plus 1; it is passed to share for the Workers.
let Size = NodesP feed max[Id] + 1;
query share("Size", TRUE, Workers)

/*
1.3 Creating a Random Initial Graph

From each node, we create ~k~ random edges.

*/
# k: number of random edges created per node (and the bound used with head[k]
# in the Compute function).
let k = 5;
query share("k", TRUE, Workers)

# wgs84: geoid object passed as third argument to every distance(...) call.
let wgs84 = create_geoid("WGS1984");
query share("wgs84", TRUE, Workers)

# EdgesP0: for every node, k tuples of NodesP fetched by a random tuple id
# (attributes renamed with suffix _t), extended by the Distance between the two
# Value attributes.
# NOTE(review): randint(Size) presumably yields values in 0..Size-1, so a
# generated TID need not refer to an existing NodesP tuple - confirm how
# gettuples treats such ids.
let EdgesP0 = NodesP feed 
  loopjoin[intstream(1, k) transformstream 
    projectextend[; Tid: int2tid(randint(Size))] 
    transformstream NodesP gettuples {t}]
    extend[Distance: distance(.Value, .Value_t, wgs84)]
  consume

# EdgesP: EdgesP0 concatenated with a copy in which source and target
# attributes are swapped, so each edge is present in both directions.
let EdgesP = EdgesP0 feed
  EdgesP0 feed projectextend[; Id: .Id_t, Partition: .Partition_t, Value: .Value_t,
    Id_t: .Id, Partition_t: .Partition, Value_t: .Value, Distance: .Distance]
  concat
  consume

/*
1.4 Creating Main Memory Structures on the Master

We use a main memory graph of type ~mgraph3~ and a memory relation. The latter is indexed by a memory AVL-tree to be able to access nodes by identifiers.

*/
# G: main memory graph built from EdgesP by createmgraph3 with the arguments
# Id, Id_t, Distance and Size.
let G = EdgesP feed createmgraph3[Id, Id_t, Distance, Size]

# Nodes: main memory relation of NodesP extended by the per-node state
# Active, Iterations, SumD and SumDNew. The initial SumDNew (k * 1800000.0) is
# smaller than the initial SumD (k * 2000000.0), so the comparison
# ..SumDNew < ..SumD used in Compute holds for the initial values.
let Nodes = NodesP feed extend[Active: TRUE, Iterations: 0, SumD: k * 2000000.0, 
  SumDNew: k * 1800000.0] mconsume

# Nodes_Id: main memory AVL-tree over Nodes on attribute Id; used by
# mexactmatch in Compute.
let Nodes_Id = Nodes mcreateAVLtree[Id]

/*
1.5 Defining Initial Messages

*/
# InitialMessages: one "m3" message per EdgesP tuple. The receiver is the edge
# target (Id_t, Partition_t), the sender is the edge source (Id, Partition);
# Value1 carries the source's Value and Value2 is 0.0.
let InitialMessages = EdgesP feed
  projectextend[; NodeId: .Id_t, NodePartition: .Partition_t,
    Sender: .Id, SenderPartition: .Partition, Message: "m3", Value1: .Value, 
    Value2: 0.0]
  consume;
query share("InitialMessages", TRUE, Workers)

# NoMessages: function returning the first 0 tuples of InitialMessages, i.e. an
# empty stream with the message schema; used as the default case in Compute.
let NoMessages = fun() InitialMessages feed head[0];
query share("NoMessages", TRUE, Workers)

/*
Schemas are:

----
NodesP(Id: int, Partition: int, Value: point)
EdgesP(Id: int, Partition: int, Value: point, Id_t: int, Partition_t: int, 
  Value_t: point, Distance: real)

Messages(NodeId: int, NodePartition: int, Sender: int, SenderPartition: int, 
  Message: string, Value1: point, Value2: real)
----

1.6 The Compute Function

*/
# Compute: takes the stream of incoming messages and returns a stream of new
# messages with the same attributes.
#
# The input is first stored in a main memory relation (mconsume) and bound to
# msgs via within. The Message attribute of the first stored tuple
# (extract[Message]) selects the case of the switch:
#
#   "m3": for each message m, the successors of the receiving node
#         (attr(m, NodeId)) are read from G twice and two streams of "m5"
#         messages are concatenated:
#           1. addressed to m's Sender, with the successor as Sender and the
#              successor's Value_t as Value1;
#           2. addressed to each successor, with m's Sender as Sender and m's
#              Value1 as Value1.
#         In both, Value2 is distance(attr(m, Value1), .Value_t, wgs84).
#
#   "m5": each message is joined with its node tuple in Nodes (lookup by
#         NodeId through Nodes_Id, attributes renamed with suffix _n, TID added
#         by addid); only tuples with Active_n are kept. Per NodeId group, the
#         edges proposed by the messages (without those where Id = Id_t) are
#         concatenated with the result of mg3successors[..., TRUE] for that
#         node, sorted by Distance, passed through rdup, cut to the first k,
#         and inserted into G by mg3insert.
#         NOTE(review): the second argument TRUE of mg3successors presumably
#         removes the returned edges from G, matching the statement in the
#         text that the k edges are replaced - confirm.
#         mupdatedirect2 then updates the node tuple addressed by TID_n:
#         Iterations is incremented, SumDNew becomes the sum of Distance over
#         the node's successors in G, Active becomes (stored SumDNew < stored
#         SumD), and SumD takes the stored SumDNew. Finally, for every updated
#         node one "m3" message per successor is produced.
#
#   otherwise: NoMessages() (an empty message stream).
let Compute = fun (messages: stream(tuple([NodeId: int, NodePartition: int, 
    Sender: int, SenderPartition: int, Message: string, Value1: point, 
    Value2: real])))
  messages mconsume
  within[fun(msgs: ANY)
    msgs mfeed extract[Message]
  switch[
    "m3",
      msgs mfeed
        loopsel[fun(m: TUPLE) 
          G mg3successors[attr(m, NodeId)] 
          projectextend[; NodeId: attr(m, Sender), NodePartition: attr(m,
            SenderPartition), 
            Sender: .Id_t, SenderPartition: .Partition_t, 
            Message: "m5", Value1: .Value_t, Value2: 
              distance(attr(m, Value1), .Value_t, wgs84)]
          G mg3successors[attr(m, NodeId)] 
          projectextend[; NodeId: .Id_t, NodePartition: .Partition_t, 
            Sender: attr(m, Sender), SenderPartition: attr(m,
            SenderPartition), 
            Message: "m5", Value1: attr(m, Value1), Value2: 
              distance(attr(m, Value1), .Value_t, wgs84)]
          concat
        ]
    ; "m5",
      msgs mfeed
        loopjoin[Nodes_Id Nodes mexactmatch[.NodeId] addid {n}]
        filter[.Active_n]
        sortby[NodeId]
        groupby[NodeId, TID_n
        ; NInserted: group feed 
            projectextend[; Id: .NodeId, Partition: .NodePartition,
              Value: .Value_n, Id_t: .Sender, Partition_t: .SenderPartition, 
              Value_t: .Value1, Distance: .Value2]
            filter[.Id # .Id_t]
          G mg3successors[group feed extract[NodeId], TRUE]
          concat
          sortby[Distance] rdup head[k] mg3insert[G] count]
        Nodes mupdatedirect2[TID_n; Iterations: ..Iterations + 1,
          SumDNew: G mg3successors[.NodeId] sum[Distance], 
          Active: ..SumDNew < ..SumD, SumD: ..SumDNew]
        loopsel[fun(m2: TUPLE) 
          G mg3successors[attr(m2, Id)]
          projectextend[; NodeId: .Id_t, NodePartition: .Partition_t,
          Sender: .Id, SenderPartition: .Partition, Message: "m3", 
          Value1: .Value, Value2: 0.0]
        ]
    ; NoMessages()
  ]]

# Pass the Compute function object to share for the Workers.
query share("Compute", TRUE, Workers)

/*
The initial messages are set up so that each node sends a message ~m3~ to its successors in the random graph.

On receiving ~m3~ messages, each node determines its own edges to successors. From successor positions it computes the distance to the node from which it received the ~m3~ message and sends it back to that node as an ~m5~ message. It also sends back successor positions so that the sender node can construct an edge from itself to the successor. Further, it sends a message to successors to create the corresponding directed edge from the successor to the sender (to have bi-directional or undirected edges).

On receiving a set of ~m5~ messages, each node determines the ~k~ edges with the smallest distance among the existing edges and those suggested by the ~m5~ messages. The ~k~ edges are replaced by this set.

In addition, a node observes the sum of distances to its successors. If this distance changed in this round (i.e., the set of edges changed), then the node is set to be further active, else not active. Only active nodes process ~m5~ messages.

In processing ~m5~ messages, nodes also create new ~m3~ messages for the next step.

Some technical comments:

  * The function needs to read the stream of incoming messages twice, once to determine the type of messages in this round (*extract*), and then to process this set of messages. The repeated reading of the stream does not work correctly in Secondo-Pregel at the moment, because read messages are deleted and will not appear again. This behaviour may be corrected in the near future.

    At the moment, the problem is circumvented by storing the stream within the function in a main memory relation, using *mconsume* and the *within* operator. In this way the argument stream can be read repeatedly.

  * The *mupdatedirect2* operator allows one to update tuples in a main memory relation using attribute values from the incoming tuple stream as well as from the stored tuple. The latter attributes are accessed by the ``..'' notation, as usual in Secondo for functions with two arguments (such as *symmjoin*, for example). The output from the operator is the stream of updated tuples.

2 Data Distribution

*/
# NodesSD: NodesP distributed by the Partition attribute with NWorkers as the
# size argument over the workers in Workers (ddistribute2), then converted by
# makeSimple[FALSE, "NodesPersistent"].
let NodesSD = NodesP feed ddistribute2["", Partition, NWorkers, Workers]
  makeSimple[FALSE, "NodesPersistent"]

# EdgesSD: the same distribution for EdgesP, keyed on the Partition of the
# edge's source node.
# The command is terminated explicitly with ';' so that any text following it
# without a separating blank line cannot be absorbed into the command.
let EdgesSD = EdgesP feed ddistribute2["", Partition, NWorkers, Workers]
  makeSimple[FALSE, "EdgesPersistent"];
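# NOTE(review): everything from here to the end of the file is a backtick-quoted duplicate of the script above (apparently export/paste residue). It is not executable and should be removed once confirmed.

# firs commit 2026-01-23 17:03:45 +08:00
			`/*`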
			`//paragraph [10] Title: [{\Large \bf ] [}]`
			`//[@] [\verb+@+]`
			`//[%] [\%]`
			`//[&] [\&]`
			`//[ue] [\"{u}]`
			`//[=~] [$\approx$]`
			`//[->] [$\rightarrow$]`

			`[10] Approximate kNN-Graph Construction in Secondo-Pregel`

			`Ralf Hartmut G[ue]ting, September 2020.`

			`Part 1`

			`This script is based on ideas from the paper:`

			`Wei Dong, Moses Charikar, Kai Li: Efficient k-nearest neighbor graph construction for generic similarity measures. WWW 2011: 577-586`

			`1 Development on the Master`

			`1.1 Database and Workers`

			`In this example, we use the database germany defined in the module secondo-data at secondo-data/Databases/germany`

			`We use the ~Stadt~ relation with 1456 tuples and compute an approximate nearest neighbor graph.`

			`*/`
			`# restore database germany from '/home/ralf/secondo-data/Databases/germany'`

			`open database germany`

			`restore Workers from Workers6Pregel`

			`let NWorkers = Workers count`

			`/*`
			`1.2 Spatial Partitioning`

			`We partition cities spatially, imposing a 2 x 3 grid. The grid coordinates are read from the Hoese viewer after displaying the ~Stadt~ relation.`

			`*/`

			`let Box = [const rect value (5.4 15.5 47.0 55.0)]`

			`let grid = [const cellgrid2d value (5.4 47.0 5.0 2.7 2)]`

			`/*`
			`The grid partitioning can be visualized by the following queries:`

			`*/`
			`query intstream(1, 6) transformstream extend[Cell:`
			`gridcell2rect(.Elem, 5.4, 47.0, 5.0, 2.7, 2)] consume`

			`query Stadt`

			`/*`
			`The result is shown in Figure 1.`

			`Figure 1: Partitioning of relation ~Stadt~ [StadtPartition6b.eps]`

			`Assign partitions to cities by the ~grid~:`

			`*/`
			`let NodesP = Stadt feed projectextend[; Id: .SNr,`
			`Partition: cellnumber(bbox(.Ort), grid) transformstream head[1]`
			`extract[Elem] - 1, Value: .Ort]`
			`consume`

			`let Size = NodesP feed max[Id] + 1;`
			`query share("Size", TRUE, Workers)`

			`/*`
			`1.3 Creating a Random Initial Graph`

			`From each node, we create ~k~ random edges.`

			`*/`
			`let k = 5;`
			`query share("k", TRUE, Workers)`

			`let wgs84 = create_geoid("WGS1984");`
			`query share("wgs84", TRUE, Workers)`

			`let EdgesP0 = NodesP feed`
			`loopjoin[intstream(1, k) transformstream`
			`projectextend[; Tid: int2tid(randint(Size))]`
			`transformstream NodesP gettuples {t}]`
			`extend[Distance: distance(.Value, .Value_t, wgs84)]`
			`consume`

			`let EdgesP = EdgesP0 feed`
			`EdgesP0 feed projectextend[; Id: .Id_t, Partition: .Partition_t, Value: .Value_t,`
			`Id_t: .Id, Partition_t: .Partition, Value_t: .Value, Distance: .Distance]`
			`concat`
			`consume`

			`/*`
			`1.4 Creating Main Memory Structures on the Master`

			`We use a main memory graph of type ~mgraph3~ and a memory relation. The latter is indexed by a memory AVL-tree to be able to access nodes by indentifiers.`

			`*/`
			`let G = EdgesP feed createmgraph3[Id, Id_t, Distance, Size]`

			`let Nodes = NodesP feed extend[Active: TRUE, Iterations: 0, SumD: k * 2000000.0,`
			`SumDNew: k * 1800000.0] mconsume`

			`let Nodes_Id = Nodes mcreateAVLtree[Id]`

			`/*`
			`1.5 Defining Initial Messages`

			`*/`
			`let InitialMessages = EdgesP feed`
			`projectextend[; NodeId: .Id_t, NodePartition: .Partition_t,`
			`Sender: .Id, SenderPartition: .Partition, Message: "m3", Value1: .Value,`
			`Value2: 0.0]`
			`consume;`
			`query share("InitialMessages", TRUE, Workers)`

			`let NoMessages = fun() InitialMessages feed head[0];`
			`query share("NoMessages", TRUE, Workers)`

			`/*`
			`Schemas are:`

			`----`
			`NodesP(Id: int, Partition: int, Value: point)`
			`EdgesP(Id: int, Partition: int, Value: point, Id_t: int, Partition_t: int,`
			`Value_t: point, Distance: real)`

			`Messages(NodeId: int, NodePartition: int, Sender: int, SenderPartition: int,`
			`Message: string, Value1: point, Value2: real)`
			`----`

			`1.6 The Compute Function`

			`*/`
			`let Compute = fun (messages: stream(tuple([NodeId: int, NodePartition: int,`
			`Sender: int, SenderPartition: int, Message: string, Value1: point,`
			`Value2: real])))`
			`messages mconsume`
			`within[fun(msgs: ANY)`
			`msgs mfeed extract[Message]`
			`switch[`
			`"m3",`
			`msgs mfeed`
			`loopsel[fun(m: TUPLE)`
			`G mg3successors[attr(m, NodeId)]`
			`projectextend[; NodeId: attr(m, Sender), NodePartition: attr(m,`
			`SenderPartition),`
			`Sender: .Id_t, SenderPartition: .Partition_t,`
			`Message: "m5", Value1: .Value_t, Value2:`
			`distance(attr(m, Value1), .Value_t, wgs84)]`
			`G mg3successors[attr(m, NodeId)]`
			`projectextend[; NodeId: .Id_t, NodePartition: .Partition_t,`
			`Sender: attr(m, Sender), SenderPartition: attr(m,`
			`SenderPartition),`
			`Message: "m5", Value1: attr(m, Value1), Value2:`
			`distance(attr(m, Value1), .Value_t, wgs84)]`
			`concat`
			`]`
			`; "m5",`
			`msgs mfeed`
			`loopjoin[Nodes_Id Nodes mexactmatch[.NodeId] addid {n}]`
			`filter[.Active_n]`
			`sortby[NodeId]`
			`groupby[NodeId, TID_n`
			`; NInserted: group feed`
			`projectextend[; Id: .NodeId, Partition: .NodePartition,`
			`Value: .Value_n, Id_t: .Sender, Partition_t: .SenderPartition,`
			`Value_t: .Value1, Distance: .Value2]`
			`filter[.Id # .Id_t]`
			`G mg3successors[group feed extract[NodeId], TRUE]`
			`concat`
			`sortby[Distance] rdup head[k] mg3insert[G] count]`
			`Nodes mupdatedirect2[TID_n; Iterations: ..Iterations + 1,`
			`SumDNew: G mg3successors[.NodeId] sum[Distance],`
			`Active: ..SumDNew < ..SumD, SumD: ..SumDNew]`
			`loopsel[fun(m2: TUPLE)`
			`G mg3successors[attr(m2, Id)]`
			`projectextend[; NodeId: .Id_t, NodePartition: .Partition_t,`
			`Sender: .Id, SenderPartition: .Partition, Message: "m3",`
			`Value1: .Value, Value2: 0.0]`
			`]`
			`; NoMessages()`
			`]]`

			`query share("Compute", TRUE, Workers)`

			`/*`
			`The initial messages are set up so that each node sends a message ~m3~ to its successors in the random graph.`

			On receiving ~m3~ messages, each node determines its own edges to successors. From successor positions it computes the distance to the node from which it received the ~m3~ message and sends it back to that node as an ~m5~ message. It also sends back successor positions so that the sender node can construct an edge from itself to the successor. Further, it sends a message to successors to create the corresponding directed edge from the successor to the sender (to have bi-directional or undirected edges).

			`On receiving a set of ~m5~ messages, each node determines the ~k~ edges with the smallest distance among the existing edges and those suggested by the ~m5~ messages. The ~k~ edges are replaced by this set.`

			`In addition, a node observes the sum of distances to its successors. If this distance changed in this round (i.e., the set of edges changed), then the node is set to be further active, else not active. Only active nodes process ~m5~ messages.`

			`In processing ~m5~ messages, nodes also create new ~m3~ messages for the next step.`

			`Some technical comments:`

			`* The function needs to read the stream of incoming messages twice, once to determine the type of mesages in this round (extract), and then to process this set of messages. The repeated reading of the stream does not work correctly in Secondo-Pregel at the moment, because read messages are deleted and will not appear again. This behaviour may be corrected in the near future.`

			`At the moment, the problem is circumvented by storing the stream within the function in a main memory relation, using mconsume and the within operator. In this way the argument stream can be read repeatedly.`

			* The mupdatedirect2 operator allows one to update tuples in a main memory relation using attribute values from the incoming tuple stream as well as from the stored tuple. The latter attributes are accessed by the ``..'' notation, as usual in Secondo for functions with two arguments (such as symmjoin, for example). The output from the operator is the stream of updated tuples.

			`2 Data Distribution`

			`*/`
			`let NodesSD = NodesP feed ddistribute2["", Partition, NWorkers, Workers]`
			`makeSimple[FALSE, "NodesPersistent"]`

			`let EdgesSD = EdgesP feed ddistribute2["", Partition, NWorkers, Workers]`
			`makeSimple[FALSE, "EdgesPersistent"]`