Slow filling graph

elastica · November 24, 2018, 6:29pm

Hi

I was experimenting Kruskal performance against a data file for 100 graphs (n, m<20000) :

# =================================================
from time import clock
from sys import stderr, stdin

import graph_tool as gt
from graph_tool.topology import min_spanning_tree

import networkx as nx

# ------------ NetworkX Code ---------------------------

def kruskal_networkX(edges, n):
    G=nx.Graph()
    for a, b, w in edges:
        G.add_edge(a,b,weight=w)
    return sum(d['weight'] for (_,_,d) in nx.minimum_spanning_tree(G).edges(data=True))

# ------------ Graph_tool Code ---------------------------

def kruskal_gt(wedges,n):
    g= gt.Graph(directed=False)
    edges, weights= zip(*[((a,b),w) for (a,b,w) in wedges])
    weight = g.new_edge_property("long long")
    for a,b,w in wedges:
        e=g.add_edge(a,b)
        weight[e]=w
    tree=min_spanning_tree(g, weights=weight)
    return sum(b*weight[e] for (e,b) in zip(g.edges(), tree))

# ------------ Benchmark ---------------------------

def go(solver, L):
global duration

N=len(L)
k=1

    while k<N:
        edges=[]
        n=L[k]
        k+=1

        for a in range(n):
            d=L[k]
            k+=1
            for j in range(d):
                b=L[k]
                k+=1
                w=L[k]
                k+=1
                if a<b:
                    edges.append([a,b-1,w])
        start=clock()
        print(solver(edges, n))
        print("----------------------------")
        duration+=clock()-start

# data
L=list(int(z) for z in stdin.read().split() if z.isdigit())

for solver in [kruskal_networkX, kruskal_gt]:
    duration=0
    go(solver, L)
    print("%s : %.3f" %(solver.__name__, duration), file=stderr)

# =================================================

Outputs are in agreement but gt execution is very slow:

$ python3 gt.py < big.in > bench.out
kruskal_networkX : 15.721
kruskal_gt : 134.839

the bottleneck being within the following loop-filling the graph:

# --------------------------------------------
    for a,b,w in wedges:
        e=g.add_edge(a,b)
        weight[e]=w
# --------------------------------------------

Probably, i'm filling the graph in the wrong way.

You can dowload the data file here:
https://drive.google.com/file/d/1y0fX1kopgzQEWgmRHRnouWHnMkuliwfw/view?usp=sharing

solstag · November 27, 2018, 1:52am

Ni! Hi elastica,

In graph-tool you can efficiently add edges with properties using the
`add_edge_list` function:

https://graph-tool.skewed.de/static/doc/graph_tool.html#graph_tool.Graph.add_edge_list

I'd also suggest that you benchmark python code with the `cProfile` module,
it is much easier and more informative:

https://docs.python.org/3/library/profile.html

Cheers,
ale
.~´

attachment.html (4.45 KB)

elastica · November 29, 2018, 9:20am

Hi Alexandre and thanks for your useful response

The *add_edge_list* internally changes edges ordering to the lexicographic
one so in order to set weights, I have to sort the weighted-edge list,
adding some little overhead. On the other hand, *add_edge_list()* is much
more efficient than adding edges one by one with the add_edge method (about
7 times faster!). Unfortunately, this is still slow, even slower than
NetworkX (about 1.2 times slower and we all know that NetworkX highest
quality is not speed).

Here is the new version:

# ***********************
from time import clock
from sys import stderr, stdin

import graph_tool as gt
from graph_tool.topology import min_spanning_tree

import networkx as nx

# ------------ NetworkX Code ---------------------------

def kruskal_networkX(edges, n):
    G=nx.Graph()
    for a, b, w in edges:
        G.add_edge(a,b,weight=w)
    return sum(d['weight'] for (_,_,d) in
nx.minimum_spanning_tree(G).edges(data=True))

# ------------ Graph_tool Code ---------------------------

def kruskal_gt(wedges,n):
    wedges.sort()
    g= gt.Graph(directed=False)
    edges, weights= zip(*[((a,b),w) for (a,b,w) in wedges])
    weight = g.new_edge_property("long long")
    g.add_edge_list(edges)
    for e,(_,_,w) in zip(g.edges(), wedges):
        weight[e]=w
    tree=min_spanning_tree(g, weights=weight)
    return sum(b*weight[e] for (e,b) in zip(g.edges(), tree))

# ------------ Benchmark ---------------------------

def go(solver, L):
global duration

N=len(L)
k=1

    while k<N:
        edges=[]
        n=L[k]
        k+=1

        for a in range(n):
            d=L[k]
            k+=1
            for j in range(d):
                b=L[k]
                k+=1
                w=L[k]
                k+=1
                if a<b:
                    edges.append([a,b-1,w])
        start=clock()
        print(solver(edges, n))
        duration+=clock()-start

# data
L=list(int(z) for z in stdin.read().split() if z.isdigit())

for solver in [kruskal_networkX, kruskal_gt]:
    duration=0
    go(solver, L)
    print("----------------------------")
    print("%s : %.3f" %(solver.__name__, duration), file=stderr)

# ***********************

$ python3 gt_v2.py < big.in > gt.out
kruskal_networkX : 15.851
kruskal_gt : 19.713

tiago · November 29, 2018, 9:56am

The *add_edge_list* internally changes edges ordering to the lexicographic
one so in order to set weights, I have to sort the weighted-edge list,
adding some little overhead.

That's not true; no re-ordering is performed by add_edge_list().

On the other hand, *add_edge_list()* is much
more efficient than adding edges one by one with the add_edge method (about
7 times faster!). Unfortunately, this is still slow, even slower than
NetworkX (about 1.2 times slower and we all know that NetworkX highest
quality is not speed).

That's because you are looping over the edges _twice_. Take a more careful
look at the documentation for add_edge_list(), and you find that it accepts
an "eprops" parameters, which lists the edge property maps that need to be
filled as well. Just pass your weights to this parameter, and feed it a list
of (source, target, weight).

For it to be even faster, you can make your edge list be a Numpy array ---
in which case the entire loop will be in C++.

elastica · November 29, 2018, 1:59pm

Hi,

Is the following code correspond to the code you have in mind:

?

tiago · November 29, 2018, 2:12pm

It seems you forgot to actually paste the code.

elastica · November 29, 2018, 3:26pm

Oops, that's infortunate, sorry for the confusion, yet I pasted the code
inside a raw text tag.

Again :

elastica · November 29, 2018, 3:51pm

Again, doesn't work but the code was visible in the preview.

Here the code outside any tag:

def kruskal_gt(wedges,n):
    g= gt.Graph(directed=False)
    weight = g.new_edge_property("long long")
    g.add_edge_list(np.array(wedges), eprops=[weight])
    tree=min_spanning_tree(g, weights=weight)
    return sum(b*weight[e] for (e,b) in zip(g.edges(), tree))

tiago · November 29, 2018, 3:59pm

Yes, this is what I meant. Note that you can still remove the last loop altogether by accessing the property maps as arrays:

\(tree\.a \* weight\.a\)\.sum\(\)

Best,
Tiago

elastica · November 30, 2018, 6:19am

Note that you can still remove the last loop altogether by accessing the

property maps as arrays: (tree.a * weight.a).sum()

This innocuous remark leads in fact to a huge speedup: you gain a factor 13,
a lot of surprise! Now, Graph Tool has the best performance among Networkit,
Igraph, Scipy and NetworkX of course. However, to put in perspective, Graph
Tool is only 2 times faster than pure Python code where Kruskal is
implemented with a basic Union-Find, we are far from genuine C/C++
performance, because usually graph implementations are 20 times faster in
C/C++ than pure Python.

Below, the complete benchmark. Note that Scipy code has the potential to
become the faster, the problem with scipy graphs is the need to convert
adjacency lists (or edges list) to Compressed Sparse Row and you have to
write it in Python, this is VERY slow.

# ***********************

from time import clock
from sys import stderr, stdin

import networkit as nk
from networkit.graph import RandomMaximumSpanningForest

from scipy.sparse.csgraph import minimum_spanning_tree
from scipy.sparse import csr_matrix

from igraph import Graph

import graph_tool as gt
from graph_tool.topology import min_spanning_tree
import numpy as np

# ------------ Pure Python Code ---------------------------

def find(s, parents):
    ups=[s]
    while True:
        up=parents[s]
        if up ==None:
            break
        s=up
        ups.append(s)
    for v in ups[:-1]:
        parents[v]=s
    return s

def merge(s, t, parents, heights):
    rep_s=find(s, parents)
    rep_t=find(t, parents)

    if rep_s==rep_t:
        return False
    height_s=heights[rep_s]
    height_t=heights[rep_t]
    if height_s < height_t:
        parents[rep_s]=rep_t
    else:
        parents[rep_t]=rep_s
        if height_s==height_t:
            heights[rep_s]+=1
    return True

def kruskal_python(edges, n):
    edges.sort(key=lambda t:t[2])
    parents=[None]*n
    heights=[0]*n
    k=0
    cost=0
    cnt=0
    while cnt<n-1:
        a, b, w=edges[k]
        k+=1
        if merge(a,b, parents, heights):
            cost+=w
            cnt+=1
    return cost

# ------------ Graph_tool Code ---------------------------

def kruskal_gt_numpy_sum(wedges,n):
    g= gt.Graph(directed=False)
    weight = g.new_edge_property("long long")
    g.add_edge_list(np.array(wedges), eprops=[weight])
    tree=min_spanning_tree(g, weights=weight)
    return (tree.a * weight.a).sum()

def kruskal_gt(wedges,n):
    g= gt.Graph(directed=False)
    weight = g.new_edge_property("long long")
    g.add_edge_list(np.array(wedges), eprops=[weight])
    tree=min_spanning_tree(g, weights=weight)
    return sum(b*weight[e] for (e,b) in zip(g.edges(), tree))

# ------------ Igraph Code ---------------------------

def kruskal_igraph(wedges, n):
    edges, weights= zip(*[((a,b),w) for (a,b,w) in wedges])
    g = Graph(n, edges=list(edges))
    g.es["weight"] = weights
    tree=g.spanning_tree(weights)
    return sum(tree.es["weight"])

# ------------ Scipy Code ---------------------------

def wedges2adj(edges, n):

    G=[for _ in range(n)]
    for a, b, w in edges:
        G[a].append((b,w))
        G[b].append((a,w))
    return G

def wedges2scr(edges, n):
    G=wedges2adj(edges, n)
    indptr=[0] # effectifs cumulés
    cnt=0

    for line in G:
        cnt+=len(line)
        indptr.append(cnt)
    data=
    indices=
    for i in range(n):
        for (j,w) in G[i]:
            data.append(w)
            indices.append(j)

    return [data, indptr, indices]

def csr2wedges(Mcsr, shape):
    n, p=shape
    k=0
    edges=
    data, cumul, cols = Mcsr.data,Mcsr.indptr, Mcsr.indices
    for i in range(n):
        for j in range(cumul[i+1]-cumul[i]):
            edges.append((i, cols[k], data[k]))
            k+=1
    return edges

def kruskal_scipy(edges, n):
    data, indptr, indices=wedges2scr(edges, n)
    csr=csr_matrix((data, indices, indptr), shape=(n, n))
    tree=minimum_spanning_tree(csr)
    edges=csr2wedges(tree, (n,n))
    return sum(int(round(w)) for (a,b,w) in edges)

# ------------ Networkit Code ---------------------------

def kruskal_networkit(edges, n):
    G=nk.Graph(n, weighted=True)
    for u, v,w in edges:
        G.addEdge(u,v,-w)
    msf=RandomMaximumSpanningForest(G)
    msf.run()
    weights=
    msf.getMSF().forEdges(lambda u, v, w, eid : weights.append(-w))
    return round(sum(weights))

# ------------ NetworkX Code ---------------------------

def kruskal_networkX(edges, n):
    import networkx as nx
    G=nx.Graph()
    for a, b, w in edges:
        G.add_edge(a,b,weight=w)
    return sum(d['weight'] for (_,_,d) in
nx.minimum_spanning_tree(G).edges(data=True))

# ------------ Benchmark ---------------------------

def go(solver, L):
global duration

N=len(L)
k=1

    while k<N:
        edges=
        n=L[k]
        k+=1

        for a in range(n):
            d=L[k]
            k+=1
            for j in range(d):
                b=L[k]
                k+=1
                w=L[k]
                k+=1
                if a<b:
                    edges.append([a,b-1,w])
        if solver==kruskal_scipy:
            data, indptr, indices=wedges2scr(edges, n)

            start=clock()
            csr=csr_matrix((data, indices, indptr), shape=(n, n))
            tree=minimum_spanning_tree(csr)
            edges=csr2wedges(tree, (n,n))
            duration+=clock()-start
        else:

            start=clock()
            print(solver(edges, n))
            duration+=clock()-start

# data
L=list(int(z) for z in stdin.read().split() if z.isdigit())

for solver in [kruskal_networkit, kruskal_gt_numpy_sum,
kruskal_gt,kruskal_python, kruskal_igraph, kruskal_scipy,
kruskal_networkX]:
    duration=0
    go(solver, L)
    print("%s : %.3f" %(solver.__name__, duration), file=stderr)

# ***********************

Here is the code to generate data tests:

# ***********************
from random import randrange

def random_tree(n):
    edges=
    for a in range(1,n):
        b=randrange(a)
        edges.append((a,b))
    return edges

def random_connected_graph(n, m):
    edges={(a,b) if a<b else (b,a) for (a, b) in random_tree(n)}
    while len(edges)<m:
        a=randrange(n)
        b=randrange(n)
        t=tuple(sorted([a,b]))
        if t!=(a,a) and t not in edges:
            edges.add(t)
    return [(a,b,randrange(1,10**4)) for (a,b) in edges]

def generate_testfile(ntest, maxi, filename="big.in"):
    # test file for Blinnet problem
    # www.spoj.com/problems/BLINNET/

    testfile=open(filename, "w")

    out=["%s" %ntest]
    for i in range(ntest):

        n=randrange(5,MAXI+1)
        m=randrange(n-1,min(MAXI, n*(n-1)//2))
        G=random_connected_graph(n, m)
        L=[ for _ in range(n)]
        for a,b,p in G:
            L[a].append((b, p))
            L[b].append((a, p))
        temp=
        temp.append("%s" %n)
        for i in range(n):
            temp.append("fake%s" %(i+1))
            temp.append("%s" %len(L[i]))
            for b, p in L[i]:
                temp.append("%s %s" %(b+1, p))
        out.append('\n'.join(temp))
    testfile.write('\n\n'.join(out))

MAXI=30000
ntest=100
filename="big.in"
generate_testfile(ntest, MAXI, filename)
print("Test %s generated" %filename)

# ***********************