Hi Prof. Peixoto,
I am saving a NestedBlockState (`state`) and a ModeClusterState (`pmode`) to a pickle file and then loading them back.
With the newer versions of graph-tool, loading the pickle file takes much longer than before.
OS: ubuntu 22
graph-tool versions: 2.59, 2.72, 2.75
MWE:
from scipy.special import gammaln
import graph_tool.all as gt
import numpy as np
import pickle
def nested_mcmc_eq(args, g, state):
    """Run ``gt.mcmc_equilibrate`` on ``state`` and record a sample per callback.

    Parameters
    ----------
    args
        Object exposing ``wait``, ``force_niter`` and ``niter``; these are
        forwarded to ``gt.mcmc_equilibrate`` (``niter`` via ``mcmc_args``).
    g
        Graph; only ``g.num_vertices()`` is used, to size the histograms.
    state
        Nested state handed to ``gt.mcmc_equilibrate``. It must provide
        ``get_levels()``; the object passed to the callback must also provide
        ``get_bs()`` and ``entropy()``.

    Returns
    -------
    tuple
        ``(state, bs, Bs, Bes, dls)`` where

        * ``bs``  -- list of ``s.get_bs()`` results, one per callback call;
        * ``Bs``  -- one array per level of length ``g.num_vertices() + 1``;
          entry ``B`` counts how often ``get_nonempty_B()`` returned ``B``;
        * ``Bes`` -- one list per level with the ``get_Be()`` value of every
          callback call;
        * ``dls`` -- list of ``s.entropy()`` values, one per callback call.
    """
    # The number of levels is read once, before equilibration starts; the
    # per-level containers below are sized from it.
    num_levels = len(state.get_levels())

    bs = []
    Bs = [np.zeros(g.num_vertices() + 1) for _ in range(num_levels)]
    Bes = [[] for _ in range(num_levels)]
    dls = []

    def collect_partitions(s):
        """Callback: append the current sample to the enclosing containers."""
        bs.append(s.get_bs())
        for level, level_state in enumerate(s.get_levels()):
            # Histogram of the value returned by get_nonempty_B() per level
            # (presumably the number of non-empty blocks -- confirm in the
            # graph-tool documentation).
            B = level_state.get_nonempty_B()
            Bs[level][B] += 1
            Bes[level].append(level_state.get_Be())
        # One entropy value per sample, so that ``dls`` stays aligned with
        # ``bs`` (both are averaged per sample by the evidence estimate).
        dls.append(s.entropy())

    gt.mcmc_equilibrate(
        state,
        wait=args.wait,
        force_niter=args.force_niter,
        mcmc_args=dict(niter=args.niter),
        callback=collect_partitions,
    )
    return state, bs, Bs, Bes, dls
def nested_total_evidence(pmode, bs, dls):
    """Combine sampled entropies and partitions into a single evidence value.

    Parameters
    ----------
    pmode
        Object providing ``posterior_entropy()``.
    bs
        Sequence of samples; each sample is a sequence of per-level label
        arrays (anything ``np.unique`` accepts).
    dls
        Sequence of numeric values, averaged with ``np.mean``.

    Returns
    -------
    float
        ``-mean(dls) + logB + H`` where ``H = pmode.posterior_entropy()`` and
        ``logB`` is the mean over samples of
        ``sum(gammaln(n_unique_labels + 1))`` taken over the levels of a sample.
    """
    H = pmode.posterior_entropy()
    # gammaln(k + 1) == log(k!), with k the number of distinct labels used at
    # a level; summed over the levels of a sample, then averaged over samples.
    logB = np.mean(
        [sum(gammaln(len(np.unique(bl)) + 1) for bl in b) for b in bs]
    )
    L = -np.mean(dls) + logB + H
    return L
# Driver for the minimal working example: equilibrate a nested state on the
# graph, build the ModeClusterState from the sampled partitions, compute the
# evidence value, then round-trip everything through a pickle file.
print(gt.__version__)


class ARGS():
    """Plain attribute container for the run settings used below."""
    pass


args = ARGS()
args.SEED = 100

# Seed both graph-tool's and NumPy's random number generators.
gt.seed_rng(args.SEED)
np.random.seed(args.SEED)

g = gt.load_graph('graph.gt.gz')
state = gt.NestedBlockState(g, deg_corr=False)

args.wait = 100
args.force_niter = 1000
args.niter = 10
state, bs, Bs, Bes, dls = nested_mcmc_eq(args, g, state)

pmode = gt.ModeClusterState(bs, nested=True)
gt.mcmc_equilibrate(pmode, wait=1, mcmc_args=dict(niter=1, beta=np.inf))

L = nested_total_evidence(pmode, bs, dls)

# The file name embeds the graph-tool version, so runs under different
# versions do not overwrite each other's output.
output_path = f'output_{gt.__version__}.pkl'

with open(output_path, 'wb') as f:
    pickle.dump([g, state, bs, Bs, Bes, dls, pmode, L], f)

# NOTE: pickle.load can execute arbitrary code; only load files written by
# this script (or another trusted source).
with open(output_path, 'rb') as f:
    [g, state, bs, Bs, Bes, dls, pmode, L] = pickle.load(f)
The table below compares the timings of each step across the different versions:
| | version 2.59 | version 2.72 | version 2.75 |
| --- | --- | --- | --- |
| equilibrate | 18.7s | 21.0s | error |
| pmodes | 54.7s | *7.5s* | |
| evidence | 0.0s | 0.5s | |
| saving | 0.1s | 0.2s | |
| loading | 0.2s | *2.3s* | |
The error for version 2.75 is:
NameError: name 'log' is not defined
The timing problem is this: version 2.72 is much faster at estimating the partition modes, but loading the saved pickle file takes much longer.
The MWE uses a short MCMC chain (1000 epochs). For longer chains (100,000 epochs), loading finishes within 1 minute with version 2.59 but takes at least 3-4 minutes with version 2.72.
Is there any efficient way to save and load the variables?
I also tried using dill in place of pickle:
import dill as pickle
but it did not improve things much.
Many thanks,
Govinda