Reputation: 140
I am attempting to implement the Louvain algorithm in python. On testing it on the Karate Club dataset, although there is a correct answer, it is not exactly the same as my slower implementation of the same, which instead of calculating the modularity change, calculates the modularity itself. The slower version works fine. However, this version does not work fine on my dataset and gets stuck in a loop of reducing and increasing the modularity.
I have fixed some bugs I the implementation but am unable to see what else I am doing wrong. The code is given below.
from graph_tool.all import *
class MyList:
def __init__(self, arr):
self.arr = arr
# https://stackoverflow.com/questions/1047021/overriding-in-python-iadd-method
# https://stackoverflow.com/questions/5082190/typeerror-after-overriding-the-add-method
def __add__(self,other_arr):
if type(other_arr) == list:
self.arr.extend(other_arr)
if type(other_arr) == MyList:
self.arr.extend(other_arr.arr)
return self
def __radd__(self,other_arr):
if type(other_arr) == list:
self.arr.extend(other_arr)
if type(other_arr) == MyList:
self.arr.extend(other_arr.arr)
return self
import time
from collections import Counter
from tabulate import tabulate
def getModularity(G, parts_idx, edge_weights):
return modularity(G, parts_idx, weight=edge_weights)
def moveNodes(G, parts_idx, edge_weights, vertex_degrees, total_edges):
# https://arxiv.org/pdf/1810.08473.pdf
count = 0
# Note that when we called this function, the vertex degrees passed here,
# are actually the sum totals of each node in a community giving us
# sum_tot as required for finding dQ. (See report)
# So we make a copy of the community degrees for us to modify, so as to
# speed up dQ calculations. Every time we move a vertex to a different
# community, we subtract and add its degree from the community it moved
# from and the community it moved to respectively.
community_tots = {}
for comm_idx,tot in zip(parts_idx.a, vertex_degrees.a):
community_tots[comm_idx] = tot
# Louvain's Part 1 Main Loop
while True:
# Get the modularity before we try to move each vertex.
mod_old = getModularity(G, parts_idx, edge_weights)
# Now go over each vertex to find it's best community
for v in G.iter_vertices():
# if (count == 2):
# print("mod:", getModularity(G, parts_idx, edge_weights))
# Keep track of communities we have to visit along with the number
# of edges between v and each community.
neighbour_communities = {parts_idx[v] : 0}
# Now go through all edges of v to find neighbouring communities.
for s, t, weight in G.iter_all_edges(v, eprops=[edge_weights]):
# We do not need to explicitly check for self loops as that is
# handled by the s == v check.
# Check if s is v
if s == v:
# https://stackoverflow.com/questions/42315072/python-update-a-key-in-dict-if-it-doesnt-exist
# Add community if not seen before
if parts_idx[t] not in neighbour_communities:
neighbour_communities[parts_idx[t]] = weight
# Else add the weight
else:
neighbour_communities[parts_idx[t]] += weight
# Do the same for t
if t == v:
# https://stackoverflow.com/questions/42315072/python-update-a-key-in-dict-if-it-doesnt-exist
# Add community if not seen before
if parts_idx[s] not in neighbour_communities:
neighbour_communities[parts_idx[s]] = weight
# Else add the weight
else:
neighbour_communities[parts_idx[s]] += weight
# The dQ if we do not change anything is 0.
curr_max_dQ = 0
# The current best swap is no swap as dQ would remain 0.
curr_best_swap = parts_idx[v]
# Precompute k_v/m
# precomp_coeff = vertex_degrees[v]*precomp_denom2
# Prefetch k_v_own
k_v_own = neighbour_communities[parts_idx[v]]
# Prefetch own community tot
own_community_tot = community_tots[parts_idx[v]]
# Now go through the neighbouring communities to maximize dQ.
# https://stackoverflow.com/questions/3294889/iterating-over-dictionaries-using-for-loops
for community, k_v_comm in neighbour_communities.items():
# Skip if it is our own community
if community == parts_idx[v]:
continue
# Calculate dQ
dQ = k_v_comm - vertex_degrees[v]*community_tots[community]/(2*total_edges) + vertex_degrees[v]*own_community_tot/(2*total_edges) - k_v_own
# print(community, dQ)
# Update our curr vals if dQ bigger than our curr max dQ
if dQ >= curr_max_dQ:
curr_max_dQ = dQ
curr_best_swap = community
# Now update v's community if we found a better one
if curr_max_dQ > 0:
# First update community_tots
community_tots[curr_best_swap] += vertex_degrees[v]
community_tots[parts_idx[v]] -= vertex_degrees[v]
# Now change v's community
parts_idx[v] = curr_best_swap
# print(parts_idx.a)
count += 1
mod_new = getModularity(G, parts_idx, edge_weights)
print(f"After moveNodes iteration {count}:", mod_new)
if mod_new == mod_old:
break
return len(set(list(parts_idx)))
def Louvain(G):
# https://arxiv.org/pdf/1810.08473.pdf
# Copy to prevent unintended changes
G_copy = G.copy()
# Create a list of MyLists where each MyList contains the list of vertices
# a community contains.
master_partition = G_copy.new_vp("object")
for idx,v in enumerate(G_copy.iter_vertices()):
master_partition[v] = MyList([G_copy.vertex_index[idx]])
# To keep track of the number communities
num_partitions = G_copy.num_vertices()
# To calculate dQ
total_edges = G_copy.num_edges()
# To keep track of community index for easy swaps in moveNodes()
partition_idxs = G_copy.new_vp("int")
for i in range(G_copy.num_vertices()):
partition_idxs[i] = i
curr_iteration = 0
# To keep track of number of intra- and inter-community edges
edge_weights = G_copy.new_ep("int")
edge_weights.set_value(1)
# Louvain's Phase-2 Loop
while True:
# To keep track of vertex degrees
vertex_degrees = G_copy.degree_property_map("total", weight=edge_weights)
# Get the current modularity
old_mod = getModularity(G_copy, partition_idxs, edge_weights)
# Move nodes between communitites to improve modularity
num_partitions = moveNodes(G_copy, partition_idxs, edge_weights, vertex_degrees, total_edges)
curr_iteration += 1
# Get the new modularity
new_mod = getModularity(G_copy, partition_idxs, edge_weights)
print(f"After iteration {curr_iteration}", new_mod)
# If the change in modularity was less than 1e-7, then stop
if abs(old_mod - new_mod) < 0.0000001:
break
# Else condense the communities into singular nodes
G_copy, partition_idxs, vcount, ecount, avp, aep = condensation_graph(G_copy, partition_idxs, avprops=[master_partition], aeprops=[edge_weights], self_loops=True)
# Get the new weights, partition sets and degrees
edge_weights = aep[0]
master_partition = avp[0]
final_part_idxs = G.new_vp("int")
for part_idx,part in enumerate(master_partition):
for node in part.arr:
final_part_idxs[node] = part_idx
return final_part_idxs
# g.set_directed(False)
# print(g.is_directed())
# part_idxs = Louvain(g)
# with open("louvaincomms.txt", "w") as f:
# print(part_idxs, file=f)
# Use the below instead of above to test Louvain on the famous Zachary's Karate Club Dataset
# Results are validated by https://scholarworks.sjsu.edu/cgi/viewcontent.cgi?article=1528&context=etd_projects
g = collection.data["karate"]
part_idxs = Louvain(g)
# https://stackoverflow.com/questions/12282232/how-do-i-count-occurrence-of-unique-values-inside-a-list
parts = Counter(part_idxs).keys() # equals to list(set(part_idxs))-
freq = Counter(part_idxs).values() # counts the elements' frequency
# https://stackoverflow.com/questions/41140647/python-printing-lists-with-tabulate
table = zip(parts, freq)
print(tabulate(table, headers=["Community Index", "Number of Nodes"]))
The output is
Community Index Number of Nodes
----------------- -----------------
0 11
1 2
2 3
3 10
4 4
5 4
Upvotes: 1
Views: 264