Reputation: 11
In this code I want to track the largest community in a network. The network has multiple snapshots: I detect communities in the first snapshot using the "Louvain" algorithm, then select the largest community and track it through the successive snapshots using the ALPA algorithm.
https://github.com/afternone/ALPA.jl
My problem is that after the second snapshot the functions "label_propagation" and "track_community_llp" assign most of the nodes in the network to the largest community. Can anyone help me solve this issue?
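My input file is an Excel edge list with three columns: Snapshot (the snapshot year), Source and Target (the endpoints of each edge). A small made-up sample in the same shape (the real file is much larger):

import pandas as pd

# Hypothetical edge list with the same columns the code below expects
df_example = pd.DataFrame({
    'Snapshot': [1, 1, 1, 2, 2],
    'Source': ['A', 'B', 'A', 'C', 'D'],
    'Target': ['B', 'C', 'C', 'D', 'E']
})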
"import pandas as pd
import networkx as nx
import community as community_louvain
from collections import defaultdict
import random
import math
# Load the dataset from an Excel file
def load_edge_list(file_path):
    df = pd.read_excel(file_path)
    return df
# Create a graph from the first snapshot and detect communities using Louvain
def detect_communities_louvain(df, year):
    # Filter edges for the first year (snapshot)
    df_snapshot = df[df['Snapshot'] <= year]
    # Create a graph from the edge list
    G = nx.from_pandas_edgelist(df_snapshot, 'Source', 'Target')
    # Apply the Louvain algorithm for community detection
    partition = community_louvain.best_partition(G)
    # Convert the partition dictionary into a community-to-nodes mapping
    communities = defaultdict(list)
    for node, comm_id in partition.items():
        communities[comm_id].append(node)
    # Select the largest community
    largest_community = max(communities.values(), key=len)
    return G, largest_community
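# Example of the shapes above (hypothetical node ids): best_partition returns a
# node-to-community dict such as {'A': 0, 'B': 0, 'C': 1}; the loop turns it into
# communities = {0: ['A', 'B'], 1: ['C']}, so largest_community would be ['A', 'B'].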
# Apply Local Label Propagation (LLP) for updating community labels in subsequent snapshots
def label_propagation(G, active_node_list):
    labels = {node: node for node in G.nodes()}  # Initialize each node with its own label
    while active_node_list:
        node = active_node_list.pop(random.randint(0, len(active_node_list) - 1))  # Pick a random active node
        if node not in G.nodes():  # Skip nodes that are no longer in the graph
            continue
        # Collect the labels of the node's neighbors
        neighbor_labels = [labels[neighbor] for neighbor in G.neighbors(node) if neighbor in labels]
        if neighbor_labels:
            # Assign the node the label that most of its neighbors have
            most_common_label = max(set(neighbor_labels), key=neighbor_labels.count)
            if labels[node] != most_common_label:
                labels[node] = most_common_label
                # Re-activate the neighbors when the label of the current node changes
                active_node_list.extend([n for n in G.neighbors(node) if n not in active_node_list])
    # Return the updated labels
    return labels
# Track the largest community across the snapshots using LLP
def track_community_llp(df, years, initial_community):
    tracked_communities = {years[0]: initial_community}
    for year in years[1:]:
        # Filter edges for the current year (add the incremental edges)
        df_snapshot = df[df['Snapshot'] <= year]
        # Create a new graph from the current snapshot
        G = nx.from_pandas_edgelist(df_snapshot, 'Source', 'Target')
        # Initialize the active node list with the largest community from the previous snapshot
        active_node_list = [node for node in initial_community if node in G.nodes()]
        # Perform LLP for the community updates
        labels = label_propagation(G, active_node_list)
        # Keep the nodes that are still in the largest community by their labels
        # (this is the step that currently selects almost every node)
        tracked_community = [node for node in labels if labels[node] == labels.get(node, None)]
        tracked_communities[year] = tracked_community
        # Update the initial community for the next snapshot
        initial_community = tracked_community
    return tracked_communities
# Function to perturb each snapshot's community by removing its highest internal-degree node
def singleNode_perturbation(tracked_communities, G_snapshots):
    perturbed_communities = {}    # Communities after the node removal
    removed_nodes = {}            # Node removed from each snapshot's community
    removed_node_degrees = {}     # Internal degree of each removed node
    # Loop over each year and remove the highest internal-degree node separately for each snapshot
    for year, community in tracked_communities.items():
        community = list(community)  # Work on a copy so the tracked community stays intact
        community_subgraph = G_snapshots[year].subgraph(community)
        # Internal degree = number of neighbors inside the community
        internal_degrees = dict(community_subgraph.degree())
        if internal_degrees:  # Skip empty communities
            # Find and remove the node with the highest internal degree in this snapshot
            removed_node = max(internal_degrees, key=internal_degrees.get)
            removed_nodes[year] = removed_node
            removed_node_degrees[year] = internal_degrees[removed_node]
            community.remove(removed_node)
        perturbed_communities[year] = community
    return perturbed_communities, removed_nodes, removed_node_degrees
# Main Function
def main():
    # Input and output file paths
    input_file = r"C:\Users\Muhammad\Desktop\Test Simulation2\Facebook\Facebook_edges.xlsx"  # Replace with the actual path to your input Excel file
    output_data = []
    # Load the dataset
    df = load_edge_list(input_file)
    # Get the unique list of years (timestamps) sorted in ascending order
    years = sorted(df['Snapshot'].unique())
    # Detect communities in the first snapshot using Louvain
    G, largest_community = detect_communities_louvain(df, years[0])
    # Track the largest community through the subsequent snapshots using LLP
    tracked_communities = track_community_llp(df, years, largest_community)
    # Store the cumulative snapshot graphs for all years
    G_snapshots = {}
    for year in years:
        df_snapshot = df[df['Snapshot'] <= year]
        G_snapshots[year] = nx.from_pandas_edgelist(df_snapshot, 'Source', 'Target')
    # Perform the perturbation once for all snapshots (not inside the per-year loop)
    perturbed_communities, removed_nodes, removed_node_degrees = singleNode_perturbation(tracked_communities, G_snapshots)
    # Compute the metrics per snapshot and save the results to an Excel file
    for year in years:
        # Metrics of the tracked community before the perturbation
        community_nodes = tracked_communities.get(year, [])
        community_subgraph = G_snapshots[year].subgraph(community_nodes)
        best_comm_size = len(community_nodes)  # Number of nodes in the selected community
        best_comm_edges = community_subgraph.number_of_edges()  # Number of edges in the selected community
        density = nx.density(community_subgraph) if best_comm_size > 1 else 0  # Density of the community
        avg_degree = sum(dict(community_subgraph.degree()).values()) / best_comm_size if best_comm_size > 0 else 0  # Average degree of the community
        # Metrics of the community after the single-node removal
        perturbed_nodes = perturbed_communities.get(year, [])
        perturbed_subgraph = G_snapshots[year].subgraph(perturbed_nodes)
        # Add the data for the current snapshot
        output_data.append({
            'Snapshot': year,
            'Size': G_snapshots[year].number_of_nodes(),   # Total number of nodes up to this snapshot
            'Edges': G_snapshots[year].number_of_edges(),  # Total number of edges up to this snapshot
            'BestCom_Size_be': best_comm_size,
            'Density': density,
            'AverageD': avg_degree,
            'Rmv_NodeID': removed_nodes.get(year),
            'Rmv_Node_Deg': removed_node_degrees.get(year, 0),
            'BestCom_Edges_be': best_comm_edges,
            'BestCom_Edges_af': perturbed_subgraph.number_of_edges()
        })
    df_results = pd.DataFrame(output_data)
    df_results.to_excel(r"C:\Users\Muhammad\Desktop\Test Simulation2\Test Results\Results1.xlsx", sheet_name='Results', index=False)

# Run the main function
if __name__ == "__main__":
    main()
"
Upvotes: 0
Views: 12