Muhammad Bayat

Reputation: 11

Problem with community tracking using the ALPA algorithm

In this code I want to track the largest community in a network. The network has multiple snapshots: I detect communities in the first snapshot using the "Louvain" algorithm, select the largest community, and then track that community through the successive snapshots using the ALPA algorithm.

https://github.com/afternone/ALPA.jl

My problem is that from the second snapshot onward, the functions "label_propagation" and "track_community_llp" select most of the nodes in the network as part of the largest community. Can anyone help me solve this issue?
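
To show the pattern the code follows, here is a minimal, self-contained sketch of just the first step (Louvain detection plus largest-community selection), using networkx's built-in karate club graph as a stand-in for my Excel data. This step behaves as I expect:

import networkx as nx
import community as community_louvain  # the python-louvain package
from collections import defaultdict

# Stand-in graph; the real code builds each snapshot from an edge list
G = nx.karate_club_graph()
partition = community_louvain.best_partition(G)  # maps node -> community id

# Group nodes by community id and pick the largest community
communities = defaultdict(list)
for node, comm_id in partition.items():
    communities[comm_id].append(node)
largest_community = max(communities.values(), key=len)
print(len(largest_community), "nodes in the largest community")

The full script is below: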

"import pandas as pd
import networkx as nx
import community as community_louvain
from collections import defaultdict
import random


# Load the dataset from an Excel file
def load_edge_list(file_path):
    df = pd.read_excel(file_path)
    return df

# Create a graph from the first snapshot and detect communities using Louvain
def detect_communities_louvain(df, year):
    # Filter edges for the first year (snapshot)
    df_snapshot = df[df['Snapshot'] <= year]
    
    # Create a graph from the edge list
    G = nx.from_pandas_edgelist(df_snapshot, 'Source', 'Target')
    
    # Apply Louvain algorithm for community detection
    partition = community_louvain.best_partition(G)
    
    # Convert the partition dictionary into a community-to-nodes mapping
    communities = defaultdict(list)
    for node, comm_id in partition.items():
        communities[comm_id].append(node)
    
    # Select the largest community
    largest_community = max(communities.values(), key=len)
    return G, largest_community

# Apply Local Label Propagation (LLP) for updating community labels in subsequent snapshots
def label_propagation(G, active_node_list):
    labels = {node: node for node in G.nodes()}  # Initialize each node with its own label
    
    while active_node_list:
        node = active_node_list.pop(random.randint(0, len(active_node_list)-1))  # Pick a random active node
        
        if node not in G.nodes():  # Skip nodes that are no longer in the graph
            continue
        
        # Count the labels of the node's neighbors
        neighbor_labels = [labels[neighbor] for neighbor in G.neighbors(node) if neighbor in labels]
        if neighbor_labels:
            # Assign the node the label that most of its neighbors have
            most_common_label = max(set(neighbor_labels), key=neighbor_labels.count)
            if labels[node] != most_common_label:
                labels[node] = most_common_label
                # Activate the neighbors if the label of the current node changes
                active_node_list.extend([n for n in G.neighbors(node) if n not in active_node_list])
    
    # Return the updated labels
    return labels

# Track the largest community across the snapshots using LLP
def track_community_llp(df, years, initial_community):
    tracked_communities = {years[0]: initial_community}
    
    for year in years[1:]:
        # Filter edges for the current year (add incremental edges)
        df_snapshot = df[df['Snapshot'] <= year]
        
        # Create a new graph from the current snapshot
        G = nx.from_pandas_edgelist(df_snapshot, 'Source', 'Target')
        
        # Initialize the active node list with the largest community from the previous snapshot
        active_node_list = [node for node in initial_community if node in G.nodes()]
        
        # Perform LLP for community updates
        labels = label_propagation(G, active_node_list)
        
        # Track nodes that are still in the largest community by their labels
        tracked_community = [node for node in labels if labels[node] == labels.get(node, None)]
        tracked_communities[year] = tracked_community
        
        # Update the initial community for the next snapshot
        initial_community = tracked_community

    return tracked_communities


# Function to perturb each community by removing its highest internal degree node
def singleNode_perturbation(tracked_communities, G_snapshots):
    removed_nodes = {}         # Node removed from each snapshot's community
    removed_node_degrees = {}  # Internal degree of each removed node
    
    # Loop over each year to identify and remove the highest internal degree node separately for each snapshot
    for year, community in tracked_communities.items():
        G_snapshot = G_snapshots[year]
        community_subgraph = G_snapshot.subgraph(community)
        
        # Internal degree = degree within the community subgraph
        internal_degrees = dict(community_subgraph.degree())
        
        # Find the node with the highest internal degree in this snapshot
        if internal_degrees:  # Skip empty communities
            removed_node = max(internal_degrees, key=internal_degrees.get)
            removed_nodes[year] = removed_node
            removed_node_degrees[year] = internal_degrees[removed_node]
            
            # Remove this node from the community in the current snapshot
            tracked_communities[year].remove(removed_node)
                
    return tracked_communities, removed_nodes, removed_node_degrees

# Main Function
def main():
    # Input and output file paths
    input_file = r"C:\Users\Muhammad\Desktop\Test Simulation2\Facebook\Facebook_edges.xlsx"   # Replace with the actual path to your input Excel file
    output_data = []
    
    # Load the dataset
    df = load_edge_list(input_file)
    
    # Get the unique list of years (timestamps) sorted in ascending order
    years = sorted(df['Snapshot'].unique())
    
    # Detect communities in the first snapshot using Louvain
    G, largest_community = detect_communities_louvain(df, years[0])
    
    # Track the largest community through subsequent snapshots using LLP
    tracked_communities = track_community_llp(df, years, largest_community)
    
    # Store snapshots for all years
    G_snapshots = {}
    for year in years:
        df_snapshot = df[df['Snapshot'] <= year]
        G_snapshots[year] = nx.from_pandas_edgelist(df_snapshot, 'Source', 'Target')
    
    # Perform the single-node perturbation once, up front: the function mutates
    # the communities it is given, so calling it inside the loop below would
    # strip an extra node from every community on each iteration. Passing a
    # copy keeps the original communities intact for the "before" metrics.
    perturbed_communities, removed_nodes, removed_node_degrees = singleNode_perturbation(
        {yr: list(nodes) for yr, nodes in tracked_communities.items()}, G_snapshots)
    
    # Save the tracked community metrics for each snapshot to an Excel file
    for year in years:
        # Get the tracked community for the current snapshot
        community_nodes = tracked_communities.get(year, [])
        community_subgraph = G_snapshots[year].subgraph(community_nodes)
        
        # Compute the metrics
        best_comm_size = len(community_nodes)  # Number of nodes in the selected community
        best_comm_edges = community_subgraph.number_of_edges()  # Number of edges in the selected community
        density = nx.density(community_subgraph) if best_comm_size > 1 else 0  # Density of the community
        avg_degree = sum(dict(community_subgraph.degree()).values()) / best_comm_size if best_comm_size > 0 else 0  # Average degree of the community
        
        
        # Look up the perturbation results for the current snapshot
        removed_node = removed_nodes.get(year)
        removed_node_degree = removed_node_degrees.get(year, 0)
        
        # Rebuild the community subgraph after the node removal
        perturbed_community_nodes = perturbed_communities.get(year, [])
        perturbed_subgraph = G_snapshots[year].subgraph(perturbed_community_nodes)
        
        perturbed_community_size = len(perturbed_community_nodes)
        perturbed_community_edges = perturbed_subgraph.number_of_edges()
                
                             
        # Add the data for the current snapshot
        output_data.append({
            'Snapshot': year,
            'Size': len(G_snapshots[year].nodes()),     # Total number of nodes up to this snapshot 
            'Edges': len(G_snapshots[year].edges()),  # Total number of edges up to this snapshot
            'BestCom_Size_be': best_comm_size,
            'Density': density,
            'AverageD': avg_degree,
            'Rmv_NodeID': removed_node,
            'Rmv_Node_Deg': removed_node_degree,
            'BestCom_Edges_be': best_comm_edges,
            'BestCom_Edges_af': perturbed_community_edges
        })
    
    df_results = pd.DataFrame(output_data)
    df_results.to_excel(r"C:\Users\Muhammad\Desktop\Test Simulation2\Test Results\Results1.xlsx", sheet_name='Results', index=False)

# Run the main function
if __name__ == "__main__":
    main()
"

Upvotes: 0

Views: 12

Answers (0)
