Evangelia146
Evangelia146

Reputation: 11

Issue with histogram creation

I have a problem with a histogram code.Sorry if the question is simple I am new

I have this table:

enter image description here

And a file directory with files in this format: en_Brain_Amygdala.db_brain_predict_1000.txt_european_brain_predict_1000 (I know the name is kinda of a mess.Its an issue with another code). These files have two columns.One gene name and one weight numbers

What I am trying to do is to use these files to create histograms for a specific group of genes(meaning tissue\gene separate histograms) using the weight numbers and add as a red line in the histogram the value from the table.I am submitting what I have built so far

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from concurrent.futures import ProcessPoolExecutor

# Parameters and directories
Pnr = 2
part_dir = f"/mnt/hdd_3tb/GenomeData/Participant00{Pnr}/Results/1000genomes_PrediXcan/"
output_dir = f"/mnt/hdd_3tb/GenomeData/Participant00{Pnr}/Pictures/"
df_genes = pd.read_csv('/mnt/internserver1_data/ReportDatabase/UniProt/gene_list.csv', sep=',')

# Extract SCN-related genes
df_ENSG = df_genes[df_genes['gene'].str.contains('SCN', na=False)]
df_ENSG_dict = df_ENSG.set_index('gene')['ENSG'].to_dict()

# Match prediction and header files
brain_dbs = [
    f for f in os.listdir(part_dir)
    if f.startswith("en_Brain") and f.endswith("european_brain_predict_1000")
]

header_dbs = [
    f for f in os.listdir(part_dir)
    if f.startswith("en_Brain") and f.endswith("brain_predict_1000.txt")
]

# Create a mapping from prediction files to their corresponding header files
def normalize_name(filename):
    return filename.replace("_european_brain_predict_1000", "").replace(".db_brain_predict_1000.txt", "").replace(".txt", "")

# Create a mapping from header base names to full filenames
header_dict = {
    normalize_name(header): header for header in header_dbs
}


# Load the dataframe for the histograms with red lines
dataframe_path = f"/mnt/hdd_3tb/GenomeData/Participant00{Pnr}/Results/filtered_df_all.csv"
df_all = pd.read_csv(dataframe_path, index_col=0)
df_all.columns = [col.split('.')[0] for col in df_all.columns]  # Clean gene names (remove version)
# Function to generate histogram for each file
def generate_histogram(db_file,df, output_dir, df_all, df_ENSG_dict):
   db_base_name = db_file.replace("_european_brain_predict_1000", "")
   red_line_plotted_for_file = False
    
   for gene, ENSG in df_ENSG_dict.items():
        
        # Check if ENSG is part of any column name in the dataframe (as a substring match)
        matching_columns = [col for col in df.columns if ENSG[:10] in col]  # Partial matching using the first 10 characters of the ENSG ID
        
        if matching_columns:
            for col in matching_columns:
                column_data = df[col]
          
                # Create the histogram plot and save it directly if the gene matches
                fig, ax = plt.subplots()

                ax.hist(column_data)
                ax.set_xlabel('Predicted Gene Expression')
                ax.set_ylabel('Number of People')

                # Check if we need to add a red line
                if gene in df_all.columns:
                    tissue_name_from_file = db_base_name.split('_')[1]
                    matched_tissues = [tissue for tissue in df_all.index if tissue_name_from_file in tissue]
                    # Debugging: Check the tissues being matched
                    if matched_tissues:
                        matched_tissue = matched_tissues[0]
                        value = df_all.loc[matched_tissue, gene]
                        
                        try:
                            value = float(value)
                            if not pd.isna(value):
                                ax.axvline(x=value, color='red', linestyle='--', linewidth=2, label=f'Value: {value}')
                                ax.legend()
                                red_line_plotted_for_file = True
                        except ValueError:
                            print(f"Skipping non-numeric value for gene {gene} in {db_base_name}")
                            pass

                # Debug print to ensure saving
                if red_line_plotted_for_file:
                    output_path = os.path.join(output_dir, f"{db_base_name}_{col}_{gene}_histogram.png")
                    fig.savefig(output_path, bbox_inches='tight', pad_inches=0)
                    plt.close(fig)  # Close the figure after saving
                else:
                    print(f"No red line plotted for {gene} in {db_base_name}, skipping save.")

# Function to read and process each file
def process_files(db_file):
    db_base_name = normalize_name(db_file)
    header_filename = header_dict.get(db_base_name)
    
    if header_filename:
        header_file_path = os.path.join(part_dir, header_filename)
        if os.path.exists(header_file_path):
           df_header = pd.read_csv(header_file_path, sep='\t', nrows=0)
           column_names = df_header.columns
        else:
            print(f"Header file {header_file_path} does not exist!")
            return
    else:
        print(f"Header file for {db_file} not found in header_dict!")
        return

    # Read the prediction file
    data_file_path = os.path.join(part_dir, db_file)
    df = pd.read_csv(data_file_path, sep='\t', header=None)
    df.columns = column_names  # Apply headers

    # Call the function to generate histograms
    generate_histogram(db_file, df, output_dir, df_all, df_ENSG_dict)


# Use parallel processing to speed up file processing
with ProcessPoolExecutor() as executor:
   executor.map(process_files, brain_dbs)

print("Histograms with red lines have been generated and saved.")

Upvotes: 1

Views: 68

Answers (0)

Related Questions