Reputation: 11
I have a problem with a histogram code.Sorry if the question is simple I am new
I have this table:
And a file directory with files in this format: en_Brain_Amygdala.db_brain_predict_1000.txt_european_brain_predict_1000
(I know the name is kinda of a mess.Its an issue with another code). These files have two columns.One gene name and one weight numbers
What I am trying to do is to use these files to create histograms for a specific group of genes(meaning tissue\gene separate histograms) using the weight numbers and add as a red line in the histogram the value from the table.I am submitting what I have built so far
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from concurrent.futures import ProcessPoolExecutor
# Parameters and directories
Pnr = 2
part_dir = f"/mnt/hdd_3tb/GenomeData/Participant00{Pnr}/Results/1000genomes_PrediXcan/"
output_dir = f"/mnt/hdd_3tb/GenomeData/Participant00{Pnr}/Pictures/"
df_genes = pd.read_csv('/mnt/internserver1_data/ReportDatabase/UniProt/gene_list.csv', sep=',')
# Extract SCN-related genes
df_ENSG = df_genes[df_genes['gene'].str.contains('SCN', na=False)]
df_ENSG_dict = df_ENSG.set_index('gene')['ENSG'].to_dict()
# Match prediction and header files
brain_dbs = [
f for f in os.listdir(part_dir)
if f.startswith("en_Brain") and f.endswith("european_brain_predict_1000")
]
header_dbs = [
f for f in os.listdir(part_dir)
if f.startswith("en_Brain") and f.endswith("brain_predict_1000.txt")
]
# Create a mapping from prediction files to their corresponding header files
def normalize_name(filename):
return filename.replace("_european_brain_predict_1000", "").replace(".db_brain_predict_1000.txt", "").replace(".txt", "")
# Create a mapping from header base names to full filenames
header_dict = {
normalize_name(header): header for header in header_dbs
}
# Load the dataframe for the histograms with red lines
dataframe_path = f"/mnt/hdd_3tb/GenomeData/Participant00{Pnr}/Results/filtered_df_all.csv"
df_all = pd.read_csv(dataframe_path, index_col=0)
df_all.columns = [col.split('.')[0] for col in df_all.columns] # Clean gene names (remove version)
# Function to generate histogram for each file
def generate_histogram(db_file,df, output_dir, df_all, df_ENSG_dict):
db_base_name = db_file.replace("_european_brain_predict_1000", "")
red_line_plotted_for_file = False
for gene, ENSG in df_ENSG_dict.items():
# Check if ENSG is part of any column name in the dataframe (as a substring match)
matching_columns = [col for col in df.columns if ENSG[:10] in col] # Partial matching using the first 10 characters of the ENSG ID
if matching_columns:
for col in matching_columns:
column_data = df[col]
# Create the histogram plot and save it directly if the gene matches
fig, ax = plt.subplots()
ax.hist(column_data)
ax.set_xlabel('Predicted Gene Expression')
ax.set_ylabel('Number of People')
# Check if we need to add a red line
if gene in df_all.columns:
tissue_name_from_file = db_base_name.split('_')[1]
matched_tissues = [tissue for tissue in df_all.index if tissue_name_from_file in tissue]
# Debugging: Check the tissues being matched
if matched_tissues:
matched_tissue = matched_tissues[0]
value = df_all.loc[matched_tissue, gene]
try:
value = float(value)
if not pd.isna(value):
ax.axvline(x=value, color='red', linestyle='--', linewidth=2, label=f'Value: {value}')
ax.legend()
red_line_plotted_for_file = True
except ValueError:
print(f"Skipping non-numeric value for gene {gene} in {db_base_name}")
pass
# Debug print to ensure saving
if red_line_plotted_for_file:
output_path = os.path.join(output_dir, f"{db_base_name}_{col}_{gene}_histogram.png")
fig.savefig(output_path, bbox_inches='tight', pad_inches=0)
plt.close(fig) # Close the figure after saving
else:
print(f"No red line plotted for {gene} in {db_base_name}, skipping save.")
# Function to read and process each file
def process_files(db_file):
db_base_name = normalize_name(db_file)
header_filename = header_dict.get(db_base_name)
if header_filename:
header_file_path = os.path.join(part_dir, header_filename)
if os.path.exists(header_file_path):
df_header = pd.read_csv(header_file_path, sep='\t', nrows=0)
column_names = df_header.columns
else:
print(f"Header file {header_file_path} does not exist!")
return
else:
print(f"Header file for {db_file} not found in header_dict!")
return
# Read the prediction file
data_file_path = os.path.join(part_dir, db_file)
df = pd.read_csv(data_file_path, sep='\t', header=None)
df.columns = column_names # Apply headers
# Call the function to generate histograms
generate_histogram(db_file, df, output_dir, df_all, df_ENSG_dict)
# Use parallel processing to speed up file processing
with ProcessPoolExecutor() as executor:
executor.map(process_files, brain_dbs)
print("Histograms with red lines have been generated and saved.")
Upvotes: 1
Views: 68