Reputation: 2412
I have the following Python code using PyVCF library. This script is taking long time to parse my VCF file.
Here is the script:
# =============================================================================
# import libraries for python
# =============================================================================
import vcf
import os.path
import psycopg2
import datetime
import sys
# =============================================================================
# function drops the previous table if exists and creates new table with dynamic table name
# =============================================================================
def create_table(table_name,table_type,conn_obj):
dbcur = conn_obj.cursor()
dbcur.execute("""DROP TABLE IF EXISTS {0};""".format(table_name))
if table_type == '1' :
dbcur.execute("""CREATE TABLE IF NOT EXISTS {0} (as_ID INT, as_NM TEXT, as_at_ID INT, as_at_NM TEXT, VCF_ID TEXT, VARIANT_ID TEXT, as_at_LINE_SEQ INT, DATE_START DATE, DATE_END DATE, as_at_VAL_SEQ INT, as_at_VA_NM TEXT, as_at_VALUE TEXT);""".format(table_name))
elif table_type == '2' :
dbcur.execute("""CREATE TABLE IF NOT EXISTS {0} (as_at_VA_NM TEXT, as_at_VA_DESC TEXT,as_at_DATA_TYPE TEXT);""".format(table_name))
conn_obj.commit()
message = 'Table '+table_name+' created successfully.\n'
print(message)
return table_name
# =============================================================================
# function for insert vcf files data in values table
# =============================================================================
def create_tuples(as_id, vcf_id, Variant_id, as_at_nm, sample_id, as_at_va_nm, as_at_value, date_start, date_end, as_va_seq, as_at_va_line_seq,data_table_name):
as_at_id = '1'
sample_id = '1'
variant_id = '1'
as_nm = 'VCF'
datalist.append("({0},'{1}','{2}','{3}','{4}',{5},'{6}','{7}','{8}','{9}')".format(as_id,str(as_nm),as_at_id,as_at_nm,"",variant_id,as_at_va_line_seq,as_va_seq,as_at_va_nm,as_at_value))
if len(datalist)==200000:
insertdata(data_table_name,as_id)
def insertdata(data_table_name,as_id):
global datalist
global execution_start_time
global index
iter_datalist=iter(datalist)
args_str = ','.join(iter_datalist)
cursor.execute('INSERT INTO '+data_table_name+' (as_ID,as_NM,as_at_ID,as_at_NM,VCF_ID,VARIANT_ID,as_at_LINE_SEQ,as_at_VAL_SEQ,as_at_VA_NM,as_at_VALUE) VALUES '+args_str)
conn.commit()
datalist=[]
time_now = datetime.datetime.now()
time_difference = time_now - execution_start_time
print('Total inserted records:'+str(index)+' in time '+str(time_difference))
# =============================================================================
# function defination / switch case for get records variable name & variable value
# =============================================================================
def get_header_data(header):
switcher = {
"CHROM" : {'variable_name':'CHROM', 'variable_value': record.CHROM},
"POS" : {'variable_name':'POS', 'variable_value': record.POS},
"ID" : {'variable_name':'ID', 'variable_value': record.ID},
"REF" : {'variable_name':'REF', 'variable_value': record.REF},
"ALT" : {'variable_name':'ALT', 'variable_value': record.ALT},
"QUAL" : {'variable_name':'QUAL', 'variable_value': record.QUAL},
"FILTER" : {'variable_name':'FILTER', 'variable_value': record.FILTER},
"INFO" : {'variable_name':'INFO', 'variable_value': record.INFO},
"FORMAT" : {'variable_name':'FORMAT', 'variable_value': record.FORMAT}
}
return switcher.get(header, "Invalid header")
dic={}
def insert_infos_metadata(reader_infos,line_index,file_name,as_at_nm,variable_data_name):
for info_data in reader_infos:
va_desc = reader_infos[info_data].desc
arr_len = len(va_desc.split(':'))
if arr_len > 1 :
last_str = va_desc.split(':')[arr_len-1]
dic[info_data]=last_str
return line_index
# =============================================================================
# global variable declaration
# =============================================================================
index = 0
variableList = []
datalist = []
headers_arr = []
totalrecordinserted=0
curentRecord=-1
execution_start_time= datetime.datetime.now() # calculate execution start time
# =============================================================================
# get file path or name from user
# =============================================================================
file_path = "gnomad.vcf"
if os.path.isfile(file_path) : #Check file is exists or not on given path
if True:
conn = psycopg2.connect(host="localhost",database="mydb", user="postgres", password="pgAdmin")
cursor = conn.cursor()
data_table_name = "data"
variable_data_name = "vdata"
create_table("data","1",conn)
create_table("vdata","2",conn)
vcf_reader = vcf.Reader(open(file_path, 'r'))
column_headers = vcf_reader._column_headers
line_index = 0
null_date = None
as_at_header = 'Header'
as_at_variant = 'Variant'
reader_infos = vcf_reader.infos
line_index = insert_infos_metadata(reader_infos,line_index,file_path,as_at_header,variable_data_name)
execution_start_time= datetime.datetime.now()
for record in vcf_reader:
index += 1
line_index += 1
if index == 5000:
break
Variant_id = ''
sample_name = ''
#for header in column_headers :
for header in column_headers :
if header != 'FORMAT' :
header_data_dict = get_header_data(header)
#print(header_data_dict)
header_data_dict_value = header_data_dict['variable_value']
#print(header_data_dict_value)
if isinstance(header_data_dict_value, str) :
create_tuples("1", file_path, Variant_id, as_at_variant, sample_name, header, str(header_data_dict_value), null_date, null_date, 1, str(line_index),data_table_name)
elif isinstance(header_data_dict_value, list) :
if len(header_data_dict_value) == 0 and header == 'FILTER' :
create_tuples("1", file_path, Variant_id, as_at_variant, sample_name, header, 'PASS', null_date, null_date, 1, str(line_index),data_table_name)
else :
i = 0
while i < len(header_data_dict_value) :
create_tuples("1", file_path, Variant_id, as_at_variant, sample_name, header, str(header_data_dict_value[i]), null_date, null_date, str(i), str(line_index),data_table_name)
i += 1
elif isinstance(header_data_dict_value, dict) :
for dict_val_record in header_data_dict_value :
#print(dict_val_record,'.......',header_data_dict_value[dict_val_record])
variable_name = header + '/' + dict_val_record
variable_value = header_data_dict_value[dict_val_record]
#print('.........',variable_value)
variable_seq = 1
if isinstance(variable_value,list) :
for value in variable_value :
if dic.has_key(dict_val_record) :
header_key_description = vcf_reader.infos[dict_val_record].desc
arr_len = len(header_key_description.split(':'))
last_str = header_key_description.split(':')[arr_len-1]
array_obj_desc_headers = last_str.split('|')
arr_obj_val = value.strip("()[]").split("|")
vararray = dic[dict_val_record].split("|")
arr_index = 0
for desc_header_value in arr_obj_val :
variable_name = header+'/'+dict_val_record+'/'+vararray[arr_index].lstrip().replace("'","")
variable_value = arr_obj_val[arr_index]
arr_index += 1
create_tuples("1", file_path, Variant_id, as_at_variant, sample_name, variable_name, variable_value, null_date, null_date, str(variable_seq), str(line_index),data_table_name)
else :
create_tuples("1", file_path, Variant_id, as_at_variant, sample_name, variable_name, value, null_date, null_date, str(variable_seq), str(line_index),data_table_name)
variable_seq += 1
else :
create_tuples("1", file_path, Variant_id, as_at_variant, sample_name, variable_name, variable_value, null_date, null_date, str(variable_seq), str(line_index),data_table_name)
else :
variable_name = header
variable_value = str(header_data_dict_value)
create_tuples("1", file_path, Variant_id, as_at_variant, sample_name, variable_name, variable_value, null_date, null_date, 1, str(line_index),data_table_name)
else :
format_data_arr = (record.FORMAT).split(":")
for format_data_record in format_data_arr :
variable_name = header+'/'+format_data_record
variable_value = record.genotype(record_data.sample)[format_data_record]
create_tuples("1", file_path, Variant_id, as_at_variant, sample_name, variable_name, variable_value, null_date, null_date, 1,str(line_index),data_table_name)
insertdata(data_table_name,"1")
print('Variants inserted successfully.')
else:
print('Incorrect Choice.')
else:
print('File not present.')
Here is the input VCF file which we downloaded from gnomAD:
##fileformat=VCFv4.2
##hailversion=0.2.7-c860755b5da3
##INFO=<ID=vep,Number=.,Type=String,Description="Consequence annotations from Ensembl VEP. Format: Allele|Consequence|IMPACT|SYMBOL|Gene|Feature_type|Feature|BIOTYPE|EXON|INTRON|HGVSc|HGVSp|cDNA_position|CDS_position|Protein_position|Amino_acids|Codons|Existing_variation|ALLELE_NUM|DISTANCE|STRAND|FLAGS|VARIANT_CLASS|MINIMISED|SYMBOL_SOURCE|HGNC_ID|CANONICAL|TSL|APPRIS|CCDS|ENSP|SWISSPROT|TREMBL|UNIPARC|GENE_PHENO|SIFT|PolyPhen|DOMAINS|HGVS_OFFSET|GMAF|AFR_MAF|AMR_MAF|EAS_MAF|EUR_MAF|SAS_MAF|AA_MAF|EA_MAF|ExAC_MAF|ExAC_Adj_MAF|ExAC_AFR_MAF|ExAC_AMR_MAF|ExAC_EAS_MAF|ExAC_FIN_MAF|ExAC_NFE_MAF|ExAC_OTH_MAF|ExAC_SAS_MAF|CLIN_SIG|SOMATIC|PHENO|PUBMED|MOTIF_NAME|MOTIF_POS|HIGH_INF_POS|MOTIF_SCORE_CHANGE|LoF|LoF_filter|LoF_flags|LoF_info">
#CHROM POS ID REF ALT QUAL FILTER INFO
22 16050036 rs374742143 A C 442156.34 RF AC=80;AN=180;AF=4.44444e-01;rf_tp_probability=1.98655e-01;FS=7.05600e+00;InbreedingCoeff=3.82000e-01;MQ=2.71500e+01;MQRankSum=-1.02600e+00;QD=2.47500e+01;ReadPosRankSum=-2.11000e-01;SOR=1.26750e+01;BaseQRankSum=7.36000e-01;ClippingRankSum=2.96000e-01;DP=50165;VQSLOD=-9.58600e+02;VQSR_culprit=MQ;segdup;rf_negative_label;rf_label=FP;rf_train;variant_type=snv;allele_type=snv;n_alt_alleles=1;pab_max=1.00000e+00;gq_hist_alt_bin_freq=15|1348|318|278|171|32|20|31|51|152|82|5|9|13|35|44|8|4|2|41;gq_hist_all_bin_freq=2396|2885|745|685|311|58|38|37|54|152|82|5|9|13|35|44|8|4|2|41;dp_hist_alt_bin_freq=1872|720|58|9|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0;dp_hist_alt_n_larger=0;dp_hist_all_bin_freq=13607|1974|114|13|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0;dp_hist_all_n_larger=0;ab_hist_alt_bin_freq=0|0|0|2|6|26|42|12|99|3|137|31|63|99|21|66|37|9|1|7;AC_nfe_seu=0;AN_nfe_seu=4;AF_nfe_seu=0.00000e+00;nhomalt_nfe_seu=0;controls_AC_afr_male=0;controls_AN_afr_male=4;controls_AF_afr_male=0.00000e+00;controls_nhomalt_afr_male=0;non_topmed_AC_amr=6;non_topmed_AN_amr=8;non_topmed_AF_amr=7.50000e-01;non_topmed_nhomalt_amr=2;AC_raw=4657;AN_raw=15208;AF_raw=3.06220e-01;nhomalt_raw=1998;AC_fin_female=27;AN_fin_female=44;AF_fin_female=6.13636e-01;nhomalt_fin_female=5;non_neuro_AC_asj_female=1;non_neuro_AN_asj_female=2;non_neuro_AF_asj_female=5.00000e-01;non_neuro_nhomalt_asj_female=0;non_neuro_AC_afr_male=1;non_neuro_AN_afr_male=8;non_neuro_AF_afr_male=1.25000e-01;non_neuro_nhomalt_afr_male=0;AC_afr_male=2;AN_afr_male=10;AF_afr_male=2.00000e-01;nhomalt_afr_male=0;AC_afr=2;AN_afr=14;AF_afr=1.42857e-01;nhomalt_afr=0;non_neuro_AC_afr_female=0;non_neuro_AN_afr_female=2;non_neuro_AF_afr_female=0.00000e+00;non_neuro_nhomalt_afr_female=0;non_topmed_AC_amr_female=3;non_topmed_AN_amr_female=4;non_topmed_AF_amr_female=7.50000e-01;non_topmed_nhomalt_amr_female=1;non_topmed_AC_oth_female=3;non_topmed_AN_oth_female=6;non_topmed_AF_oth_female=5.00000e-01;non_topmed_nhomalt_oth_female=0;AC_eas_female=0;AN_eas_female=0;nhomalt_eas_female=0;AC_afr_female=0;AN_afr_female=4;AF_afr_female=0.00000e+00;nhomalt_afr_female=0;non_neuro_AC_female=16;non_neuro_AN_female=44;non_neuro_AF_female=3.63636e-01;non_neuro_nhomalt_female=2;controls_AC_afr=0;controls_AN_afr=4;controls_AF_afr=0.00000e+00;controls_nhomalt_afr=0;AC_nfe_onf=2;AN_nfe_onf=12;AF_nfe_onf=1.66667e-01;nhomalt_nfe_onf=0;controls_AC_fin_male=3;controls_AN_fin_male=10;controls_AF_fin_male=3.00000e-01;controls_nhomalt_fin_male=1;non_neuro_AC_nfe_nwe=13;non_neuro_AN_nfe_nwe=36;non_neuro_AF_nfe_nwe=3.61111e-01;non_neuro_nhomalt_nfe_nwe=4;AC_fin_male=18;AN_fin_male=34;AF_fin_male=5.29412e-01;nhomalt_fin_male=5;AC_nfe_female=5;AN_nfe_female=28;AF_nfe_female=1.78571e-01;nhomalt_nfe_female=0;AC_amr=6;AN_amr=8;AF_amr=7.50000e-01;nhomalt_amr=2;non_topmed_AC_nfe_male=12;non_topmed_AN_nfe_male=28;non_topmed_AF_nfe_male=4.28571e-01;non_topmed_nhomalt_nfe_male=4;AC_eas=2;AN_eas=2;AF_eas=1.00000e+00;nhomalt_eas=1;nhomalt=18;non_neuro_AC_nfe_female=5;non_neuro_AN_nfe_female=24;non_neuro_AF_nfe_female=2.08333e-01;non_neuro_nhomalt_nfe_female=0;non_neuro_AC_afr=1;non_neuro_AN_afr=10;non_neuro_AF_afr=1.00000e-01;non_neuro_nhomalt_afr=0;controls_AC_raw=1798;controls_AN_raw=5254;controls_AF_raw=3.42215e-01;controls_nhomalt_raw=765;controls_AC_male=18;controls_AN_male=38;controls_AF_male=4.73684e-01;controls_nhomalt_male=6;non_topmed_AC_male=41;non_topmed_AN_male=86;non_topmed_AF_male=4.76744e-01;non_topmed_nhomalt_male=12;controls_AC_nfe_female=5;controls_AN_nfe_female=16;controls_AF_nfe_female=3.12500e-01;controls_nhomalt_nfe_female=0;non_neuro_AC_amr=3;non_neuro_AN_amr=4;non_neuro_AF_amr=7.50000e-01;non_neuro_nhomalt_amr=1;non_neuro_AC_eas_female=0;non_neuro_AN_eas_female=0;non_neuro_nhomalt_eas_female=0;AC_asj_male=0;AN_asj_male=0;nhomalt_asj_male=0;controls_AC_nfe_male=12;controls_AN_nfe_male=18;controls_AF_nfe_male=6.66667e-01;controls_nhomalt_nfe_male=4;non_neuro_AC_fin=8;non_neuro_AN_fin=18;non_neuro_AF_fin=4.44444e-01;non_neuro_nhomalt_fin=2;AC_oth_female=3;AN_oth_female=6;AF_oth_female=5.00000e-01;nhomalt_oth_female=0;controls_AC_nfe=17;controls_AN_nfe=34;controls_AF_nfe=5.00000e-01;controls_nhomalt_nfe=4;controls_AC_oth_female=2;controls_AN_oth_female=4;controls_AF_oth_female=5.00000e-01;controls_nhomalt_oth_female=0;controls_AC_asj=1;controls_AN_asj=2;controls_AF_asj=5.00000e-01;controls_nhomalt_asj=0;non_neuro_AC_amr_male=0;non_neuro_AN_amr_male=0;non_neuro_nhomalt_amr_male=0;controls_AC_nfe_nwe=13;controls_AN_nfe_nwe=22;controls_AF_nfe_nwe=5.90909e-01;controls_nhomalt_nfe_nwe=4;AC_nfe_nwe=13;AN_nfe_nwe=42;AF_nfe_nwe=3.09524e-01;nhomalt_nfe_nwe=4;controls_AC_nfe_seu=0;controls_AN_nfe_seu=0;controls_nhomalt_nfe_seu=0;non_neuro_AC_amr_female=3;non_neuro_AN_amr_female=4;non_neuro_AF_amr_female=7.50000e-01;non_neuro_nhomalt_amr_female=1;non_neuro_AC_nfe_onf=2;non_neuro_AN_nfe_onf=8;non_neuro_AF_nfe_onf=2.50000e-01;non_neuro_nhomalt_nfe_onf=0;non_topmed_AC_eas_male=2;non_topmed_AN_eas_male=2;non_topmed_AF_eas_male=1.00000e+00;non_topmed_nhomalt_eas_male=1;controls_AC_amr_female=3;controls_AN_amr_female=4;controls_AF_amr_female=7.50000e-01;controls_nhomalt_amr_female=1;non_neuro_AC_fin_male=3;non_neuro_AN_fin_male=10;non_neuro_AF_fin_male=3.00000e-01;non_neuro_nhomalt_fin_male=1;AC_female=39;AN_female=88;AF_female=4.43182e-01;nhomalt_female=6;non_neuro_AC_oth_male=1;non_neuro_AN_oth_male=4;non_neuro_AF_oth_male=2.50000e-01;non_neuro_nhomalt_oth_male=0;non_topmed_AC_nfe_est=2;non_topmed_AN_nfe_est=4;non_topmed_AF_nfe_est=5.00000e-01;non_topmed_nhomalt_nfe_est=0;non_topmed_AC_nfe_nwe=13;non_topmed_AN_nfe_nwe=34;non_topmed_AF_nfe_nwe=3.82353e-01;non_topmed_nhomalt_nfe_nwe=4;non_topmed_AC_amr_male=3;non_topmed_AN_amr_male=4;non_topmed_AF_amr_male=7.50000e-01;non_topmed_nhomalt_amr_male=1;non_topmed_AC_nfe_onf=2;non_topmed_AN_nfe_onf=12;non_topmed_AF_nfe_onf=1.66667e-01;non_topmed_nhomalt_nfe_onf=0;controls_AC_eas_male=2;controls_AN_eas_male=2;controls_AF_eas_male=1.00000e+00;controls_nhomalt_eas_male=1;controls_AC_oth_male=1;controls_AN_oth_male=4;controls_AF_oth_male=2.50000e-01;controls_nhomalt_oth_male=0;non_topmed_AC=80;non_topmed_AN=170;non_topmed_AF=4.70588e-01;non_topmed_nhomalt=18;controls_AC_fin=8;controls_AN_fin=18;controls_AF_fin=4.44444e-01;controls_nhomalt_fin=2;non_neuro_AC_nfe=17;non_neuro_AN_nfe=48;non_neuro_AF_nfe=3.54167e-01;non_neuro_nhomalt_nfe=4;non_neuro_AC_fin_female=5;non_neuro_AN_fin_female=8;non_neuro_AF_fin_female=6.25000e-01;non_neuro_nhomalt_fin_female=1;non_topmed_AC_nfe_seu=0;non_topmed_AN_nfe_seu=4;non_topmed_AF_nfe_seu=0.00000e+00;non_topmed_nhomalt_nfe_seu=0;controls_AC_eas_female=0;controls_AN_eas_female=0;controls_nhomalt_eas_female=0;non_topmed_AC_asj=1;non_topmed_AN_asj=2;non_topmed_AF_asj=5.00000e-01;non_topmed_nhomalt_asj=0;controls_AC_nfe_onf=2;controls_AN_nfe_onf=8;controls_AF_nfe_onf=2.50000e-01;controls_nhomalt_nfe_onf=0;non_neuro_AC=35;non_neuro_AN=92;non_neuro_AF=3.80435e-01;non_neuro_nhomalt=8;non_topmed_AC_nfe=17;non_topmed_AN_nfe=54;non_topmed_AF_nfe=3.14815e-01;non_topmed_nhomalt_nfe=4;non_topmed_AC_raw=3996;non_topmed_AN_raw=12728;non_topmed_AF_raw=3.13953e-01;non_topmed_nhomalt_raw=1721;non_neuro_AC_nfe_est=2;non_neuro_AN_nfe_est=4;non_neuro_AF_nfe_est=5.00000e-01;non_neuro_nhomalt_nfe_est=0;non_topmed_AC_oth_male=4;non_topmed_AN_oth_male=8;non_topmed_AF_oth_male=5.00000e-01;non_topmed_nhomalt_oth_male=1;AC_nfe_est=2;AN_nfe_est=4;AF_nfe_est=5.00000e-01;nhomalt_nfe_est=0;non_topmed_AC_afr_male=2;non_topmed_AN_afr_male=10;non_topmed_AF_afr_male=2.00000e-01;non_topmed_nhomalt_afr_male=0;AC_eas_male=2;AN_eas_male=2;AF_eas_male=1.00000e+00;nhomalt_eas_male=1;controls_AC_eas=2;controls_AN_eas=2;controls_AF_eas=1.00000e+00;controls_nhomalt_eas=1;non_neuro_AC_eas_male=2;non_neuro_AN_eas_male=2;non_neuro_AF_eas_male=1.00000e+00;non_neuro_nhomalt_eas_male=1;non_neuro_AC_asj_male=0;non_neuro_AN_asj_male=0;non_neuro_nhomalt_asj_male=0;controls_AC_oth=3;controls_AN_oth=8;controls_AF_oth=3.75000e-01;controls_nhomalt_oth=0;AC_nfe=17;AN_nfe=62;AF_nfe=2.74194e-01;nhomalt_nfe=4;non_topmed_AC_female=39;non_topmed_AN_female=84;non_topmed_AF_female=4.64286e-01;non_topmed_nhomalt_female=6;non_neuro_AC_asj=1;non_neuro_AN_asj=2;non_neuro_AF_asj=5.00000e-01;non_neuro_nhomalt_asj=0;non_topmed_AC_eas_female=0;non_topmed_AN_eas_female=0;non_topmed_nhomalt_eas_female=0;non_neuro_AC_raw=3103;non_neuro_AN_raw=10138;non_neuro_AF_raw=3.06076e-01;non_neuro_nhomalt_raw=1328;non_topmed_AC_eas=2;non_topmed_AN_eas=2;non_topmed_AF_eas=1.00000e+00;non_topmed_nhomalt_eas=1;non_topmed_AC_fin_male=18;non_topmed_AN_fin_male=34;non_topmed_AF_fin_male=5.29412e-01;non_topmed_nhomalt_fin_male=5;AC_fin=45;AN_fin=78;AF_fin=5.76923e-01;nhomalt_fin=10;AC_nfe_male=12;AN_nfe_male=34;AF_nfe_male=3.52941e-01;nhomalt_nfe_male=4;controls_AC_amr_male=0;controls_AN_amr_male=0;controls_nhomalt_amr_male=0;controls_AC_afr_female=0;controls_AN_afr_female=0;controls_nhomalt_afr_female=0;controls_AC_amr=3;controls_AN_amr=4;controls_AF_amr=7.50000e-01;controls_nhomalt_amr=1;AC_asj_female=1;AN_asj_female=2;AF_asj_female=5.00000e-01;nhomalt_asj_female=0;non_neuro_AC_eas=2;non_neuro_AN_eas=2;non_neuro_AF_eas=1.00000e+00;non_neuro_nhomalt_eas=1;non_neuro_AC_male=19;non_neuro_AN_male=48;non_neuro_AF_male=3.95833e-01;non_neuro_nhomalt_male=6;AC_asj=1;AN_asj=2;AF_asj=5.00000e-01;nhomalt_asj=0;controls_AC_nfe_est=2;controls_AN_nfe_est=4;controls_AF_nfe_est=5.00000e-01;controls_nhomalt_nfe_est=0;non_topmed_AC_asj_female=1;non_topmed_AN_asj_female=2;non_topmed_AF_asj_female=5.00000e-01;non_topmed_nhomalt_asj_female=0;non_topmed_AC_oth=7;non_topmed_AN_oth=14;non_topmed_AF_oth=5.00000e-01;non_topmed_nhomalt_oth=1;non_topmed_AC_fin_female=27;non_topmed_AN_fin_female=44;non_topmed_AF_fin_female=6.13636e-01;non_topmed_nhomalt_fin_female=5;AC_oth=7;AN_oth=14;AF_oth=5.00000e-01;nhomalt_oth=1;non_neuro_AC_nfe_male=12;non_neuro_AN_nfe_male=24;non_neuro_AF_nfe_male=5.00000e-01;non_neuro_nhomalt_nfe_male=4;controls_AC_female=16;controls_AN_female=34;controls_AF_female=4.70588e-01;controls_nhomalt_female=2;non_topmed_AC_fin=45;non_topmed_AN_fin=78;non_topmed_AF_fin=5.76923e-01;non_topmed_nhomalt_fin=10;non_topmed_AC_nfe_female=5;non_topmed_AN_nfe_female=26;non_topmed_AF_nfe_female=1.92308e-01;non_topmed_nhomalt_nfe_female=0;controls_AC_asj_male=0;controls_AN_asj_male=0;controls_nhomalt_asj_male=0;non_topmed_AC_asj_male=0;non_topmed_AN_asj_male=0;non_topmed_nhomalt_asj_male=0;non_neuro_AC_oth=3;non_neuro_AN_oth=8;non_neuro_AF_oth=3.75000e-01;non_neuro_nhomalt_oth=0;AC_male=41;AN_male=92;AF_male=4.45652e-01;nhomalt_male=12;controls_AC_fin_female=5;controls_AN_fin_female=8;controls_AF_fin_female=6.25000e-01;controls_nhomalt_fin_female=1;controls_AC_asj_female=1;controls_AN_asj_female=2;controls_AF_asj_female=5.00000e-01;controls_nhomalt_asj_female=0;AC_amr_male=3;AN_amr_male=4;AF_amr_male=7.50000e-01;nhomalt_amr_male=1;AC_amr_female=3;AN_amr_female=4;AF_amr_female=7.50000e-01;nhomalt_amr_female=1;AC_oth_male=4;AN_oth_male=8;AF_oth_male=5.00000e-01;nhomalt_oth_male=1;non_neuro_AC_nfe_seu=0;non_neuro_AN_nfe_seu=0;non_neuro_nhomalt_nfe_seu=0;non_topmed_AC_afr_female=0;non_topmed_AN_afr_female=2;non_topmed_AF_afr_female=0.00000e+00;non_topmed_nhomalt_afr_female=0;non_topmed_AC_afr=2;non_topmed_AN_afr=12;non_topmed_AF_afr=1.66667e-01;non_topmed_nhomalt_afr=0;controls_AC=34;controls_AN=72;controls_AF=4.72222e-01;controls_nhomalt=8;non_neuro_AC_oth_female=2;non_neuro_AN_oth_female=4;non_neuro_AF_oth_female=5.00000e-01;non_neuro_nhomalt_oth_female=0;non_topmed_faf95_amr=3.26626e-01;non_topmed_faf99_amr=3.26626e-01;faf95_afr=2.53825e-02;faf99_afr=2.53821e-02;controls_faf95_afr=0.00000e+00;controls_faf99_afr=0.00000e+00;faf95_amr=3.26626e-01;faf99_amr=3.26626e-01;faf95_eas=1.77680e-01;faf99_eas=1.77680e-01;faf95=3.65988e-01;faf99=3.65988e-01;non_neuro_faf95_afr=5.12900e-03;non_neuro_faf99_afr=5.12900e-03;non_neuro_faf95_amr=2.04422e-01;non_neuro_faf99_amr=2.04423e-01;controls_faf95_nfe=3.18592e-01;controls_faf99_nfe=3.18591e-01;non_topmed_faf95=3.87517e-01;non_topmed_faf99=3.87517e-01;non_neuro_faf95_nfe=2.25669e-01;non_neuro_faf99_nfe=2.25670e-01;non_neuro_faf95=2.81191e-01;non_neuro_faf99=2.81191e-01;non_topmed_faf95_nfe=2.00594e-01;non_topmed_faf99_nfe=2.00595e-01;controls_faf95_eas=1.77680e-01;controls_faf99_eas=1.77680e-01;faf95_nfe=1.74711e-01;faf99_nfe=1.74712e-01;non_topmed_faf95_eas=1.77680e-01;non_topmed_faf99_eas=1.77680e-01;controls_faf95_amr=2.04422e-01;controls_faf99_amr=2.04423e-01;non_neuro_faf95_eas=1.77680e-01;non_neuro_faf99_eas=1.77680e-01;non_topmed_faf95_afr=2.96130e-02;non_topmed_faf99_afr=2.96133e-02;controls_faf95=3.47362e-01;controls_faf99=3.47362e-01;controls_popmax=eas;controls_AC_popmax=2;controls_AN_popmax=2;controls_AF_popmax=1.00000e+00;controls_nhomalt_popmax=1;popmax=eas;AC_popmax=2;AN_popmax=2;AF_popmax=1.00000e+00;nhomalt_popmax=1;age_hist_het_bin_freq=2|1|1|1|0|2|0|2|1|0;age_hist_het_n_smaller=4;age_hist_het_n_larger=0;age_hist_hom_bin_freq=3|0|0|0|0|0|2|0|1|0;age_hist_hom_n_smaller=0;age_hist_hom_n_larger=0;non_neuro_popmax=eas;non_neuro_AC_popmax=2;non_neuro_AN_popmax=2;non_neuro_AF_popmax=1.00000e+00;non_neuro_nhomalt_popmax=1;non_topmed_popmax=eas;non_topmed_AC_popmax=2;non_topmed_AN_popmax=2;non_topmed_AF_popmax=1.00000e+00;non_topmed_nhomalt_popmax=1;vep=C|intergenic_variant|MODIFIER|||||||||||||||rs374742143|1||||SNV|1||||||||||||||||||||||||||||||||||||||||||||
My output is in Postgres table:
as_id as_nm as_at_id as_at_nm vcf_id variant_id as_at_line_seq date_start date_end as_at_val_seq as_at_va_nm as_at_value
1 VCF 1 Variant "" 1 1 1 CHROM 22
1 VCF 1 Variant "" 1 1 1 POS 16050036
1 VCF 1 Variant "" 1 1 1 ID rs374742143
1 VCF 1 Variant "" 1 1 1 REF A
1 VCF 1 Variant "" 1 1 1 ALT C
1 VCF 1 Variant "" 1 1 1 QUAL 442156.34
1 VCF 1 Variant "" 1 1 1 FILTER RF
1 VCF 1 Variant "" 1 1 1 INFO/non_neuro_nhomalt_amr_male 0
1 VCF 1 Variant "" 1 1 1 INFO/controls_AN_afr 4
1 VCF 1 Variant "" 1 1 1 INFO/controls_AC_amr_female 3
1 VCF 1 Variant "" 1 1 1 INFO/AF_oth_female 0.5
1 VCF 1 Variant "" 1 1 1 INFO/non_topmed_nhomalt_afr_female 0
My script is taking ~60 minutes to process 100K lines in my source VCF file. The most time is being spent on inserting in Postgres.
How to optimize the insertion logic to speed this up?
Upvotes: 0
Views: 947
Reputation: 481
Try using this method with sqlalchemy, it will insert multiple rows (1000) into a single insert statement, making the insert process much faster for large dataset. In my case it went from 4 days into 20 minutes to insert 12 million rows.
import pandas as pd
from sqlalchemy import create_engine
somethinghuge = ...
engine = create_engine(f'postgresql+psycopg2://{USER_NAME}:{PASS}@{HOST}/{DATABASE}')
df_dataframe = pd.DataFrame(somethinghuge)
df_dataframe.to_sql('table_name', engine, index=False, schema='schema_name',
if_exists='replace', method='multi', chunksize=1000)
The method='multi' and chuksize is what does the magic.
Instead of inserting into postgres one line at the time it will write an insert with 1000 rows.
check the documentation for more details about method: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_sql.html
Upvotes: 0
Reputation: 1313
I think you can improve the speed 10~100times if switch to pandas to sql method with special postgresql method. I use to compare the normal insert vs pandas postgresql copy method. the speed is 10~100times fast depends on your data and table design.
here is the sample code. 1. use sqlalchemy db engine as follow:
from sqlalchmey import create_engine
engine = create_engine('postgres://postgres:postgres@yourserver/dbname')
define Alternative to_sql() method for DBs that support COPY FROM. Pandas online Manual: https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#io-sql
from io import StringIO
def psql_insert_copy(table, conn, keys, data_iter):
# gets a DBAPI connection that can provide a cursor
dbapi_conn = conn.connection
with dbapi_conn.cursor() as cur:
s_buf = StringIO()
writer = csv.writer(s_buf)
writer.writerows(data_iter)
s_buf.seek(0)
columns = ', '.join('"{}"'.format(k) for k in keys)
if table.schema:
table_name = '{}.{}'.format(table.schema, table.name)
else:
table_name = table.name
sql = 'COPY {} ({}) FROM STDIN WITH CSV'.format(
table_name, columns)
cur.copy_expert(sql=sql, file=s_buf)
transform your data for pandas Dataframe format. assuming the prepared data in df1
send the data to database via to_sql
df1.tosql('yourtablename',engine=engin,method=psql_insert_copy)
In my case, at least 10~20K rows per seconds insert speed can be reached in 2 core/16Gmem docker container.
Good luck. if you found the answer is useful, pls vote it up.
Upvotes: 0
Reputation: 1295
I know that in SQL it take some time to insert/update 100K lines or event performing queries on large not indexed tables.
I do not know VCF that well but try to consider following in order to speed up your process:
Upvotes: 1