dang
dang

Reputation: 2412

Slow python code while inserting to Postgres

I have the following Python code using PyVCF library. This script is taking long time to parse my VCF file.

Here is the script:

# =============================================================================
# import libraries for python 
# =============================================================================
import vcf
import os.path
import psycopg2       
import datetime
import sys

# =============================================================================
# function drops the previous table if exists and creates new table with dynamic table name
# =============================================================================
def create_table(table_name,table_type,conn_obj):
    dbcur = conn_obj.cursor()
    dbcur.execute("""DROP TABLE IF EXISTS {0};""".format(table_name))
    if table_type == '1' :
        dbcur.execute("""CREATE TABLE IF NOT EXISTS {0} (as_ID INT, as_NM TEXT, as_at_ID INT, as_at_NM TEXT, VCF_ID TEXT, VARIANT_ID TEXT, as_at_LINE_SEQ INT, DATE_START DATE, DATE_END DATE, as_at_VAL_SEQ INT, as_at_VA_NM TEXT, as_at_VALUE TEXT);""".format(table_name))
    elif table_type == '2' :
        dbcur.execute("""CREATE TABLE IF NOT EXISTS {0} (as_at_VA_NM TEXT, as_at_VA_DESC TEXT,as_at_DATA_TYPE TEXT);""".format(table_name))     
    conn_obj.commit()
    message = 'Table '+table_name+' created successfully.\n'
    print(message)
    return table_name


# =============================================================================
# function for insert vcf files data in values table
# =============================================================================

def create_tuples(as_id, vcf_id, Variant_id, as_at_nm, sample_id, as_at_va_nm, as_at_value, date_start, date_end, as_va_seq, as_at_va_line_seq,data_table_name):
    as_at_id = '1'
    sample_id = '1'
    variant_id = '1'
    as_nm = 'VCF'
    datalist.append("({0},'{1}','{2}','{3}','{4}',{5},'{6}','{7}','{8}','{9}')".format(as_id,str(as_nm),as_at_id,as_at_nm,"",variant_id,as_at_va_line_seq,as_va_seq,as_at_va_nm,as_at_value))

    if len(datalist)==200000:
        insertdata(data_table_name,as_id)

def insertdata(data_table_name,as_id):
    global datalist
    global execution_start_time
    global index

    iter_datalist=iter(datalist) 
    args_str = ','.join(iter_datalist)
    cursor.execute('INSERT INTO '+data_table_name+' (as_ID,as_NM,as_at_ID,as_at_NM,VCF_ID,VARIANT_ID,as_at_LINE_SEQ,as_at_VAL_SEQ,as_at_VA_NM,as_at_VALUE) VALUES '+args_str) 
    conn.commit()
    datalist=[]
    time_now = datetime.datetime.now()
    time_difference = time_now - execution_start_time
    print('Total inserted records:'+str(index)+' in time '+str(time_difference))


# =============================================================================
# function defination / switch case for get records variable name & variable value
# =============================================================================
def get_header_data(header):
    switcher = {
        "CHROM" : {'variable_name':'CHROM', 'variable_value': record.CHROM},
        "POS" : {'variable_name':'POS', 'variable_value': record.POS},
        "ID" : {'variable_name':'ID', 'variable_value': record.ID},
        "REF" : {'variable_name':'REF', 'variable_value': record.REF},
        "ALT" : {'variable_name':'ALT', 'variable_value': record.ALT},
        "QUAL" : {'variable_name':'QUAL', 'variable_value': record.QUAL},
        "FILTER" : {'variable_name':'FILTER', 'variable_value': record.FILTER},
        "INFO" : {'variable_name':'INFO', 'variable_value': record.INFO},
        "FORMAT" : {'variable_name':'FORMAT', 'variable_value': record.FORMAT}
    } 
    return switcher.get(header, "Invalid header")   
dic={}
def insert_infos_metadata(reader_infos,line_index,file_name,as_at_nm,variable_data_name):
    for info_data in reader_infos:
        va_desc = reader_infos[info_data].desc
        arr_len = len(va_desc.split(':'))
        if arr_len > 1 :
            last_str = va_desc.split(':')[arr_len-1]
            dic[info_data]=last_str
    return line_index



# =============================================================================
# global variable declaration
# =============================================================================
index = 0
variableList = []
datalist  = []
headers_arr = []
totalrecordinserted=0
curentRecord=-1
execution_start_time= datetime.datetime.now()                                       # calculate execution start time

# =============================================================================
# get file path or name from user
# =============================================================================

file_path = "gnomad.vcf"
if os.path.isfile(file_path) :                                                      #Check file is exists or not on given path
    if True:
        conn = psycopg2.connect(host="localhost",database="mydb", user="postgres", password="pgAdmin")
        cursor = conn.cursor()
        data_table_name = "data"
        variable_data_name = "vdata"
        create_table("data","1",conn)
        create_table("vdata","2",conn)

        vcf_reader = vcf.Reader(open(file_path, 'r'))
        column_headers = vcf_reader._column_headers
        line_index = 0
        null_date = None
        as_at_header = 'Header'
        as_at_variant = 'Variant' 
        reader_infos = vcf_reader.infos
        line_index = insert_infos_metadata(reader_infos,line_index,file_path,as_at_header,variable_data_name)

        execution_start_time= datetime.datetime.now()  

        for record in vcf_reader:
            index += 1
            line_index += 1
            if index == 5000:
                break
            Variant_id = ''
            sample_name = ''
            #for header in column_headers : 
            for header in column_headers :
                if header != 'FORMAT' :
                    header_data_dict = get_header_data(header)
                    #print(header_data_dict)
                    header_data_dict_value = header_data_dict['variable_value']
                    #print(header_data_dict_value)
                    if isinstance(header_data_dict_value, str) :
                        create_tuples("1", file_path, Variant_id, as_at_variant, sample_name, header, str(header_data_dict_value), null_date, null_date, 1, str(line_index),data_table_name)
                    elif isinstance(header_data_dict_value, list) :
                        if len(header_data_dict_value) == 0 and header == 'FILTER' :
                            create_tuples("1", file_path, Variant_id, as_at_variant, sample_name, header, 'PASS', null_date, null_date, 1, str(line_index),data_table_name)
                        else :
                            i = 0
                            while i < len(header_data_dict_value) :
                                create_tuples("1", file_path, Variant_id, as_at_variant, sample_name, header, str(header_data_dict_value[i]), null_date, null_date, str(i), str(line_index),data_table_name)
                                i += 1
                    elif isinstance(header_data_dict_value, dict) :
                        for dict_val_record in header_data_dict_value :
                            #print(dict_val_record,'.......',header_data_dict_value[dict_val_record])
                            variable_name = header + '/' + dict_val_record
                            variable_value = header_data_dict_value[dict_val_record]
                            #print('.........',variable_value)
                            variable_seq = 1 
                            if isinstance(variable_value,list) :
                                for value in variable_value :
                                    if dic.has_key(dict_val_record) :  
                                        header_key_description = vcf_reader.infos[dict_val_record].desc
                                        arr_len = len(header_key_description.split(':'))
                                        last_str = header_key_description.split(':')[arr_len-1]
                                        array_obj_desc_headers = last_str.split('|')
                                        arr_obj_val = value.strip("()[]").split("|")
                                        vararray = dic[dict_val_record].split("|")
                                        arr_index = 0
                                        for desc_header_value in arr_obj_val :
                                            variable_name = header+'/'+dict_val_record+'/'+vararray[arr_index].lstrip().replace("'","")
                                            variable_value = arr_obj_val[arr_index]
                                            arr_index += 1
                                            create_tuples("1", file_path, Variant_id, as_at_variant, sample_name, variable_name, variable_value, null_date, null_date, str(variable_seq), str(line_index),data_table_name)
                                    else :
                                          create_tuples("1", file_path, Variant_id, as_at_variant, sample_name, variable_name, value, null_date, null_date, str(variable_seq), str(line_index),data_table_name)

                                    variable_seq += 1  
                            else :
                                create_tuples("1", file_path, Variant_id, as_at_variant, sample_name, variable_name, variable_value, null_date, null_date, str(variable_seq), str(line_index),data_table_name)
                    else :
                        variable_name = header
                        variable_value = str(header_data_dict_value)
                        create_tuples("1", file_path, Variant_id, as_at_variant, sample_name, variable_name, variable_value, null_date, null_date, 1, str(line_index),data_table_name)
                else :
                    format_data_arr = (record.FORMAT).split(":")
                    for format_data_record in format_data_arr :
                        variable_name = header+'/'+format_data_record
                        variable_value = record.genotype(record_data.sample)[format_data_record]
                        create_tuples("1", file_path, Variant_id, as_at_variant, sample_name, variable_name, variable_value, null_date, null_date, 1,str(line_index),data_table_name)

        insertdata(data_table_name,"1")
        print('Variants inserted successfully.')
    else:
        print('Incorrect Choice.')
else:
    print('File not present.')

Here is the input VCF file which we downloaded from gnomAD:

##fileformat=VCFv4.2
##hailversion=0.2.7-c860755b5da3
##INFO=<ID=vep,Number=.,Type=String,Description="Consequence annotations from Ensembl VEP. Format: Allele|Consequence|IMPACT|SYMBOL|Gene|Feature_type|Feature|BIOTYPE|EXON|INTRON|HGVSc|HGVSp|cDNA_position|CDS_position|Protein_position|Amino_acids|Codons|Existing_variation|ALLELE_NUM|DISTANCE|STRAND|FLAGS|VARIANT_CLASS|MINIMISED|SYMBOL_SOURCE|HGNC_ID|CANONICAL|TSL|APPRIS|CCDS|ENSP|SWISSPROT|TREMBL|UNIPARC|GENE_PHENO|SIFT|PolyPhen|DOMAINS|HGVS_OFFSET|GMAF|AFR_MAF|AMR_MAF|EAS_MAF|EUR_MAF|SAS_MAF|AA_MAF|EA_MAF|ExAC_MAF|ExAC_Adj_MAF|ExAC_AFR_MAF|ExAC_AMR_MAF|ExAC_EAS_MAF|ExAC_FIN_MAF|ExAC_NFE_MAF|ExAC_OTH_MAF|ExAC_SAS_MAF|CLIN_SIG|SOMATIC|PHENO|PUBMED|MOTIF_NAME|MOTIF_POS|HIGH_INF_POS|MOTIF_SCORE_CHANGE|LoF|LoF_filter|LoF_flags|LoF_info">
#CHROM  POS ID  REF ALT QUAL    FILTER  INFO
22  16050036    rs374742143 A   C   442156.34   RF  AC=80;AN=180;AF=4.44444e-01;rf_tp_probability=1.98655e-01;FS=7.05600e+00;InbreedingCoeff=3.82000e-01;MQ=2.71500e+01;MQRankSum=-1.02600e+00;QD=2.47500e+01;ReadPosRankSum=-2.11000e-01;SOR=1.26750e+01;BaseQRankSum=7.36000e-01;ClippingRankSum=2.96000e-01;DP=50165;VQSLOD=-9.58600e+02;VQSR_culprit=MQ;segdup;rf_negative_label;rf_label=FP;rf_train;variant_type=snv;allele_type=snv;n_alt_alleles=1;pab_max=1.00000e+00;gq_hist_alt_bin_freq=15|1348|318|278|171|32|20|31|51|152|82|5|9|13|35|44|8|4|2|41;gq_hist_all_bin_freq=2396|2885|745|685|311|58|38|37|54|152|82|5|9|13|35|44|8|4|2|41;dp_hist_alt_bin_freq=1872|720|58|9|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0;dp_hist_alt_n_larger=0;dp_hist_all_bin_freq=13607|1974|114|13|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0;dp_hist_all_n_larger=0;ab_hist_alt_bin_freq=0|0|0|2|6|26|42|12|99|3|137|31|63|99|21|66|37|9|1|7;AC_nfe_seu=0;AN_nfe_seu=4;AF_nfe_seu=0.00000e+00;nhomalt_nfe_seu=0;controls_AC_afr_male=0;controls_AN_afr_male=4;controls_AF_afr_male=0.00000e+00;controls_nhomalt_afr_male=0;non_topmed_AC_amr=6;non_topmed_AN_amr=8;non_topmed_AF_amr=7.50000e-01;non_topmed_nhomalt_amr=2;AC_raw=4657;AN_raw=15208;AF_raw=3.06220e-01;nhomalt_raw=1998;AC_fin_female=27;AN_fin_female=44;AF_fin_female=6.13636e-01;nhomalt_fin_female=5;non_neuro_AC_asj_female=1;non_neuro_AN_asj_female=2;non_neuro_AF_asj_female=5.00000e-01;non_neuro_nhomalt_asj_female=0;non_neuro_AC_afr_male=1;non_neuro_AN_afr_male=8;non_neuro_AF_afr_male=1.25000e-01;non_neuro_nhomalt_afr_male=0;AC_afr_male=2;AN_afr_male=10;AF_afr_male=2.00000e-01;nhomalt_afr_male=0;AC_afr=2;AN_afr=14;AF_afr=1.42857e-01;nhomalt_afr=0;non_neuro_AC_afr_female=0;non_neuro_AN_afr_female=2;non_neuro_AF_afr_female=0.00000e+00;non_neuro_nhomalt_afr_female=0;non_topmed_AC_amr_female=3;non_topmed_AN_amr_female=4;non_topmed_AF_amr_female=7.50000e-01;non_topmed_nhomalt_amr_female=1;non_topmed_AC_oth_female=3;non_topmed_AN_oth_female=6;non_topmed_AF_oth_female=5.00000e-01;non_topmed_nhomalt_oth_female=0;AC_eas_female=0;AN_eas_female=0;nhomalt_eas_female=0;AC_afr_female=0;AN_afr_female=4;AF_afr_female=0.00000e+00;nhomalt_afr_female=0;non_neuro_AC_female=16;non_neuro_AN_female=44;non_neuro_AF_female=3.63636e-01;non_neuro_nhomalt_female=2;controls_AC_afr=0;controls_AN_afr=4;controls_AF_afr=0.00000e+00;controls_nhomalt_afr=0;AC_nfe_onf=2;AN_nfe_onf=12;AF_nfe_onf=1.66667e-01;nhomalt_nfe_onf=0;controls_AC_fin_male=3;controls_AN_fin_male=10;controls_AF_fin_male=3.00000e-01;controls_nhomalt_fin_male=1;non_neuro_AC_nfe_nwe=13;non_neuro_AN_nfe_nwe=36;non_neuro_AF_nfe_nwe=3.61111e-01;non_neuro_nhomalt_nfe_nwe=4;AC_fin_male=18;AN_fin_male=34;AF_fin_male=5.29412e-01;nhomalt_fin_male=5;AC_nfe_female=5;AN_nfe_female=28;AF_nfe_female=1.78571e-01;nhomalt_nfe_female=0;AC_amr=6;AN_amr=8;AF_amr=7.50000e-01;nhomalt_amr=2;non_topmed_AC_nfe_male=12;non_topmed_AN_nfe_male=28;non_topmed_AF_nfe_male=4.28571e-01;non_topmed_nhomalt_nfe_male=4;AC_eas=2;AN_eas=2;AF_eas=1.00000e+00;nhomalt_eas=1;nhomalt=18;non_neuro_AC_nfe_female=5;non_neuro_AN_nfe_female=24;non_neuro_AF_nfe_female=2.08333e-01;non_neuro_nhomalt_nfe_female=0;non_neuro_AC_afr=1;non_neuro_AN_afr=10;non_neuro_AF_afr=1.00000e-01;non_neuro_nhomalt_afr=0;controls_AC_raw=1798;controls_AN_raw=5254;controls_AF_raw=3.42215e-01;controls_nhomalt_raw=765;controls_AC_male=18;controls_AN_male=38;controls_AF_male=4.73684e-01;controls_nhomalt_male=6;non_topmed_AC_male=41;non_topmed_AN_male=86;non_topmed_AF_male=4.76744e-01;non_topmed_nhomalt_male=12;controls_AC_nfe_female=5;controls_AN_nfe_female=16;controls_AF_nfe_female=3.12500e-01;controls_nhomalt_nfe_female=0;non_neuro_AC_amr=3;non_neuro_AN_amr=4;non_neuro_AF_amr=7.50000e-01;non_neuro_nhomalt_amr=1;non_neuro_AC_eas_female=0;non_neuro_AN_eas_female=0;non_neuro_nhomalt_eas_female=0;AC_asj_male=0;AN_asj_male=0;nhomalt_asj_male=0;controls_AC_nfe_male=12;controls_AN_nfe_male=18;controls_AF_nfe_male=6.66667e-01;controls_nhomalt_nfe_male=4;non_neuro_AC_fin=8;non_neuro_AN_fin=18;non_neuro_AF_fin=4.44444e-01;non_neuro_nhomalt_fin=2;AC_oth_female=3;AN_oth_female=6;AF_oth_female=5.00000e-01;nhomalt_oth_female=0;controls_AC_nfe=17;controls_AN_nfe=34;controls_AF_nfe=5.00000e-01;controls_nhomalt_nfe=4;controls_AC_oth_female=2;controls_AN_oth_female=4;controls_AF_oth_female=5.00000e-01;controls_nhomalt_oth_female=0;controls_AC_asj=1;controls_AN_asj=2;controls_AF_asj=5.00000e-01;controls_nhomalt_asj=0;non_neuro_AC_amr_male=0;non_neuro_AN_amr_male=0;non_neuro_nhomalt_amr_male=0;controls_AC_nfe_nwe=13;controls_AN_nfe_nwe=22;controls_AF_nfe_nwe=5.90909e-01;controls_nhomalt_nfe_nwe=4;AC_nfe_nwe=13;AN_nfe_nwe=42;AF_nfe_nwe=3.09524e-01;nhomalt_nfe_nwe=4;controls_AC_nfe_seu=0;controls_AN_nfe_seu=0;controls_nhomalt_nfe_seu=0;non_neuro_AC_amr_female=3;non_neuro_AN_amr_female=4;non_neuro_AF_amr_female=7.50000e-01;non_neuro_nhomalt_amr_female=1;non_neuro_AC_nfe_onf=2;non_neuro_AN_nfe_onf=8;non_neuro_AF_nfe_onf=2.50000e-01;non_neuro_nhomalt_nfe_onf=0;non_topmed_AC_eas_male=2;non_topmed_AN_eas_male=2;non_topmed_AF_eas_male=1.00000e+00;non_topmed_nhomalt_eas_male=1;controls_AC_amr_female=3;controls_AN_amr_female=4;controls_AF_amr_female=7.50000e-01;controls_nhomalt_amr_female=1;non_neuro_AC_fin_male=3;non_neuro_AN_fin_male=10;non_neuro_AF_fin_male=3.00000e-01;non_neuro_nhomalt_fin_male=1;AC_female=39;AN_female=88;AF_female=4.43182e-01;nhomalt_female=6;non_neuro_AC_oth_male=1;non_neuro_AN_oth_male=4;non_neuro_AF_oth_male=2.50000e-01;non_neuro_nhomalt_oth_male=0;non_topmed_AC_nfe_est=2;non_topmed_AN_nfe_est=4;non_topmed_AF_nfe_est=5.00000e-01;non_topmed_nhomalt_nfe_est=0;non_topmed_AC_nfe_nwe=13;non_topmed_AN_nfe_nwe=34;non_topmed_AF_nfe_nwe=3.82353e-01;non_topmed_nhomalt_nfe_nwe=4;non_topmed_AC_amr_male=3;non_topmed_AN_amr_male=4;non_topmed_AF_amr_male=7.50000e-01;non_topmed_nhomalt_amr_male=1;non_topmed_AC_nfe_onf=2;non_topmed_AN_nfe_onf=12;non_topmed_AF_nfe_onf=1.66667e-01;non_topmed_nhomalt_nfe_onf=0;controls_AC_eas_male=2;controls_AN_eas_male=2;controls_AF_eas_male=1.00000e+00;controls_nhomalt_eas_male=1;controls_AC_oth_male=1;controls_AN_oth_male=4;controls_AF_oth_male=2.50000e-01;controls_nhomalt_oth_male=0;non_topmed_AC=80;non_topmed_AN=170;non_topmed_AF=4.70588e-01;non_topmed_nhomalt=18;controls_AC_fin=8;controls_AN_fin=18;controls_AF_fin=4.44444e-01;controls_nhomalt_fin=2;non_neuro_AC_nfe=17;non_neuro_AN_nfe=48;non_neuro_AF_nfe=3.54167e-01;non_neuro_nhomalt_nfe=4;non_neuro_AC_fin_female=5;non_neuro_AN_fin_female=8;non_neuro_AF_fin_female=6.25000e-01;non_neuro_nhomalt_fin_female=1;non_topmed_AC_nfe_seu=0;non_topmed_AN_nfe_seu=4;non_topmed_AF_nfe_seu=0.00000e+00;non_topmed_nhomalt_nfe_seu=0;controls_AC_eas_female=0;controls_AN_eas_female=0;controls_nhomalt_eas_female=0;non_topmed_AC_asj=1;non_topmed_AN_asj=2;non_topmed_AF_asj=5.00000e-01;non_topmed_nhomalt_asj=0;controls_AC_nfe_onf=2;controls_AN_nfe_onf=8;controls_AF_nfe_onf=2.50000e-01;controls_nhomalt_nfe_onf=0;non_neuro_AC=35;non_neuro_AN=92;non_neuro_AF=3.80435e-01;non_neuro_nhomalt=8;non_topmed_AC_nfe=17;non_topmed_AN_nfe=54;non_topmed_AF_nfe=3.14815e-01;non_topmed_nhomalt_nfe=4;non_topmed_AC_raw=3996;non_topmed_AN_raw=12728;non_topmed_AF_raw=3.13953e-01;non_topmed_nhomalt_raw=1721;non_neuro_AC_nfe_est=2;non_neuro_AN_nfe_est=4;non_neuro_AF_nfe_est=5.00000e-01;non_neuro_nhomalt_nfe_est=0;non_topmed_AC_oth_male=4;non_topmed_AN_oth_male=8;non_topmed_AF_oth_male=5.00000e-01;non_topmed_nhomalt_oth_male=1;AC_nfe_est=2;AN_nfe_est=4;AF_nfe_est=5.00000e-01;nhomalt_nfe_est=0;non_topmed_AC_afr_male=2;non_topmed_AN_afr_male=10;non_topmed_AF_afr_male=2.00000e-01;non_topmed_nhomalt_afr_male=0;AC_eas_male=2;AN_eas_male=2;AF_eas_male=1.00000e+00;nhomalt_eas_male=1;controls_AC_eas=2;controls_AN_eas=2;controls_AF_eas=1.00000e+00;controls_nhomalt_eas=1;non_neuro_AC_eas_male=2;non_neuro_AN_eas_male=2;non_neuro_AF_eas_male=1.00000e+00;non_neuro_nhomalt_eas_male=1;non_neuro_AC_asj_male=0;non_neuro_AN_asj_male=0;non_neuro_nhomalt_asj_male=0;controls_AC_oth=3;controls_AN_oth=8;controls_AF_oth=3.75000e-01;controls_nhomalt_oth=0;AC_nfe=17;AN_nfe=62;AF_nfe=2.74194e-01;nhomalt_nfe=4;non_topmed_AC_female=39;non_topmed_AN_female=84;non_topmed_AF_female=4.64286e-01;non_topmed_nhomalt_female=6;non_neuro_AC_asj=1;non_neuro_AN_asj=2;non_neuro_AF_asj=5.00000e-01;non_neuro_nhomalt_asj=0;non_topmed_AC_eas_female=0;non_topmed_AN_eas_female=0;non_topmed_nhomalt_eas_female=0;non_neuro_AC_raw=3103;non_neuro_AN_raw=10138;non_neuro_AF_raw=3.06076e-01;non_neuro_nhomalt_raw=1328;non_topmed_AC_eas=2;non_topmed_AN_eas=2;non_topmed_AF_eas=1.00000e+00;non_topmed_nhomalt_eas=1;non_topmed_AC_fin_male=18;non_topmed_AN_fin_male=34;non_topmed_AF_fin_male=5.29412e-01;non_topmed_nhomalt_fin_male=5;AC_fin=45;AN_fin=78;AF_fin=5.76923e-01;nhomalt_fin=10;AC_nfe_male=12;AN_nfe_male=34;AF_nfe_male=3.52941e-01;nhomalt_nfe_male=4;controls_AC_amr_male=0;controls_AN_amr_male=0;controls_nhomalt_amr_male=0;controls_AC_afr_female=0;controls_AN_afr_female=0;controls_nhomalt_afr_female=0;controls_AC_amr=3;controls_AN_amr=4;controls_AF_amr=7.50000e-01;controls_nhomalt_amr=1;AC_asj_female=1;AN_asj_female=2;AF_asj_female=5.00000e-01;nhomalt_asj_female=0;non_neuro_AC_eas=2;non_neuro_AN_eas=2;non_neuro_AF_eas=1.00000e+00;non_neuro_nhomalt_eas=1;non_neuro_AC_male=19;non_neuro_AN_male=48;non_neuro_AF_male=3.95833e-01;non_neuro_nhomalt_male=6;AC_asj=1;AN_asj=2;AF_asj=5.00000e-01;nhomalt_asj=0;controls_AC_nfe_est=2;controls_AN_nfe_est=4;controls_AF_nfe_est=5.00000e-01;controls_nhomalt_nfe_est=0;non_topmed_AC_asj_female=1;non_topmed_AN_asj_female=2;non_topmed_AF_asj_female=5.00000e-01;non_topmed_nhomalt_asj_female=0;non_topmed_AC_oth=7;non_topmed_AN_oth=14;non_topmed_AF_oth=5.00000e-01;non_topmed_nhomalt_oth=1;non_topmed_AC_fin_female=27;non_topmed_AN_fin_female=44;non_topmed_AF_fin_female=6.13636e-01;non_topmed_nhomalt_fin_female=5;AC_oth=7;AN_oth=14;AF_oth=5.00000e-01;nhomalt_oth=1;non_neuro_AC_nfe_male=12;non_neuro_AN_nfe_male=24;non_neuro_AF_nfe_male=5.00000e-01;non_neuro_nhomalt_nfe_male=4;controls_AC_female=16;controls_AN_female=34;controls_AF_female=4.70588e-01;controls_nhomalt_female=2;non_topmed_AC_fin=45;non_topmed_AN_fin=78;non_topmed_AF_fin=5.76923e-01;non_topmed_nhomalt_fin=10;non_topmed_AC_nfe_female=5;non_topmed_AN_nfe_female=26;non_topmed_AF_nfe_female=1.92308e-01;non_topmed_nhomalt_nfe_female=0;controls_AC_asj_male=0;controls_AN_asj_male=0;controls_nhomalt_asj_male=0;non_topmed_AC_asj_male=0;non_topmed_AN_asj_male=0;non_topmed_nhomalt_asj_male=0;non_neuro_AC_oth=3;non_neuro_AN_oth=8;non_neuro_AF_oth=3.75000e-01;non_neuro_nhomalt_oth=0;AC_male=41;AN_male=92;AF_male=4.45652e-01;nhomalt_male=12;controls_AC_fin_female=5;controls_AN_fin_female=8;controls_AF_fin_female=6.25000e-01;controls_nhomalt_fin_female=1;controls_AC_asj_female=1;controls_AN_asj_female=2;controls_AF_asj_female=5.00000e-01;controls_nhomalt_asj_female=0;AC_amr_male=3;AN_amr_male=4;AF_amr_male=7.50000e-01;nhomalt_amr_male=1;AC_amr_female=3;AN_amr_female=4;AF_amr_female=7.50000e-01;nhomalt_amr_female=1;AC_oth_male=4;AN_oth_male=8;AF_oth_male=5.00000e-01;nhomalt_oth_male=1;non_neuro_AC_nfe_seu=0;non_neuro_AN_nfe_seu=0;non_neuro_nhomalt_nfe_seu=0;non_topmed_AC_afr_female=0;non_topmed_AN_afr_female=2;non_topmed_AF_afr_female=0.00000e+00;non_topmed_nhomalt_afr_female=0;non_topmed_AC_afr=2;non_topmed_AN_afr=12;non_topmed_AF_afr=1.66667e-01;non_topmed_nhomalt_afr=0;controls_AC=34;controls_AN=72;controls_AF=4.72222e-01;controls_nhomalt=8;non_neuro_AC_oth_female=2;non_neuro_AN_oth_female=4;non_neuro_AF_oth_female=5.00000e-01;non_neuro_nhomalt_oth_female=0;non_topmed_faf95_amr=3.26626e-01;non_topmed_faf99_amr=3.26626e-01;faf95_afr=2.53825e-02;faf99_afr=2.53821e-02;controls_faf95_afr=0.00000e+00;controls_faf99_afr=0.00000e+00;faf95_amr=3.26626e-01;faf99_amr=3.26626e-01;faf95_eas=1.77680e-01;faf99_eas=1.77680e-01;faf95=3.65988e-01;faf99=3.65988e-01;non_neuro_faf95_afr=5.12900e-03;non_neuro_faf99_afr=5.12900e-03;non_neuro_faf95_amr=2.04422e-01;non_neuro_faf99_amr=2.04423e-01;controls_faf95_nfe=3.18592e-01;controls_faf99_nfe=3.18591e-01;non_topmed_faf95=3.87517e-01;non_topmed_faf99=3.87517e-01;non_neuro_faf95_nfe=2.25669e-01;non_neuro_faf99_nfe=2.25670e-01;non_neuro_faf95=2.81191e-01;non_neuro_faf99=2.81191e-01;non_topmed_faf95_nfe=2.00594e-01;non_topmed_faf99_nfe=2.00595e-01;controls_faf95_eas=1.77680e-01;controls_faf99_eas=1.77680e-01;faf95_nfe=1.74711e-01;faf99_nfe=1.74712e-01;non_topmed_faf95_eas=1.77680e-01;non_topmed_faf99_eas=1.77680e-01;controls_faf95_amr=2.04422e-01;controls_faf99_amr=2.04423e-01;non_neuro_faf95_eas=1.77680e-01;non_neuro_faf99_eas=1.77680e-01;non_topmed_faf95_afr=2.96130e-02;non_topmed_faf99_afr=2.96133e-02;controls_faf95=3.47362e-01;controls_faf99=3.47362e-01;controls_popmax=eas;controls_AC_popmax=2;controls_AN_popmax=2;controls_AF_popmax=1.00000e+00;controls_nhomalt_popmax=1;popmax=eas;AC_popmax=2;AN_popmax=2;AF_popmax=1.00000e+00;nhomalt_popmax=1;age_hist_het_bin_freq=2|1|1|1|0|2|0|2|1|0;age_hist_het_n_smaller=4;age_hist_het_n_larger=0;age_hist_hom_bin_freq=3|0|0|0|0|0|2|0|1|0;age_hist_hom_n_smaller=0;age_hist_hom_n_larger=0;non_neuro_popmax=eas;non_neuro_AC_popmax=2;non_neuro_AN_popmax=2;non_neuro_AF_popmax=1.00000e+00;non_neuro_nhomalt_popmax=1;non_topmed_popmax=eas;non_topmed_AC_popmax=2;non_topmed_AN_popmax=2;non_topmed_AF_popmax=1.00000e+00;non_topmed_nhomalt_popmax=1;vep=C|intergenic_variant|MODIFIER|||||||||||||||rs374742143|1||||SNV|1||||||||||||||||||||||||||||||||||||||||||||

My output is in Postgres table:

as_id   as_nm      as_at_id   as_at_nm      vcf_id  variant_id  as_at_line_seq  date_start  date_end    as_at_val_seq   as_at_va_nm                         as_at_value
1       VCF         1           Variant     ""      1           1                                       1               CHROM                           22
1       VCF         1           Variant     ""      1           1                                       1               POS                             16050036
1       VCF         1           Variant     ""      1           1                                       1               ID                              rs374742143
1       VCF         1           Variant     ""      1           1                                       1               REF                             A
1       VCF         1           Variant     ""      1           1                                       1               ALT                             C
1       VCF         1           Variant     ""      1           1                                       1               QUAL                            442156.34
1       VCF         1           Variant     ""      1           1                                       1               FILTER                          RF
1       VCF         1           Variant     ""      1           1                                       1               INFO/non_neuro_nhomalt_amr_male     0
1       VCF         1           Variant     ""      1           1                                       1               INFO/controls_AN_afr    4
1       VCF         1           Variant     ""      1           1                                       1               INFO/controls_AC_amr_female     3
1       VCF         1           Variant     ""      1           1                                       1               INFO/AF_oth_female      0.5
1       VCF         1           Variant     ""      1           1                                       1               INFO/non_topmed_nhomalt_afr_female  0

My script is taking ~60 minutes to process 100K lines in my source VCF file. The most time is being spent on inserting in Postgres.

How to optimize the insertion logic to speed this up?

Upvotes: 0

Views: 947

Answers (3)

Carl Kristensen
Carl Kristensen

Reputation: 481

Try using this method with sqlalchemy, it will insert multiple rows (1000) into a single insert statement, making the insert process much faster for large dataset. In my case it went from 4 days into 20 minutes to insert 12 million rows.

import pandas as pd
from sqlalchemy import create_engine
somethinghuge = ...


engine = create_engine(f'postgresql+psycopg2://{USER_NAME}:{PASS}@{HOST}/{DATABASE}')
df_dataframe = pd.DataFrame(somethinghuge)

df_dataframe.to_sql('table_name', engine, index=False, schema='schema_name',
                  if_exists='replace', method='multi', chunksize=1000)

The method='multi' and chuksize is what does the magic.

Instead of inserting into postgres one line at the time it will write an insert with 1000 rows.

check the documentation for more details about method: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_sql.html

Upvotes: 0

Yong Wang
Yong Wang

Reputation: 1313

I think you can improve the speed 10~100times if switch to pandas to sql method with special postgresql method. I use to compare the normal insert vs pandas postgresql copy method. the speed is 10~100times fast depends on your data and table design.

here is the sample code. 1. use sqlalchemy db engine as follow:

from sqlalchmey import create_engine

engine = create_engine('postgres://postgres:postgres@yourserver/dbname')
  1. define Alternative to_sql() method for DBs that support COPY FROM. Pandas online Manual: https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#io-sql

    from io import StringIO
    
    def psql_insert_copy(table, conn, keys, data_iter):
    
        # gets a DBAPI connection that can provide a cursor
        dbapi_conn = conn.connection
        with dbapi_conn.cursor() as cur:
        s_buf = StringIO()
    
        writer = csv.writer(s_buf)
        writer.writerows(data_iter)
        s_buf.seek(0)
    
        columns = ', '.join('"{}"'.format(k) for k in keys)
        if table.schema:
            table_name = '{}.{}'.format(table.schema, table.name)
        else:
            table_name = table.name
    
        sql = 'COPY {} ({}) FROM STDIN WITH CSV'.format(
                              table_name, columns)
        cur.copy_expert(sql=sql, file=s_buf)
    
  2. transform your data for pandas Dataframe format. assuming the prepared data in df1

  3. send the data to database via to_sql

    df1.tosql('yourtablename',engine=engin,method=psql_insert_copy) 
    

In my case, at least 10~20K rows per seconds insert speed can be reached in 2 core/16Gmem docker container.

Good luck. if you found the answer is useful, pls vote it up.

Upvotes: 0

Amiram
Amiram

Reputation: 1295

I know that in SQL it take some time to insert/update 100K lines or event performing queries on large not indexed tables.

I do not know VCF that well but try to consider following in order to speed up your process:

  1. Indexing the table.
  2. Performing several insert command and only then execute commit

Upvotes: 1

Related Questions