Mando
Mando

Reputation: 75

Write rows from Postgres Table into a CSV file using Pandas

I want to writerows to the given NAS(mnt) folder from a postgres table, but only after performing some data hygine checks

Below is the actual code which is working, but it only selects columns from the table and saves the data into a .txt file

import os
import psycopg2
import time
import pandas as pd

# File path and name.
filePath = '/mnt/nasdatavant/datavant/covid_implementation/postgres_files/'
timestr = time.strftime("%Y-%m-%d-%-H%M%S")
fileName = 'covid-' + timestr + '.txt'

# Database connection variable.
connect = None

# Check if the file path exists.
if os.path.exists(filePath):
    connect = psycopg2.connect(<connection_details_here>)
    with connect.cursor() as cursor:
        sqlSelect = "select patnt_last_nm as cust_last_nm, \
                     patnt_frst_nm as cust_frst_nm, \
                     date(patnt_brth_dt) as cust_brth_dt, \
                     patnt_gendr_cd as cust_gendr_cd, \
                     patnt_postl_cd as cust_postl_cd, \
                     indiv_entpr_id from datavant_o.covid_patnt_registry"
        cursor.execute(sqlSelect)
        results = cursor.fetchall()
        print(results)
        headers = [i[0] for i in cursor.description]
        csvFile = csv.writer(open(filePath + fileName, 'w', newline=''),
                                delimiter='|', lineterminator='\r\n', escapechar='\\')
        csvFile.writerow(headers)
        csvFile.writerows(results)
        connect.commit()
        connect.close()
else:
        print("File path does not exist.")

Note: when i say print(results) it yields a list of tuples eg:

[('TOBY', 'MICHEAL', datetime.date(1986, 8, 23), 'M', '06472', '872956'), 
('CARLIE', 'NAHLA', datetime.date(1979, 9, 29),..etc]

so to yield a dataframe i wrote df = pd.DataFrame(results)

What i actually want now is to add data hygiene checks like below before writing into a .txt file

what i tried:

       csvFile = csv.writer(open(filePath + fileName, 'w', newline=''),
                                delimiter='|', lineterminator='\r\n', escapechar='\\')
        df = pd.DataFrame(results)
        print(df)
        df = df.dropna(axis=0)
        df = df.loc[
                  (df[0].astype('str').str.len()>1) & (df[1].astype('str').str.len()>1) &
                  (df[4].astype('str').str.len()>4) & (df[4].astype('str').str.len()<8)]
        csvFile.writerow(headers)
        csvFile.writerows(df)

error i got:

error: 
csvFile.writerows(df)
_csv.Error: iterable expected, not int

Final expected output(in .txt file):

cust_last_nm|cust_frst_nm|cust_brth_dt|cust_gendr_cd|cust_postl_cd|indiv_entpr_id
TOBY|MICHEAL|1986-08-23|M|06472|872956
CARLIE|NAHLA|1979-09-29|F|06757|499666 …etc

i need some help to solve this scenario (new to pandas) thanks ahead.

Upvotes: 0

Views: 1091

Answers (1)

pguardati
pguardati

Reputation: 229

import os
import psycopg2
import time
import pandas as pd

# File path and name.
filePath = '/mnt/nasdatavant/datavant/covid_implementation/postgres_files/'
timestr = time.strftime("%Y-%m-%d-%-H%M%S")
fileName = 'covid-' + timestr + '.csv'

# Check if the file path exists.
if os.path.exists(filePath):
    connect = psycopg2.connect("dbname=postgres user=postgres")
    with connect.cursor() as cursor:

        # create test table
        cursor.execute("create table if not exists test_table (test_column int)")
        cursor.execute("insert into test_table (test_column) values (10);")

        # query and import as Dataframe
        df = pd.read_sql("select * from test_table", connect)

        # add here your cleaning operations
        df = df.dropna(axis=0)

        # export
        df.to_csv(os.path.join(filePath, fileName), index=False)

        connect.commit()
        connect.close()
else:
    print("File path does not exist.")

Upvotes: 1

Related Questions