Reputation: 75
I want to writerows to the given NAS(mnt) folder from a postgres table, but only after performing some data hygine checks
Below is the actual code which is working, but it only selects columns from the table and saves the data into a .txt file
import os
import psycopg2
import time
import pandas as pd
# File path and name.
filePath = '/mnt/nasdatavant/datavant/covid_implementation/postgres_files/'
timestr = time.strftime("%Y-%m-%d-%-H%M%S")
fileName = 'covid-' + timestr + '.txt'
# Database connection variable.
connect = None
# Check if the file path exists.
if os.path.exists(filePath):
connect = psycopg2.connect(<connection_details_here>)
with connect.cursor() as cursor:
sqlSelect = "select patnt_last_nm as cust_last_nm, \
patnt_frst_nm as cust_frst_nm, \
date(patnt_brth_dt) as cust_brth_dt, \
patnt_gendr_cd as cust_gendr_cd, \
patnt_postl_cd as cust_postl_cd, \
indiv_entpr_id from datavant_o.covid_patnt_registry"
cursor.execute(sqlSelect)
results = cursor.fetchall()
print(results)
headers = [i[0] for i in cursor.description]
csvFile = csv.writer(open(filePath + fileName, 'w', newline=''),
delimiter='|', lineterminator='\r\n', escapechar='\\')
csvFile.writerow(headers)
csvFile.writerows(results)
connect.commit()
connect.close()
else:
print("File path does not exist.")
Note: when i say print(results) it yields a list of tuples eg:
[('TOBY', 'MICHEAL', datetime.date(1986, 8, 23), 'M', '06472', '872956'),
('CARLIE', 'NAHLA', datetime.date(1979, 9, 29),..etc]
so to yield a dataframe i wrote df = pd.DataFrame(results)
What i actually want now is to add data hygiene checks like below before writing into a .txt file
what i tried:
csvFile = csv.writer(open(filePath + fileName, 'w', newline=''),
delimiter='|', lineterminator='\r\n', escapechar='\\')
df = pd.DataFrame(results)
print(df)
df = df.dropna(axis=0)
df = df.loc[
(df[0].astype('str').str.len()>1) & (df[1].astype('str').str.len()>1) &
(df[4].astype('str').str.len()>4) & (df[4].astype('str').str.len()<8)]
csvFile.writerow(headers)
csvFile.writerows(df)
error i got:
error:
csvFile.writerows(df)
_csv.Error: iterable expected, not int
Final expected output(in .txt file):
cust_last_nm|cust_frst_nm|cust_brth_dt|cust_gendr_cd|cust_postl_cd|indiv_entpr_id
TOBY|MICHEAL|1986-08-23|M|06472|872956
CARLIE|NAHLA|1979-09-29|F|06757|499666 …etc
i need some help to solve this scenario (new to pandas) thanks ahead.
Upvotes: 0
Views: 1091
Reputation: 229
import os
import psycopg2
import time
import pandas as pd
# File path and name.
filePath = '/mnt/nasdatavant/datavant/covid_implementation/postgres_files/'
timestr = time.strftime("%Y-%m-%d-%-H%M%S")
fileName = 'covid-' + timestr + '.csv'
# Check if the file path exists.
if os.path.exists(filePath):
connect = psycopg2.connect("dbname=postgres user=postgres")
with connect.cursor() as cursor:
# create test table
cursor.execute("create table if not exists test_table (test_column int)")
cursor.execute("insert into test_table (test_column) values (10);")
# query and import as Dataframe
df = pd.read_sql("select * from test_table", connect)
# add here your cleaning operations
df = df.dropna(axis=0)
# export
df.to_csv(os.path.join(filePath, fileName), index=False)
connect.commit()
connect.close()
else:
print("File path does not exist.")
Upvotes: 1