Reputation: 1484
I have multiple text files but I take 1 as example (name as test.txt
):
test.txt
##TSTA##
Record_01_TEST_0001ACA0018
Record_01_TEST_0002BRF: S01TX2OL7L
Record_01_TEST_0003CLASTNAME1, FIRSTNAME1 MIDDLENAME1
Record_01_TEST_0004CADDRESS1
Record_01_TEST_0005CADDRESS2
Record_01_TEST_0006D03141979*******M
Record_01_TEST_0007EINFOE
Record_01_TEST_0008FINFOF_1
Record_01_TEST_0009FINFOF_2
Record_01_TEST_0010FINFOF_3
Record_01_TEST_0011FINFOF_4
Record_01_TEST_0012FINFOF_5
Record_01_TEST_0013FINFOF_6
Record_01_TEST_0014FINFOF_7
Record_01_TEST_0015FINFOF_8
Record_01_TEST_0016J
Record_01_TEST_0017K
Record_01_TEST_0018L
Record_02_TEST_0001ANC0013
Record_02_TEST_0002BRF: S01TX2WWS1
Record_02_TEST_0003CLASTNAME2, FIRSTNAME2 MIDDLENAME2
Record_02_TEST_0004CADDRESS1
Record_02_TEST_0005CADDRESS2
Record_02_TEST_0006D12211961 F
Record_02_TEST_0007EINFOE
Record_02_TEST_0008FINFOF_1
Record_02_TEST_0009GINFOG_1
Record_02_TEST_0010GINFOG_2
Record_02_TEST_0011J
Record_02_TEST_0012K
Record_02_TEST_0013L
Record_03_TEST_0001ANC0021
Record_03_TEST_0002BRF: S01TX2WWZJ
Record_03_TEST_0003CLASTNAME3, FIRSTNAME3 MIDDLENAME3
Record_03_TEST_0004CADDRESS1
Record_03_TEST_0005CADDRESS2
Record_03_TEST_0006CADDRESS3
Record_03_TEST_0007D092019955 ** M*****
Record_03_TEST_0008EINFOE
Record_03_TEST_0009FINFOF_1
Record_03_TEST_0010FINFOF_2
Record_03_TEST_0011FINFOF_3
Record_03_TEST_0012FINFOF_4
Record_03_TEST_0013FINFOF_5
Record_03_TEST_0014FINFOF_6
Record_03_TEST_0015HINFOH_1
Record_03_TEST_0016HINFOH_2
Record_03_TEST_0017HINFOH_3
Record_03_TEST_0018HINFOH_4
Record_03_TEST_0019J
Record_03_TEST_0020K
Record_03_TEST_0021L
##TEND##
I want to read this fixed width text file and use 20th character to conditionally create DataFrame.
CODE
import pandas as pd
import numpy as np
TYPE___A = {'ST': None, 'COUNT': None}
TYPE___B = {'KEY': None}
TYPE___C = {'C_INFO1': None, 'C_INFO2': None, 'C_INFO3': None, 'C_INFO4': None}
TYPE___D = {'DOB': None, 'GENDER': None}
TYPE___E = {'E_INFO': None}
TYPE___F = {'F_INFO1': None, 'F_INFO2': None, 'F_INFO3': None, 'F_INFO4': None, 'F_INFO5': None, 'F_INFO6': None, 'F_INFO7': None, 'F_INFO8': None, 'F_INFO9': None}
TYPE___G = {'G_INFO': None}
TYPE___J = {'J_INFO': None}
TYPE___K = {'K_INFO': None}
TYPE___L = {'L_INFO': None}
rowDict = {**TYPE___A, **TYPE___B, **TYPE___C, **TYPE___D, **TYPE___E, **TYPE___F, **TYPE___G, **TYPE___J, **TYPE___K, **TYPE___L}
output = pd.DataFrame(columns = rowDict.keys())
with open("test.txt", 'r') as file:
for line in file:
if line[:8] == "##TSTA##":
continue
elif line[19:20] == "A":
rowDict['ST'] = line[20:22]
rowDict['COUNT'] = line[22:26]
elif line[19:20] == "B":
rowDict['KEY'] = line[20:80]
elif line[19:20] == "C":
for i in range(0,4):
rowDict[f'C_INFO{i+1}'] = line[20:60]
elif line[19:20] == "D":
rowDict['DOB'] = line[20:28]
rowDict['GENDER'] = line[35:36]
elif line[19:20] == "E":
rowDict['E_INFO'] = line[20:39]
elif line[19:20] == "F":
for i in range(0,9):
rowDict[f'F_INFO{i+1}'] = line[20:60]
elif line[19:20] == "G":
rowDict['G_INFO'] = line[20:39]
elif line[19:20] == "J":
rowDict['J_INFO'] = line[20:39]
elif line[19:20] == "K":
rowDict['K_INFO'] = line[20:39]
elif line[19:20] == "L":
rowDict['L_INFO'] = line[20:39]
elif line[:8] == "##TEND##":
break
tmp = pd.DataFrame([rowDict])
output = pd.concat([output, tmp])
However, I should get 3 rows and 23 columns instead of 52 rows. I think my program read each line and append duplicated.
I changed my program to:
import pandas as pd
import numpy as np
TYPE___A = {'ST': None, 'COUNT': None}
TYPE___B = {'KEY': None}
TYPE___C = {'C_INFO1': None, 'C_INFO2': None, 'C_INFO3': None, 'C_INFO4': None}
TYPE___D = {'DOB': None, 'GENDER': None}
TYPE___E = {'E_INFO': None}
TYPE___F = {'F_INFO1': None, 'F_INFO2': None, 'F_INFO3': None, 'F_INFO4': None, 'F_INFO5': None, 'F_INFO6': None, 'F_INFO7': None, 'F_INFO8': None, 'F_INFO9': None}
TYPE___G = {'G_INFO': None}
TYPE___J = {'J_INFO': None}
TYPE___K = {'K_INFO': None}
TYPE___L = {'L_INFO': None}
output = pd.DataFrame()
with open(path + "test.txt", 'r') as file:
for line in file:
if line[:8] == "##TSTA##":
continue
elif line[19:20] == "A":
TYPE___A['ST'] = line[20:22]
TYPE___A['COUNT'] = line[22:26]
elif line[19:20] == "B":
TYPE___B['KEY'] = line[20:80]
elif line[19:20] == "C":
for i in range(0,4):
TYPE___C[f'C_INFO{i+1}'] = line[20:60]
elif line[19:20] == "D":
TYPE___D['DOB'] = line[20:28]
TYPE___D['GENDER'] = line[35:36]
elif line[19:20] == "E":
TYPE___E['E_INFO'] = line[20:39]
elif line[19:20] == "F":
for i in range(0,9):
TYPE___F[f'F_INFO{i+1}'] = line[20:60]
elif line[19:20] == "G":
TYPE___G['G_INFO'] = line[20:39]
elif line[19:20] == "J":
TYPE___J['J_INFO'] = line[20:39]
elif line[19:20] == "K":
TYPE___K['K_INFO'] = line[20:39]
elif line[19:20] == "L":
TYPE___L['L_INFO'] = line[20:39]
elif line[:8] == "##TEND##":
break
rowDict = {**TYPE___A, **TYPE___B, **TYPE___C, **TYPE___D, **TYPE___E, **TYPE___F, **TYPE___G, **TYPE___J, **TYPE___K, **TYPE___L}
tmp = pd.DataFrame([rowDict])
output = pd.concat([output, tmp])
Still wrong!
The logic for program is:
'ST'
and line[22:26] is column 'COUNT'
.'KEY'
.'C_INFO{i+1}'
. This part should be different with others. The final dataframe has C_INFO1
to C_INFO4
, and it meets first "C" then put in C_INFO1
, elif meets second "C" then put in C_INFO2
, elif meets third "C" then put in C_INFO3
, elif meets fourth "C" then put in C_INFO4
.'DOB'
and line[35:36] is column 'GENDER'
.'E_INFO'
.INFOG_2
.I updated the code and it output 3 rows. However, the C_INFO1
to C_INFO4
and F_INFO1
to F_INFO9
are not parsed correctly. It retrieved the last one.
import pandas as pd
import numpy as np
TYPE___A = {'ST': None, 'COUNT': None}
TYPE___B = {'KEY': None}
TYPE___C = {'C_INFO1': None, 'C_INFO2': None, 'C_INFO3': None, 'C_INFO4': None}
TYPE___D = {'DOB': None, 'GENDER': None}
TYPE___E = {'E_INFO': None}
TYPE___F = {'F_INFO1': None, 'F_INFO2': None, 'F_INFO3': None, 'F_INFO4': None, 'F_INFO5': None, 'F_INFO6': None, 'F_INFO7': None, 'F_INFO8': None, 'F_INFO9': None}
TYPE___G = {'G_INFO': None}
TYPE___J = {'J_INFO': None}
TYPE___K = {'K_INFO': None}
TYPE___L = {'L_INFO': None}
rowDict = {**TYPE___A, **TYPE___B, **TYPE___C, **TYPE___D, **TYPE___E, **TYPE___F, **TYPE___G, **TYPE___J, **TYPE___K, **TYPE___L}
output = pd.DataFrame(columns = rowDict.keys())
with open(path + "test.txt", 'r') as file:
for line in file:
if line[:8] == "##TSTA##":
continue
elif line[19:20] == "A":
ID_A = line[0:15]
TYPE___A['ST'] = line[20:22]
TYPE___A['COUNT'] = line[22:26]
elif (line[19:20] == "B") & (line[0:15] == ID_A):
TYPE___B['KEY'] = line[20:80]
elif (line[19:20] == "C") & (line[0:15] == ID_A):
for i in range(0,4):
TYPE___C[f'C_INFO{i+1}'] = line[20:60]
elif (line[19:20] == "D") & (line[0:15] == ID_A):
TYPE___D['DOB'] = line[20:28]
TYPE___D['GENDER'] = line[35:36]
elif (line[19:20] == "E") & (line[0:15] == ID_A):
TYPE___E['E_INFO'] = line[20:39]
elif (line[19:20] == "F") & (line[0:15] == ID_A):
for i in range(0,9):
TYPE___F[f'F_INFO{i+1}'] = line[20:60]
elif (line[19:20] == "G") & (line[0:15] == ID_A):
TYPE___G['G_INFO'] = line[20:39]
elif (line[19:20] == "J") & (line[0:15] == ID_A):
TYPE___J['J_INFO'] = line[20:39]
elif (line[19:20] == "K") & (line[0:15] == ID_A):
TYPE___K['K_INFO'] = line[20:39]
elif (line[19:20] == "L") & (line[0:15] == ID_A):
TYPE___L['L_INFO'] = line[20:39]
rowDict = {**TYPE___A, **TYPE___B, **TYPE___C, **TYPE___D, **TYPE___E, **TYPE___F, **TYPE___G, **TYPE___J, **TYPE___K, **TYPE___L}
tmp = pd.DataFrame([rowDict])
output = pd.concat([output, tmp])
elif line[:8] == "##TEND##":
break
Any Idea? Thanks!
Upvotes: 1
Views: 371
Reputation: 499
import pandas as pd
import numpy as np
def define_empty_dict():
TYPE___A = {'ST': None, 'COUNT': None}
TYPE___B = {'KEY': None}
TYPE___C = {'C_INFO1': None, 'C_INFO2': None, 'C_INFO3': None, 'C_INFO4': None}
TYPE___D = {'DOB': None, 'GENDER': None}
TYPE___E = {'E_INFO': None}
TYPE___F = {'F_INFO1': None, 'F_INFO2': None, 'F_INFO3': None, 'F_INFO4': None, 'F_INFO5': None, 'F_INFO6': None, 'F_INFO7': None, 'F_INFO8': None, 'F_INFO9': None}
TYPE___G = {'G_INFO': None}
TYPE___J = {'J_INFO': None}
TYPE___K = {'K_INFO': None}
TYPE___L = {'L_INFO': None}
return TYPE___A, TYPE___B, TYPE___C, TYPE___D, TYPE___E,TYPE___F, TYPE___G, TYPE___J, TYPE___K, TYPE___L
TYPE___A, TYPE___B, TYPE___C, TYPE___D, TYPE___E,TYPE___F, TYPE___G, TYPE___J, TYPE___K, TYPE___L = define_empty_dict()
rowDict = {**TYPE___A, **TYPE___B, **TYPE___C, **TYPE___D, **TYPE___E, **TYPE___F, **TYPE___G, **TYPE___J, **TYPE___K, **TYPE___L}
output = pd.DataFrame(columns = rowDict.keys())
##
with open("test.txt", 'r') as file:
for i, line in enumerate(file):
if line[:8] == "##TSTA##":
continue
elif line[19:20] == "A":
ID_A = line[0:15]
TYPE___A['ST'] = line[20:22]
TYPE___A['COUNT'] = line[22:26]
elif (line[19:20] == "B") & (line[0:15] == ID_A):
TYPE___B['KEY'] = line[20:80]
elif (line[19:20] == "C") & (line[0:15] == ID_A):
if "number_ref_C" not in globals():
number_ref_C = int(line[15:19])
c = 1
TYPE___C[f'C_INFO{c}'] = line[20:60]
else :
c += 1
TYPE___C[f'C_INFO{c}'] = line[20:60]
elif (line[19:20] == "D") & (line[0:15] == ID_A):
TYPE___D['DOB'] = line[20:28]
TYPE___D['GENDER'] = line[35:36]
elif (line[19:20] == "E") & (line[0:15] == ID_A):
TYPE___E['E_INFO'] = line[20:39]
elif (line[19:20] == "F") & (line[0:15] == ID_A):
if "number_ref_F" not in globals():
number_ref_F = int(line[15:19])
f = 1
TYPE___F[f'F_INFO{f}'] = line[20:60]
else :
f += 1
TYPE___F[f'F_INFO{f}'] = line[20:60]
elif (line[19:20] == "G") & (line[0:15] == ID_A):
TYPE___G['G_INFO'] = line[20:39]
elif (line[19:20] == "J") & (line[0:15] == ID_A):
TYPE___J['J_INFO'] = line[20:39]
elif (line[19:20] == "K") & (line[0:15] == ID_A):
TYPE___K['K_INFO'] = line[20:39]
elif (line[19:20] == "L") & (line[0:15] == ID_A):
TYPE___L['L_INFO'] = line[20:39]
rowDict = {**TYPE___A, **TYPE___B, **TYPE___C, **TYPE___D, **TYPE___E, **TYPE___F, **TYPE___G, **TYPE___J, **TYPE___K, **TYPE___L}
tmp = pd.DataFrame([rowDict])
output = pd.concat([output, tmp])
TYPE___A, TYPE___B, TYPE___C, TYPE___D, TYPE___E,TYPE___F, TYPE___G, TYPE___J, TYPE___K, TYPE___L = define_empty_dict()
del number_ref_C, number_ref_F
elif line[:8] == "##TEND##":
break
pd.set_option('display.max_columns', None)
output
Upvotes: 1