Peter Chen
Peter Chen

Reputation: 1484

Python conditionally read fixed width text file and create DataFrame

I have multiple text files but I take 1 as example (name as test.txt):

test.txt

##TSTA##
Record_01_TEST_0001ACA0018
Record_01_TEST_0002BRF:                                       S01TX2OL7L                            
Record_01_TEST_0003CLASTNAME1, FIRSTNAME1 MIDDLENAME1                                                               
Record_01_TEST_0004CADDRESS1                                                               
Record_01_TEST_0005CADDRESS2                                                               
Record_01_TEST_0006D03141979*******M                                                          
Record_01_TEST_0007EINFOE
Record_01_TEST_0008FINFOF_1
Record_01_TEST_0009FINFOF_2
Record_01_TEST_0010FINFOF_3
Record_01_TEST_0011FINFOF_4
Record_01_TEST_0012FINFOF_5
Record_01_TEST_0013FINFOF_6
Record_01_TEST_0014FINFOF_7
Record_01_TEST_0015FINFOF_8
Record_01_TEST_0016J
Record_01_TEST_0017K
Record_01_TEST_0018L
Record_02_TEST_0001ANC0013
Record_02_TEST_0002BRF:                                       S01TX2WWS1                            
Record_02_TEST_0003CLASTNAME2, FIRSTNAME2 MIDDLENAME2                                                               
Record_02_TEST_0004CADDRESS1                                                               
Record_02_TEST_0005CADDRESS2                                                               
Record_02_TEST_0006D12211961       F                                                                
Record_02_TEST_0007EINFOE
Record_02_TEST_0008FINFOF_1
Record_02_TEST_0009GINFOG_1
Record_02_TEST_0010GINFOG_2
Record_02_TEST_0011J
Record_02_TEST_0012K
Record_02_TEST_0013L
Record_03_TEST_0001ANC0021
Record_03_TEST_0002BRF:                                       S01TX2WWZJ                            
Record_03_TEST_0003CLASTNAME3, FIRSTNAME3 MIDDLENAME3                                                               
Record_03_TEST_0004CADDRESS1
Record_03_TEST_0005CADDRESS2
Record_03_TEST_0006CADDRESS3
Record_03_TEST_0007D092019955 **   M*****
Record_03_TEST_0008EINFOE
Record_03_TEST_0009FINFOF_1 
Record_03_TEST_0010FINFOF_2
Record_03_TEST_0011FINFOF_3
Record_03_TEST_0012FINFOF_4
Record_03_TEST_0013FINFOF_5
Record_03_TEST_0014FINFOF_6
Record_03_TEST_0015HINFOH_1
Record_03_TEST_0016HINFOH_2
Record_03_TEST_0017HINFOH_3
Record_03_TEST_0018HINFOH_4
Record_03_TEST_0019J
Record_03_TEST_0020K
Record_03_TEST_0021L
##TEND##

I want to read this fixed width text file and use 20th character to conditionally create DataFrame.

CODE

import pandas as pd
import numpy as np

TYPE___A = {'ST': None, 'COUNT': None}

TYPE___B = {'KEY': None}

TYPE___C = {'C_INFO1': None, 'C_INFO2': None, 'C_INFO3': None, 'C_INFO4': None}

TYPE___D = {'DOB': None, 'GENDER': None}

TYPE___E = {'E_INFO': None}

TYPE___F = {'F_INFO1': None, 'F_INFO2': None, 'F_INFO3': None, 'F_INFO4': None, 'F_INFO5': None, 'F_INFO6': None, 'F_INFO7': None, 'F_INFO8': None, 'F_INFO9': None}

TYPE___G = {'G_INFO': None}

TYPE___J = {'J_INFO': None}

TYPE___K = {'K_INFO': None}

TYPE___L = {'L_INFO': None}

rowDict = {**TYPE___A, **TYPE___B, **TYPE___C, **TYPE___D, **TYPE___E, **TYPE___F, **TYPE___G, **TYPE___J, **TYPE___K, **TYPE___L}

output = pd.DataFrame(columns = rowDict.keys())

with open("test.txt", 'r') as file:
    for line in file:
        if line[:8] == "##TSTA##":
            continue
        elif line[19:20] == "A":
            rowDict['ST']    = line[20:22]
            rowDict['COUNT'] = line[22:26]
        elif line[19:20] == "B":
            rowDict['KEY']  = line[20:80]
        elif line[19:20] == "C":
            for i in range(0,4):
                rowDict[f'C_INFO{i+1}'] = line[20:60]
        elif line[19:20] == "D":
            rowDict['DOB']  = line[20:28]
            rowDict['GENDER']  = line[35:36]
        elif line[19:20] == "E":
            rowDict['E_INFO']  = line[20:39]
        elif line[19:20] == "F":
            for i in range(0,9):
                rowDict[f'F_INFO{i+1}'] = line[20:60]
        elif line[19:20] == "G":
            rowDict['G_INFO']  = line[20:39]
        elif line[19:20] == "J":
            rowDict['J_INFO']  = line[20:39]
        elif line[19:20] == "K":
            rowDict['K_INFO']  = line[20:39]
        elif line[19:20] == "L":
            rowDict['L_INFO']  = line[20:39]        
        elif line[:8] == "##TEND##":
            break
        tmp = pd.DataFrame([rowDict])
        output = pd.concat([output, tmp])

However, I should get 3 rows and 23 columns instead of 52 rows. I think my program read each line and append duplicated.

I changed my program to:

import pandas as pd
import numpy as np

TYPE___A = {'ST': None, 'COUNT': None}

TYPE___B = {'KEY': None}

TYPE___C = {'C_INFO1': None, 'C_INFO2': None, 'C_INFO3': None, 'C_INFO4': None}

TYPE___D = {'DOB': None, 'GENDER': None}

TYPE___E = {'E_INFO': None}

TYPE___F = {'F_INFO1': None, 'F_INFO2': None, 'F_INFO3': None, 'F_INFO4': None, 'F_INFO5': None, 'F_INFO6': None, 'F_INFO7': None, 'F_INFO8': None, 'F_INFO9': None}

TYPE___G = {'G_INFO': None}

TYPE___J = {'J_INFO': None}

TYPE___K = {'K_INFO': None}

TYPE___L = {'L_INFO': None}

output = pd.DataFrame()

with open(path + "test.txt", 'r') as file:
    for line in file:
        if line[:8] == "##TSTA##":
            continue
        elif line[19:20] == "A":
            TYPE___A['ST']    = line[20:22]
            TYPE___A['COUNT'] = line[22:26]
        elif line[19:20] == "B":
            TYPE___B['KEY']  = line[20:80]
        elif line[19:20] == "C":
            for i in range(0,4):
                TYPE___C[f'C_INFO{i+1}'] = line[20:60]
        elif line[19:20] == "D":
            TYPE___D['DOB']  = line[20:28]
            TYPE___D['GENDER']  = line[35:36]
        elif line[19:20] == "E":
            TYPE___E['E_INFO']  = line[20:39]
        elif line[19:20] == "F":
            for i in range(0,9):
                TYPE___F[f'F_INFO{i+1}'] = line[20:60]
        elif line[19:20] == "G":
            TYPE___G['G_INFO']  = line[20:39]
        elif line[19:20] == "J":
            TYPE___J['J_INFO']  = line[20:39]
        elif line[19:20] == "K":
            TYPE___K['K_INFO']  = line[20:39]
        elif line[19:20] == "L":
            TYPE___L['L_INFO']  = line[20:39]        
        elif line[:8] == "##TEND##":
            break
        rowDict = {**TYPE___A, **TYPE___B, **TYPE___C, **TYPE___D, **TYPE___E, **TYPE___F, **TYPE___G, **TYPE___J, **TYPE___K, **TYPE___L}
        tmp = pd.DataFrame([rowDict])
        output = pd.concat([output, tmp])

Still wrong!

The logic for program is:

  1. If 20th character is "A", then line[20:22] is column 'ST' and line[22:26] is column 'COUNT'.
  2. If 20th character is "B", then line[20:80] is column 'KEY'.
  3. If 20th character is "C", then line[20:60] is column 'C_INFO{i+1}'. This part should be different with others. The final dataframe has C_INFO1 to C_INFO4, and it meets first "C" then put in C_INFO1, elif meets second "C" then put in C_INFO2, elif meets third "C" then put in C_INFO3, elif meets fourth "C" then put in C_INFO4.
  4. If 20th character is "D", then line[20:28] is column 'DOB' and line[35:36] is column 'GENDER'.
  5. If 20th character is "E", then line[20:39] is column 'E_INFO'.
  6. If 20th character is "F", the logic is similar to "C".
  7. If 20th character is "G", only take last to fill, so in this example, it should only use INFOG_2.
  8. If 20th character is "H", ignore.
  9. "J", "K", "L" are the same as normal logic.

I updated the code and it output 3 rows. However, the C_INFO1 to C_INFO4 and F_INFO1 to F_INFO9 are not parsed correctly. It retrieved the last one.

import pandas as pd
import numpy as np

TYPE___A = {'ST': None, 'COUNT': None}

TYPE___B = {'KEY': None}

TYPE___C = {'C_INFO1': None, 'C_INFO2': None, 'C_INFO3': None, 'C_INFO4': None}

TYPE___D = {'DOB': None, 'GENDER': None}

TYPE___E = {'E_INFO': None}

TYPE___F = {'F_INFO1': None, 'F_INFO2': None, 'F_INFO3': None, 'F_INFO4': None, 'F_INFO5': None, 'F_INFO6': None, 'F_INFO7': None, 'F_INFO8': None, 'F_INFO9': None}

TYPE___G = {'G_INFO': None}

TYPE___J = {'J_INFO': None}

TYPE___K = {'K_INFO': None}

TYPE___L = {'L_INFO': None}

rowDict = {**TYPE___A, **TYPE___B, **TYPE___C, **TYPE___D, **TYPE___E, **TYPE___F, **TYPE___G, **TYPE___J, **TYPE___K, **TYPE___L}
output = pd.DataFrame(columns = rowDict.keys())

with open(path + "test.txt", 'r') as file:
    for line in file:
        if line[:8] == "##TSTA##":
            continue
        elif line[19:20] == "A":
            ID_A = line[0:15]
            TYPE___A['ST']    = line[20:22]
            TYPE___A['COUNT'] = line[22:26]
        elif (line[19:20] == "B") & (line[0:15] == ID_A):
            TYPE___B['KEY']  = line[20:80]
        elif (line[19:20] == "C") & (line[0:15] == ID_A):
            for i in range(0,4):
                TYPE___C[f'C_INFO{i+1}'] = line[20:60]
        elif (line[19:20] == "D") & (line[0:15] == ID_A):
            TYPE___D['DOB']  = line[20:28]
            TYPE___D['GENDER']  = line[35:36]
        elif (line[19:20] == "E") & (line[0:15] == ID_A):
            TYPE___E['E_INFO']  = line[20:39]
        elif (line[19:20] == "F") & (line[0:15] == ID_A):
            for i in range(0,9):
                TYPE___F[f'F_INFO{i+1}'] = line[20:60]
        elif (line[19:20] == "G") & (line[0:15] == ID_A):
            TYPE___G['G_INFO']  = line[20:39]
        elif (line[19:20] == "J") & (line[0:15] == ID_A):
            TYPE___J['J_INFO']  = line[20:39]
        elif (line[19:20] == "K") & (line[0:15] == ID_A):
            TYPE___K['K_INFO']  = line[20:39]
        elif (line[19:20] == "L") & (line[0:15] == ID_A):
            TYPE___L['L_INFO']  = line[20:39]
            rowDict = {**TYPE___A, **TYPE___B, **TYPE___C, **TYPE___D, **TYPE___E, **TYPE___F, **TYPE___G, **TYPE___J, **TYPE___K, **TYPE___L}
            tmp = pd.DataFrame([rowDict])
            output = pd.concat([output, tmp])
        elif line[:8] == "##TEND##":
            break

The desired layout should be:enter image description here

Any Idea? Thanks!

Upvotes: 1

Views: 371

Answers (1)

Denny Chen
Denny Chen

Reputation: 499

Here you go.enter image description here

import pandas as pd
import numpy as np

def define_empty_dict():
    TYPE___A = {'ST': None, 'COUNT': None}

    TYPE___B = {'KEY': None}

    TYPE___C = {'C_INFO1': None, 'C_INFO2': None, 'C_INFO3': None, 'C_INFO4': None}

    TYPE___D = {'DOB': None, 'GENDER': None}

    TYPE___E = {'E_INFO': None}

    TYPE___F = {'F_INFO1': None, 'F_INFO2': None, 'F_INFO3': None, 'F_INFO4': None, 'F_INFO5': None, 'F_INFO6': None, 'F_INFO7': None, 'F_INFO8': None, 'F_INFO9': None}

    TYPE___G = {'G_INFO': None}

    TYPE___J = {'J_INFO': None}

    TYPE___K = {'K_INFO': None}

    TYPE___L = {'L_INFO': None}
    return TYPE___A, TYPE___B, TYPE___C, TYPE___D, TYPE___E,TYPE___F, TYPE___G, TYPE___J, TYPE___K, TYPE___L

TYPE___A, TYPE___B, TYPE___C, TYPE___D, TYPE___E,TYPE___F, TYPE___G, TYPE___J, TYPE___K, TYPE___L = define_empty_dict()
rowDict = {**TYPE___A, **TYPE___B, **TYPE___C, **TYPE___D, **TYPE___E, **TYPE___F, **TYPE___G, **TYPE___J, **TYPE___K, **TYPE___L}
output = pd.DataFrame(columns = rowDict.keys())

##


with open("test.txt", 'r') as file:

    for i, line in enumerate(file):

        if line[:8] == "##TSTA##":
            continue
        elif line[19:20] == "A":
            ID_A = line[0:15]
            TYPE___A['ST']    = line[20:22]
            TYPE___A['COUNT'] = line[22:26]
        elif (line[19:20] == "B") & (line[0:15] == ID_A):
            TYPE___B['KEY']  = line[20:80]
        elif (line[19:20] == "C") & (line[0:15] == ID_A):
            if "number_ref_C" not in globals():
                number_ref_C = int(line[15:19])
                c = 1
                TYPE___C[f'C_INFO{c}'] = line[20:60]
            else :
                c += 1
                TYPE___C[f'C_INFO{c}'] = line[20:60]
        elif (line[19:20] == "D") & (line[0:15] == ID_A):
            TYPE___D['DOB']  = line[20:28]
            TYPE___D['GENDER']  = line[35:36]
        elif (line[19:20] == "E") & (line[0:15] == ID_A):
            TYPE___E['E_INFO']  = line[20:39]
        elif (line[19:20] == "F") & (line[0:15] == ID_A):
            if "number_ref_F" not in globals():
                number_ref_F = int(line[15:19])
                f = 1
                TYPE___F[f'F_INFO{f}'] = line[20:60]
            else :
                f += 1
                TYPE___F[f'F_INFO{f}'] = line[20:60]
        elif (line[19:20] == "G") & (line[0:15] == ID_A):
            TYPE___G['G_INFO']  = line[20:39]
        elif (line[19:20] == "J") & (line[0:15] == ID_A):
            TYPE___J['J_INFO']  = line[20:39]
        elif (line[19:20] == "K") & (line[0:15] == ID_A):
            TYPE___K['K_INFO']  = line[20:39]
        elif (line[19:20] == "L") & (line[0:15] == ID_A):
            TYPE___L['L_INFO']  = line[20:39]
            rowDict = {**TYPE___A, **TYPE___B, **TYPE___C, **TYPE___D, **TYPE___E, **TYPE___F, **TYPE___G, **TYPE___J, **TYPE___K, **TYPE___L}
            tmp = pd.DataFrame([rowDict])
            output = pd.concat([output, tmp])
            TYPE___A, TYPE___B, TYPE___C, TYPE___D, TYPE___E,TYPE___F, TYPE___G, TYPE___J, TYPE___K, TYPE___L = define_empty_dict()
            del number_ref_C, number_ref_F
        elif line[:8] == "##TEND##":
            break

pd.set_option('display.max_columns', None)
output

Upvotes: 1

Related Questions