Python clean text file to make it searchable

Question

I have a very messy text file that consists of both comma- and space-separated data and looks like the following:

NBLOCK,3,,13
(1i9,3e20.9e3)
        1     4.000000000E+01    -6.000000000E+01     0.000000000E+00
        2     4.000000000E+01     6.000000000E+01     0.000000000E+00
        3     4.000000000E+01    -2.000000000E+01     0.000000000E+00
        4     4.000000000E+01     2.000000000E+01     0.000000000E+00

I need to clean up the file such that I get an easily searchable 2d array using the following rules:

make letters lower case
delete repeating spaces
replace spaces by commas
delete comma if line is starting with a comma
convert to numpy 2d array

The output should look something like the following

my_array = [['nblock','3','','13'],
['(1i9','3e20.9e3)','',''],
['1','4.000000000e+01','-6.000000000e+01','0.000000000e+00'],
['2','4.000000000e+01','6.000000000e+01','0.000000000e+00'],
['3','4.000000000e+01','-2.000000000e+01','0.000000000e+00'],
['4','4.000000000e+01','2.000000000e+01','0.000000000e+00']]

Update: a more realistic example of the text file:

ET,       1, 42
KEYOP,   1, 2,        1
KEYOP,   1, 3,        3
RLBLOCK,       1,       1,       6,       7
N,R5.3,LOC, -1,
NBLOCK,3,,13
(1i9,3e20.9e3)
1     4.000000000E+01    -6.000000000E+01     0.000000000E+00
        2     4.000000000E+01     6.000000000E+01     0.000000000E+00
        3     4.000000000E+01    -2.000000000E+01     0.000000000E+00
        4     4.000000000E+01     2.000000000E+01     0.000000000E+00
-1
MPTEMP,R5.0, 1, 1,  0.00000000    ,
MPDATA,R5.0, 1,EX  ,       1, 1, 2.100000000E+11,
MPTEMP,R5.0, 1, 1,  0.00000000    ,
MPDATA,R5.0, 1,NUXY,       1, 1, 0.300000000    ,
MPTEMP,R5.0, 1, 1,  0.00000000    ,
MPDATA,R5.0, 1,DENS,       1, 1,  7800.00000    ,
MPTEMP,R5.0, 1, 1,  0.00000000    ,
MPDATA,R5.0, 1,PRXY,       1, 1, 0.300000000    ,
EXTOPT,ATTR,  0,  0,  0
EXTOPT,ESIZE,  0,  0.0000    
EXTOPT,ACLEAR,  0
BFUNIF,TEMP,_TINY

Another example:

DMPOPT,EMAT,NO
*IF,_CDRDOFF,EQ,1,THEN     !if solid model was read in
_CDRDOFF=             !reset flag, numoffs already performed
*ELSE              !offset database for the following FE model
NUMOFF,NODE,       12
NUMOFF,ELEM,        8
NUMOFF,MAT ,        1
NUMOFF,REAL,        1
NUMOFF,TYPE,        2
NUMOFF,CSYS,       12
*ENDIF
KUSE,     0
TIME,  0.00000000
NBLOCK,6,SOLID,        12,        12
(3i9,6e21.13e3)
        1        0        0 4.0000000000000E+001-6.0000000000000E+001
        2        0        0 4.0000000000000E+001 6.0000000000000E+001
        3        0        0 4.0000000000000E+001-2.0000000000000E+001
        4        0        0 4.0000000000000E+001 2.0000000000000E+001   
EBLOCK,19,SOLID,    1250
(19i8)
       1       1       1       1
       1       1       1       1       
       1       1       1       1
-1
N,R5.3,LOC,     -1,
MPDATA,R5.0, 1,EX  ,       1, 1, 2.100000000E+11,
MPTEMP,R5.0, 1, 1,  0.00000000    ,
MPDATA,R5.0, 1,NUXY,       1, 1, 0.300000000    ,
MPTEMP,R5.0, 1, 1,  0.00000000    ,

MPDATA,R5.0, 1,DENS,       1, 1,  7800.00000    ,
MPTEMP,R5.0, 1, 1,  0.00000000    ,
MPDATA,R5.0, 1,PRXY,       1, 1, 0.300000000

Tim Roberts · Accepted Answer

Consider this kind of solution:

import numpy as np
from pprint import pprint

class Block:
    """Collect the commands and numeric blocks of a keyword-style text file.

    After ``ingest`` has run, ``data`` maps each lower-cased command keyword
    to a list with one entry per occurrence of that command; each entry is
    the list of comma-separated arguments that followed the keyword.  When a
    command is followed by a parenthesised format line, two more entries are
    appended to that command's list: the format specification and a list
    that receives the numeric rows which follow.
    """

    def __init__(self):
        self.data = {}
        # Kept for compatibility; nothing in this class fills it.
        self.array = []
        # Keyword of the most recent command line, and the row list opened
        # by the most recent format line.  Stored on the instance so that
        # input may be fed to ``ingest`` in several calls.
        self._key = None
        self._rows = None

    def ingest(self, lines):
        """Parse ``lines`` and merge the result into ``self.data``.

        Args:
            lines: any iterable of strings (an open text file, a list of
                lines, ...).  Trailing newlines are tolerated.

        Raises:
            ValueError: if a format line appears before any command line,
                if a numeric row appears before any format line, or if a
                field of a numeric row cannot be converted to ``float``.
        """
        for line in lines:
            # Blank or whitespace-only lines carry no data; without this
            # guard they would reach the numeric branch and fail on
            # ``parts[0]``.
            if not line.strip():
                continue

            if 'A' <= line[0] <= 'Z':
                # Command line: KEYWORD,arg,arg,...  Plain digit strings
                # become ints; everything else stays a stripped string.
                parts = [k.strip() for k in line.split(',')]
                parts = [int(k) if k.isdigit() else k for k in parts]
                key = parts[0].lower()
                self.data.setdefault(key, []).append(parts[1:])
                self._key = key
            elif line[0] == '(':
                # Format line: belongs to the command seen just before it
                # (not necessarily 'nblock'), and opens a fresh row list so
                # that repeated blocks do not share rows.
                if self._key is None:
                    raise ValueError(
                        'format line %r appears before any command line'
                        % line.strip()
                    )
                self._rows = []
                self.data[self._key].append([line.strip().split(',')])
                self.data[self._key].append(self._rows)
            else:
                # Numeric row: whitespace separated; the first column is
                # dropped and the rest is converted to float.  A lone
                # '-1' marker line is ignored.
                parts = line.split()
                if parts[0] != '-1':
                    if self._rows is None:
                        raise ValueError(
                            'data row %r appears before any format line'
                            % line.strip()
                        )
                    self._rows.append([float(k) for k in parts[1:]])

# Parse 'x.txt' and pretty-print the resulting dictionary.  The file is
# opened in a ``with`` block so the handle is closed as soon as parsing
# finishes, even if ``ingest`` raises.
blk = Block()
with open('x.txt') as handle:
    blk.ingest(handle)
pprint(blk.data)

Output:

{'bfunif': [['TEMP', '_TINY']],
 'et': [[1, 42]],
 'extopt': [['ATTR', 0, 0, 0], ['ESIZE', 0, '0.0000'], ['ACLEAR', 0]],
 'keyop': [[1, 2, 1], [1, 3, 3]],
 'mpdata': [['R5.0', 1, 'EX', 1, 1, '2.100000000E+11', ''],
            ['R5.0', 1, 'NUXY', 1, 1, '0.300000000', ''],
            ['R5.0', 1, 'DENS', 1, 1, '7800.00000', ''],
            ['R5.0', 1, 'PRXY', 1, 1, '0.300000000', '']],
 'mptemp': [['R5.0', 1, 1, '0.00000000', ''],
            ['R5.0', 1, 1, '0.00000000', ''],
            ['R5.0', 1, 1, '0.00000000', ''],
            ['R5.0', 1, 1, '0.00000000', '']],
 'n': [['R5.3', 'LOC', '-1', '']],
 'nblock': [[3, '', 13],
            [['(1i9', '3e20.9e3)']],
            [[40.0, -60.0, 0.0],
             [40.0, 60.0, 0.0],
             [40.0, -20.0, 0.0],
             [40.0, 20.0, 0.0]]],
 'rlblock': [[1, 1, 6, 7]]}

Note that `ingest` accepts anything that iterates over strings: here I pass an open file, but a list of lines works just as well.

Python clean text file to make it searchable

Answers (1)

Related Questions