vincent
vincent

Reputation: 1678

extract and split the content of a specific column into a new csv file with new columnser

I have a column (second column called second_column) in my csv file which represents à list of characters and its positions as follow: the column called character_position

Each line of this column contains a list of character_position . overall l have 300 lines in this column each with list of character position

character_position = [['1', 1890, 1904, 486, 505, '8', 1905, 1916, 486, 507, '4', 1919, 1931, 486, 505, '1', 1935, 1947, 486, 505, '7', 1950, 1962, 486, 505, '2', 1965, 1976, 486, 505, '9', 1980, 1992, 486, 507, '6', 1995, 2007, 486, 505, '/', 2010, 2022, 484, 508, '4', 2025, 2037, 486, 505, '8', 2040, 2052, 486, 505, '3', 2057, 2067, 486, 507, '3', 2072, 2082, 486, 505, '0', 2085, 2097, 486, 507, '/', 2100, 2112, 484, 508, 'Q', 2115, 2127, 486, 507, '1', 2132, 2144, 486, 505, '7', 2147, 2157, 486, 505, '9', 2162, 2174, 486, 505, '/', 2175, 2189, 484, 508, 'C', 2190, 2204, 487, 505, '4', 2207, 2219, 486, 505, '1', 2241, 2253, 486, 505, '/', 2255, 2268, 484, 508, '1', 2271, 2285, 486, 507, '5', 2288, 2297, 486, 505], ['D', 2118, 2132, 519, 535, '.', 2138, 2144, 529, 534, '2', 2150, 2162, 516, 535, '0', 2165, 2177, 516, 535, '4', 2180, 2192, 516, 534, '7', 2196, 2208, 516, 534, '0', 2210, 2223, 514, 535, '1', 2226, 2238, 516, 534, '8', 2241, 2253, 514, 534, '2', 2256, 2267, 514, 535, '4', 2270, 2282, 516, 534, '0', 2285, 2298, 514, 535]]

each character has for values : left, top, right, bottom. For instance character '1' has left=1890, top=1904, right=486, bottom=505.

l want to create another csv file with five columns :

column 1:  character, column 2 : left , column 3 : top, column 4 : right, column 5 : bottom.

Here is what l have done :

import pandas as pd :
character= []
left_position = []
right_position = []
top_position=[]
bottom_position []
temp_df = pd.DataFrame({'character': position_char, 'left': left_position, 'top': top_position, 'right' : right_position,'bottom': bottom_position})
temp_df.to_csv('character_position.csv')

my file that contains the column of characters_position in one column can be accessed as follow :

import csv
with open('list_characters.csv') as csvfile:
    readCSV = csv.reader(csvfile, delimiter=',')
    for row in readCSV:
        content = list(row[i] for i in second_column)

How can l extract and split the content of the second_column of the file list_characters.csv into a new csv file character_position.csv with five column ?

My first row of the column character_position is as follow :

[['m', 38, 104, 2456, 2492, 'i', 40, 102, 2442, 2448, 'i', 40, 100, 2402, 2410, 'l', 40, 102, 2372, 2382, 'm', 40, 102, 2312, 2358, 'u', 40, 102, 2292, 2310, 'i', 40, 104, 2210, 2260, 'l', 40, 104, 2180, 2208, 'i', 40, 104, 2140, 2166, 'l', 40, 104, 2124, 2134]]

My second row of the column character_position is as follow :

[['A', 73, 91, 373, 394, 'D', 93, 112, 373, 396, 'R', 115, 133, 373, 396, 'E', 136, 153, 373, 396, 'S', 156, 172, 373, 396, 'S', 175, 192, 373, 396, 'E', 195, 211, 373, 396, 'D', 222, 241, 373, 396, 'E', 244, 261, 373, 396, 'L', 272, 285, 375, 396, 'I', 288, 293, 375, 396, 'V', 296, 314, 375, 396, 'R', 317, 334, 373, 396, 'A', 334, 354, 375, 396, 'I', 357, 360, 373, 396, 'S', 365, 381, 373, 396, 'O', 384, 405, 373, 396, 'N', 408, 425, 373, 394]]

and here is how l access to character position column :

df = pd.read_csv(filepath_or_buffer='list_characters.csv', header=None, usecols=[1], names=['character_position])

df
                                     character_position
    0   [['m', 38, 104, 2456, 2492, 'i', 40, 102, 2442...
    1   [['.', 203, 213, 191, 198, '3', 235, 262, 131,...
    2   [['A', 275, 347, 147, 239, 'M', 363, 465, 145,...
    3   [['A', 73, 91, 373, 394, 'D', 93, 112, 373, 39...
    4   [['D', 454, 473, 663, 685, 'O', 474, 495, 664,...
    5   [['A', 108, 129, 727, 751, 'V', 129, 150, 727,...
    6   [['N', 34, 51, 949, 970, '/', 52, 61, 948, 970...
    7   [['S', 1368, 1401, 43, 85, 'A', 1406, 1446, 43...
    8   [['S', 1437, 1457, 112, 138, 'o', 1458, 1476, ...
    9   [['h', 1686, 1703, 315, 339, 't', 1706, 1715, ...
    10  [['N', 1331, 1349, 370, 391, 'C', 1361, 1379, ...
    11  [['N', 1758, 1775, 370, 391, 'D', 1785, 1803, ...
    12  [['D', 2166, 2184, 370, 391, 'A', 2186, 2205, ...
    13  [['2', 1395, 1415, 427, 454, '0', 1416, 1434, ...
    14  [['I', 1533, 1545, 487, 541, 'I', 1548, 1551, ...
    15  [['P', 1659, 1677, 490, 514, '2', 1680, 1697, ...
    16  [['1', 1890, 1904, 486, 505, '8', 1905, 1916, ...
    17  [['B', 1344, 1361, 583, 607, 'O', 1364, 1386, ...
    18  [['B', 1548, 1580, 979, 1015, 'T', 1586, 1619,...
    19  [['Q', 169, 190, 1291, 1312, 'U', 192, 210, 12...
    20  [['1', 296, 305, 1492, 1516, 'S', 339, 357, 14...
    21  [['G', 339, 362, 1815, 1840, 'S', 365, 384, 18...
    22  [['2', 1440, 1455, 2047, 2073, '9', 1458, 1475...
    23  [['R', 339, 360, 2137, 2163, 'e', 363, 378, 21...
    24  [['R', 339, 360, 1860, 1885, 'e', 363, 380, 18...
    25  [['0', 1266, 1283, 1951, 1977, ',', 1287, 1290...
    26  [['1', 2207, 2217, 1492, 1515, '0', 2225, 2240...
    27  [['1', 2364, 2382, 1552, 1585], [], ['E', 2369...
    28                    [['S', 2369, 2382, 1833, 1866]]
    29  [['0', 2243, 2259, 1951, 1977, '0', 2271, 2288...
    ..                                                ...
    70  [['1', 296, 305, 1492, 1516, 'S', 339, 357, 14...
    71  [['G', 339, 362, 1815, 1840, 'S', 365, 384, 18...
    72  [['2', 1440, 1455, 2047, 2073, '9', 1458, 1475...
    73  [['R', 339, 360, 2137, 2163, 'e', 363, 378, 21...
    74  [['R', 339, 360, 1860, 1885, 'e', 363, 380, 18...
    75  [['0', 1266, 1283, 1951, 1977, ',', 1287, 1290...
    76  [['1', 2207, 2217, 1492, 1515, '0', 2225, 2240...
    77  [['1', 2364, 2382, 1552, 1585], [], ['E', 2369...
    78                    [['S', 2369, 2382, 1833, 1866]]
    79  [['0', 2243, 2259, 1951, 1977, '0', 2271, 2288...
    80  [['0', 2243, 2259, 2227, 2253, '0', 2271, 2286...
    81  [['D', 76, 88, 2580, 2596, 'é', 91, 100, 2580,...
    82  [['ü', 1474, 1489, 2586, 2616, '3', 1541, 1557...
    83  [['E', 1440, 1461, 2670, 2697, 'U', 1466, 1488...
    84  [['2', 1685, 1703, 2670, 2697, '.', 1707, 1712...
    85  [['1', 2202, 2213, 2668, 2695, '3', 2220, 2237...
    86                       [['c', 88, 118, 2872, 2902]]
    87  [['N', 127, 144, 2889, 2910, 'D', 156, 175, 28...
    88  [['E', 108, 129, 3144, 3172, 'C', 133, 156, 31...
    89  [['5', 108, 126, 3204, 3231, '0', 129, 147, 32...
    90                                               [[]]
    91  [['1', 480, 492, 3202, 3229, '6', 500, 518, 32...
    92  [['P', 217, 234, 3337, 3360, 'A', 235, 255, 33...
    93                                               [[]]
    94  [['I', 954, 963, 2892, 2934, 'M', 969, 1011, 2...
    95  [['E', 1385, 1407, 2970, 2998, 'U', 1410, 1433...
    96  [['T', 2067, 2084, 2889, 2911, 'O', 2088, 2106...
    97  [['1', 2201, 2213, 2970, 2997, '6', 2219, 2238...
    98  [['M', 1734, 1755, 3246, 3267, 'O', 1758, 1779...
    99  [['L', 923, 935, 3411, 3430, 'A', 941, 957, 34...

when l try :

df['character_position'][1]
"[['.', 203, 213, 191, 198, '3', 235, 262, 131, 198]]"

df['character_position'][2]
"[['A', 275, 347, 147, 239, 'M', 363, 465, 145, 239, 'S', 485, 549, 145, 243, 'U', 569, 631, 145, 241, 'N', 657, 733, 145, 239]]"

then apply as you said :

df=pd.DataFrame(np.array(list(chain.from_iterable( df['character_position'][2]))).reshape(-1,5), 
                   columns=cols)

l get this error :

Traceback (most recent call last):
  File "/usr/lib/python3.5/code.py", line 91, in runcode
    exec(code, self.locals)
  File "<input>", line 1, in <module>
ValueError: cannot reshape array of size 127 into shape (5)

in order to check l did this :

Y=df['character_position'][2]

 "[['A', 275, 347, 147, 239, 'M', 363, 465, 145, 239, 'S', 485, 549, 145, 243, 'U', 569, 631, 145, 241, 'N', 657, 733, 145, 239]]"

then :

Y[0] gives '['

But we are supposed to get

['A', 275, 347, 147, 239, 'M', 363, 465, 145, 239, 'S', 485, 549, 145, 243, 'U', 569, 631, 145, 241, 'N', 657, 733, 145, 239]

since it's a nested list.

l think we need a loop which iterate the whole rows of that column :

df.ix[:len(df)]
 df=pd.DataFrame(np.array(list(chain.from_iterable( df['character_position'].ix[:len(df)]))).reshape(-1,5), 
                       columns=cols)

After executing the following code :

cols = ['char','left','top','right','bottom']
df1 = df.character_position.str.strip('[]').str.split(', ', expand=True)
df1.columns = [df1.columns % 5, df1.columns // 5]
df1 = df1.stack().reset_index(drop=True)
df1.columns = cols
df1[cols[1:]] = df1[cols[1:]].astype(int)

l got this error :

Traceback (most recent call last):
  File "/usr/lib/python3.5/code.py", line 91, in runcode
    exec(code, self.locals)
  File "<input>", line 1, in <module>
  File "/usr/local/lib/python3.5/dist-packages/pandas/core/generic.py", line 3054, in astype
    raise_on_error=raise_on_error, **kwargs)
  File "/usr/local/lib/python3.5/dist-packages/pandas/core/internals.py", line 3189, in astype
    return self.apply('astype', dtype=dtype, **kwargs)
  File "/usr/local/lib/python3.5/dist-packages/pandas/core/internals.py", line 3056, in apply
    applied = getattr(b, f)(**kwargs)
  File "/usr/local/lib/python3.5/dist-packages/pandas/core/internals.py", line 461, in astype
    values=values, **kwargs)
  File "/usr/local/lib/python3.5/dist-packages/pandas/core/internals.py", line 504, in _astype
    values = _astype_nansafe(values.ravel(), dtype, copy=True)
  File "/usr/local/lib/python3.5/dist-packages/pandas/types/cast.py", line 534, in _astype_nansafe
    return lib.astype_intsafe(arr.ravel(), dtype).reshape(arr.shape)
  File "pandas/lib.pyx", line 980, in pandas.lib.astype_intsafe (pandas/lib.c:17409)
  File "pandas/src/util.pxd", line 93, in util.set_value_at_unsafe (pandas/lib.c:72104)
ValueError: invalid literal for int() with base 10: "['E'"

The code runs until df1.columns = cols

when l print df1 :

    print(df1)
it works. The l saved it in a csv file.

    df1.to_csv('positions.csv')

Upvotes: 2

Views: 238

Answers (1)

jezrael
jezrael

Reputation: 862851

I think you need first convert strings to lists with strip and split.

Then use chain for falttening values and reshape. Last if necessary convert columns to int.

from  itertools import chain

#temporaly display content
with pd.option_context('display.max_colwidth', 120):
    print (df)
                                      character_position
0  [['m', 38, 104, 2456, 2492, 'i', 40, 102, 2442, 222]]
1  [['.', 203, 213, 191, 198, '3', 235, 262, 131, 3333]]
2  [['A', 275, 347, 147, 239, 'M', 363, 465, 145, 3334]]
3       [['A', 73, 91, 373, 394, 'D', 93, 112, 373, 39]]
4    [['D', 454, 473, 663, 685, 'O', 474, 495, 664, 33]]
5   [['A', 108, 129, 727, 751, 'V', 129, 150, 727, 444]]

df.character_position = df.character_position.str.strip('[]').str.split(', ')

cols = ['char','left','top','right','bottom']
df1 = pd.DataFrame(np.array(list(chain.from_iterable(df.character_position))).reshape(-1,5), 
                   columns=cols)
df1[cols[1:]] = df1[cols[1:]].astype(int)
print (df1)

0   'm'    38  104   2456    2492
1   'i'    40  102   2442     222
2   '.'   203  213    191     198
3   '3'   235  262    131    3333
4   'A'   275  347    147     239
5   'M'   363  465    145    3334
6   'A'    73   91    373     394
7   'D'    93  112    373      39
8   'D'   454  473    663     685
9   'O'   474  495    664      33
10  'A'   108  129    727     751
11  'V'   129  150    727     444

Another solution with MultiIndex and reshape by stack:

cols = ['char','left','top','right','bottom']
df1 = df.character_position.str.strip('[]').str.split(', ', expand=True)
df1.columns = [df1.columns % 5, df1.columns // 5]
df1 = df1.stack().reset_index(drop=True)
df1.columns = cols
df1[cols[1:]] = df1[cols[1:]].astype(int)
print (df1)
   char  left  top  right  bottom
0   'm'    38  104   2456    2492
1   'i'    40  102   2442     222
2   '.'   203  213    191     198
3   '3'   235  262    131    3333
4   'A'   275  347    147     239
5   'M'   363  465    145    3334
6   'A'    73   91    373     394
7   'D'    93  112    373      39
8   'D'   454  473    663     685
9   'O'   474  495    664      33
10  'A'   108  129    727     751
11  'V'   129  150    727     444

Upvotes: 1

Related Questions