Vivek Sable
Vivek Sable

Reputation: 10223

How to add values to row?

How do I add values to the rows of a DataFrame column?

  1. I created a new column in the data frame and initialised it to 0.
  2. I wrote logic to update this new column's values, but the changes are not reflected in the data frame.

Input:

>>> parafix_df = main_df[["line_width", "para_num", "bbox" ]]
>>> parafix_df
   line_width para_num                             bbox
0     238.546      NaN  (50.0, 579.3, 288.546, 598.022)
1         318        1    (64.0, 564.9, 382.0, 583.622)
2         332        2    (50.0, 550.5, 382.0, 569.222)
3         332        2    (50.0, 536.1, 382.0, 554.822)
4     328.977        2  (50.0, 521.7, 378.977, 540.422)
5         318        3    (64.0, 507.3, 382.0, 526.022)
6         332        3    (50.0, 492.9, 382.0, 511.622)
7         332        3    (50.0, 478.5, 382.0, 497.222)
8         332        3    (50.0, 464.1, 382.0, 482.822)
9         332        3    (50.0, 449.7, 382.0, 468.422)
10      59.04        3   (50.0, 435.3, 109.04, 454.022)
11    304.007        4  (64.0, 420.9, 368.007, 439.622)
12        318        5    (64.0, 406.5, 382.0, 425.222)
13        332        5    (50.0, 392.1, 382.0, 410.822)
14        332        5    (50.0, 377.7, 382.0, 396.422)
15        332        5    (50.0, 363.3, 382.0, 382.022)
16     43.252        5   (50.0, 348.9, 93.252, 367.622)
17        318        6    (64.0, 334.5, 382.0, 353.222)
18        332        6    (50.0, 320.1, 382.0, 338.822)
19        332        6    (50.0, 305.7, 382.0, 324.422)
20        332        6    (50.0, 291.3, 382.0, 310.022)
21        332        6    (50.0, 276.9, 382.0, 295.622)
22     317.02        6   (50.0, 262.5, 367.02, 281.222)
23        318        7    (64.0, 248.1, 382.0, 266.822)
24        332        7    (50.0, 233.7, 382.0, 252.422)
25     47.014        7   (50.0, 219.3, 97.014, 238.022)
26        318        8    (64.0, 204.9, 382.0, 223.622)
27    316.723        8  (50.0, 190.5, 366.723, 209.222)
28        318        9    (64.0, 176.1, 382.0, 194.822)
29    326.766        9  (50.0, 161.7, 376.766, 180.422)
30        318       10    (64.0, 147.3, 382.0, 166.022)
31        332       10    (50.0, 132.9, 382.0, 151.622)
32        332       10    (50.0, 118.5, 382.0, 137.222)
33    305.393       11  (64.0, 104.1, 369.393, 122.822)
34        318       12     (64.0, 89.7, 382.0, 108.422)
35        318       13      (64.0, 75.3, 382.0, 94.022)
36    319.165       13    (50.0, 60.9, 369.165, 79.622)
37    308.165       14    (64.0, 46.5, 372.165, 65.222)
38        318       15      (64.0, 32.1, 382.0, 50.822)
39    329.153       15    (50.0, 17.7, 379.153, 36.422)
40        318       16       (64.0, 3.3, 382.0, 22.022)
41    324.335       16    (50.0, -11.1, 374.335, 7.622)

Code:

# Attempt to number paragraphs by walking the rows and writing the running
# paragraph counter into a new column, new_para_num.
parafix_df = main_df[["line_text", "line_width", "para_num", "bbox" ]]
# New column, initialised to 0 for every row.
parafix_df["new_para_num"] = 0

# bbox of the first row whose line_width equals the maximum line_width.
# NOTE(review): `selected` is not defined anywhere in this snippet --
# presumably parafix_df was intended; confirm.
max_width = parafix_df['line_width'].max()
bbox_max_width = parafix_df.loc[selected['line_width'] == max_width].iloc[0]["bbox"]

previous = None
para1 = 1
# Pair each row with the row after it. izip is not imported in this snippet
# (presumably itertools.izip, Python 2). It stops at the shorter iterator, so
# the last row is never visited as `current`.
for current, next in izip(parafix_df.iterrows(), parafix_df.iloc[1:].iterrows()):
    # `current` is an (index, row Series) tuple yielded by iterrows().
    if previous==None:
        # First iteration only. This assigns into the yielded row Series, not
        # into parafix_df, so the data frame's column keeps its value of 0.
        current[1]["new_para_num"] = para1
    else:
        bbox_current = current[1]["bbox"]
        # bbox_next and bbox_previous are read here but never used below.
        bbox_next = next[1]["bbox"]
        bbox_previous = previous[1]["bbox"]
        # Bump the counter when this row's bbox[0] is greater than bbox[0] of
        # the max-width row.
        if bbox_current[0]>bbox_max_width[0]:
            para1 += 1
            print "para1:", para1
        # Same as above: the write goes to the yielded row Series only.
        current[1]["new_para_num"] = para1

    previous = current

Output of above code:

                              bbox  new_para_num  
0   (50.0, 579.3, 288.546, 598.022)             0  
1     (64.0, 564.9, 382.0, 583.622)             0  
2     (50.0, 550.5, 382.0, 569.222)             0  
3     (50.0, 536.1, 382.0, 554.822)             0  
4   (50.0, 521.7, 378.977, 540.422)             0  
5     (64.0, 507.3, 382.0, 526.022)             0  
6     (50.0, 492.9, 382.0, 511.622)             0  
7     (50.0, 478.5, 382.0, 497.222)             0  
8     (50.0, 464.1, 382.0, 482.822)             0  
9     (50.0, 449.7, 382.0, 468.422)             0  
10   (50.0, 435.3, 109.04, 454.022)             0  
11  (64.0, 420.9, 368.007, 439.622)             0  
12    (64.0, 406.5, 382.0, 425.222)             0  
13    (50.0, 392.1, 382.0, 410.822)             0  
14    (50.0, 377.7, 382.0, 396.422)             0  
15    (50.0, 363.3, 382.0, 382.022)             0  
16   (50.0, 348.9, 93.252, 367.622)             0  
17    (64.0, 334.5, 382.0, 353.222)             0  
18    (50.0, 320.1, 382.0, 338.822)             0  
19    (50.0, 305.7, 382.0, 324.422)             0  
20    (50.0, 291.3, 382.0, 310.022)             0  
21    (50.0, 276.9, 382.0, 295.622)             0  
22   (50.0, 262.5, 367.02, 281.222)             0  
23    (64.0, 248.1, 382.0, 266.822)             0  
24    (50.0, 233.7, 382.0, 252.422)             0  
25   (50.0, 219.3, 97.014, 238.022)             0  
26    (64.0, 204.9, 382.0, 223.622)             0  
27  (50.0, 190.5, 366.723, 209.222)             0  
28    (64.0, 176.1, 382.0, 194.822)             0  
29  (50.0, 161.7, 376.766, 180.422)             0  
30    (64.0, 147.3, 382.0, 166.022)             0  
31    (50.0, 132.9, 382.0, 151.622)             0  
32    (50.0, 118.5, 382.0, 137.222)             0  
33  (64.0, 104.1, 369.393, 122.822)             0  
34     (64.0, 89.7, 382.0, 108.422)             0  
35      (64.0, 75.3, 382.0, 94.022)             0  
36    (50.0, 60.9, 369.165, 79.622)             0  
37    (64.0, 46.5, 372.165, 65.222)             0  
38      (64.0, 32.1, 382.0, 50.822)             0  
39    (50.0, 17.7, 379.153, 36.422)             0  
40       (64.0, 3.3, 382.0, 22.022)             0  
41    (50.0, -11.1, 374.335, 7.622)             0  

But I want the new paragraph values to be:

para1: 2
para1: 3
para1: 4
para1: 5
para1: 6
para1: 7
para1: 8
para1: 9
para1: 10
para1: 11
para1: 12
para1: 13
para1: 14
para1: 15
para1: 16

Can you help me?

The following is my final working code:

# Assign a running paragraph number (new_para_num) to every line of main_df.
#
# Work on an explicit copy so that adding a column never writes into a slice
# of main_df (this is what triggers pandas' SettingWithCopyWarning).
parafix_df = main_df[["line_text", "line_width", "para_num", "bbox"]].copy()

# bbox of the first line that has the maximum width; its first component
# (bbox[0]) is the reference left edge that every line is compared against.
# The mask is built from parafix_df itself rather than from an undefined
# `selected` frame.
max_width = parafix_df["line_width"].max()
bbox_max_width = parafix_df.loc[parafix_df["line_width"] == max_width].iloc[0]["bbox"]

# A line opens a new paragraph when its left edge lies to the right of the
# reference left edge. The first line always belongs to paragraph 1, so it is
# never counted as an opening line.
left_edge = parafix_df["bbox"].map(lambda bbox: bbox[0])
starts_paragraph = left_edge > bbox_max_width[0]
starts_paragraph.iloc[0] = False

# Running count of opening lines, offset so that numbering starts at 1.
# Assigning by column name avoids depending on the column's position.
parafix_df["new_para_num"] = starts_paragraph.cumsum() + 1

Can this be optimized further?

Upvotes: 1

Views: 170

Answers (1)

MaxU - stand with Ukraine
MaxU - stand with Ukraine

Reputation: 210842

UPDATE:

IIUC, you can do it this way:

df.new_para_num = 1

In [210]: df.loc[df.line_width == df.line_width.max(), 'new_para_num'].cumsum() + 1
Out[210]:
2      2
3      3
6      4
7      5
8      6
9      7
13     8
14     9
15    10
18    11
19    12
20    13
21    14
24    15
31    16
32    17
Name: new_para_num, dtype: int64

If you want to update the new_para_num column in your original DataFrame conditionally:

In [223]: df.new_para_num = 1

In [224]: selected = df.loc[df.line_width == df.line_width.max()].copy()

In [226]: selected.new_para_num = selected.new_para_num.cumsum() + 1

In [227]: selected
Out[227]:
    line_width  para_num                           bbox  new_para_num
2        332.0       2.0  [50.0, 550.5, 382.0, 569.222]             2
3        332.0       2.0  [50.0, 536.1, 382.0, 554.822]             3
6        332.0       3.0  [50.0, 492.9, 382.0, 511.622]             4
7        332.0       3.0  [50.0, 478.5, 382.0, 497.222]             5
8        332.0       3.0  [50.0, 464.1, 382.0, 482.822]             6
9        332.0       3.0  [50.0, 449.7, 382.0, 468.422]             7
13       332.0       5.0  [50.0, 392.1, 382.0, 410.822]             8
14       332.0       5.0  [50.0, 377.7, 382.0, 396.422]             9
15       332.0       5.0  [50.0, 363.3, 382.0, 382.022]            10
18       332.0       6.0  [50.0, 320.1, 382.0, 338.822]            11
19       332.0       6.0  [50.0, 305.7, 382.0, 324.422]            12
20       332.0       6.0  [50.0, 291.3, 382.0, 310.022]            13
21       332.0       6.0  [50.0, 276.9, 382.0, 295.622]            14
24       332.0       7.0  [50.0, 233.7, 382.0, 252.422]            15
31       332.0      10.0  [50.0, 132.9, 382.0, 151.622]            16
32       332.0      10.0  [50.0, 118.5, 382.0, 137.222]            17

In [228]: df.loc[df.line_width == df.line_width.max(), 'new_para_num'] = selected

In [229]: df
Out[229]:
    line_width  para_num                             bbox  new_para_num
0      238.546       NaN  [50.0, 579.3, 288.546, 598.022]             1
1      318.000       1.0    [64.0, 564.9, 382.0, 583.622]             1
2      332.000       2.0    [50.0, 550.5, 382.0, 569.222]             2
3      332.000       2.0    [50.0, 536.1, 382.0, 554.822]             3
4      328.977       2.0  [50.0, 521.7, 378.977, 540.422]             1
5      318.000       3.0    [64.0, 507.3, 382.0, 526.022]             1
6      332.000       3.0    [50.0, 492.9, 382.0, 511.622]             4
7      332.000       3.0    [50.0, 478.5, 382.0, 497.222]             5
8      332.000       3.0    [50.0, 464.1, 382.0, 482.822]             6
9      332.000       3.0    [50.0, 449.7, 382.0, 468.422]             7
10      59.040       3.0   [50.0, 435.3, 109.04, 454.022]             1
11     304.007       4.0  [64.0, 420.9, 368.007, 439.622]             1
12     318.000       5.0    [64.0, 406.5, 382.0, 425.222]             1
13     332.000       5.0    [50.0, 392.1, 382.0, 410.822]             8
14     332.000       5.0    [50.0, 377.7, 382.0, 396.422]             9
15     332.000       5.0    [50.0, 363.3, 382.0, 382.022]            10
16      43.252       5.0   [50.0, 348.9, 93.252, 367.622]             1
17     318.000       6.0    [64.0, 334.5, 382.0, 353.222]             1
18     332.000       6.0    [50.0, 320.1, 382.0, 338.822]            11
19     332.000       6.0    [50.0, 305.7, 382.0, 324.422]            12
20     332.000       6.0    [50.0, 291.3, 382.0, 310.022]            13
21     332.000       6.0    [50.0, 276.9, 382.0, 295.622]            14
22     317.020       6.0   [50.0, 262.5, 367.02, 281.222]             1
23     318.000       7.0    [64.0, 248.1, 382.0, 266.822]             1
24     332.000       7.0    [50.0, 233.7, 382.0, 252.422]            15
25      47.014       7.0   [50.0, 219.3, 97.014, 238.022]             1
26     318.000       8.0    [64.0, 204.9, 382.0, 223.622]             1
27     316.723       8.0  [50.0, 190.5, 366.723, 209.222]             1
28     318.000       9.0    [64.0, 176.1, 382.0, 194.822]             1
29     326.766       9.0  [50.0, 161.7, 376.766, 180.422]             1
30     318.000      10.0    [64.0, 147.3, 382.0, 166.022]             1
31     332.000      10.0    [50.0, 132.9, 382.0, 151.622]            16
32     332.000      10.0    [50.0, 118.5, 382.0, 137.222]            17
33     305.393      11.0  [64.0, 104.1, 369.393, 122.822]             1
34     318.000      12.0     [64.0, 89.7, 382.0, 108.422]             1
35     318.000      13.0      [64.0, 75.3, 382.0, 94.022]             1
36     319.165      13.0    [50.0, 60.9, 369.165, 79.622]             1
37     308.165      14.0    [64.0, 46.5, 372.165, 65.222]             1
38     318.000      15.0      [64.0, 32.1, 382.0, 50.822]             1
39     329.153      15.0    [50.0, 17.7, 379.153, 36.422]             1
40     318.000      16.0       [64.0, 3.3, 382.0, 22.022]             1
41     324.335      16.0    [50.0, -11.1, 374.335, 7.622]             1

PS: I'm still not sure that I understood your goals correctly.

OLD answer:

You can use the shift function to access the previous and next rows:

df.shift(-1)  # df will be shifted one row backwards (will show `next` row) 

df.shift(1)  # df will be shifted one row forwards (will show `prev` row)

Example:

In [142]: df
Out[142]:
   a  b  c
0  8  3  0
1  8  3  4
2  9  4  1
3  2  1  8
4  5  6  3

In [147]: df['prev_a'] = df.a.shift(1)

In [148]: df['next_a'] = df.a.shift(-1)

In [149]: df
Out[149]:
   a  b  c  prev_a  next_a
0  8  3  0     NaN     8.0
1  8  3  4     8.0     9.0
2  9  4  1     8.0     2.0
3  2  1  8     9.0     5.0
4  5  6  3     2.0     NaN

Upvotes: 1

Related Questions