Reputation: 10223
How do I add values to the rows of a DataFrame while iterating over it?
Input:
>>> parafix_df = main_df[["line_width", "para_num", "bbox" ]]
>>> parafix_df
line_width para_num bbox
0 238.546 NaN (50.0, 579.3, 288.546, 598.022)
1 318 1 (64.0, 564.9, 382.0, 583.622)
2 332 2 (50.0, 550.5, 382.0, 569.222)
3 332 2 (50.0, 536.1, 382.0, 554.822)
4 328.977 2 (50.0, 521.7, 378.977, 540.422)
5 318 3 (64.0, 507.3, 382.0, 526.022)
6 332 3 (50.0, 492.9, 382.0, 511.622)
7 332 3 (50.0, 478.5, 382.0, 497.222)
8 332 3 (50.0, 464.1, 382.0, 482.822)
9 332 3 (50.0, 449.7, 382.0, 468.422)
10 59.04 3 (50.0, 435.3, 109.04, 454.022)
11 304.007 4 (64.0, 420.9, 368.007, 439.622)
12 318 5 (64.0, 406.5, 382.0, 425.222)
13 332 5 (50.0, 392.1, 382.0, 410.822)
14 332 5 (50.0, 377.7, 382.0, 396.422)
15 332 5 (50.0, 363.3, 382.0, 382.022)
16 43.252 5 (50.0, 348.9, 93.252, 367.622)
17 318 6 (64.0, 334.5, 382.0, 353.222)
18 332 6 (50.0, 320.1, 382.0, 338.822)
19 332 6 (50.0, 305.7, 382.0, 324.422)
20 332 6 (50.0, 291.3, 382.0, 310.022)
21 332 6 (50.0, 276.9, 382.0, 295.622)
22 317.02 6 (50.0, 262.5, 367.02, 281.222)
23 318 7 (64.0, 248.1, 382.0, 266.822)
24 332 7 (50.0, 233.7, 382.0, 252.422)
25 47.014 7 (50.0, 219.3, 97.014, 238.022)
26 318 8 (64.0, 204.9, 382.0, 223.622)
27 316.723 8 (50.0, 190.5, 366.723, 209.222)
28 318 9 (64.0, 176.1, 382.0, 194.822)
29 326.766 9 (50.0, 161.7, 376.766, 180.422)
30 318 10 (64.0, 147.3, 382.0, 166.022)
31 332 10 (50.0, 132.9, 382.0, 151.622)
32 332 10 (50.0, 118.5, 382.0, 137.222)
33 305.393 11 (64.0, 104.1, 369.393, 122.822)
34 318 12 (64.0, 89.7, 382.0, 108.422)
35 318 13 (64.0, 75.3, 382.0, 94.022)
36 319.165 13 (50.0, 60.9, 369.165, 79.622)
37 308.165 14 (64.0, 46.5, 372.165, 65.222)
38 318 15 (64.0, 32.1, 382.0, 50.822)
39 329.153 15 (50.0, 17.7, 379.153, 36.422)
40 318 16 (64.0, 3.3, 382.0, 22.022)
41 324.335 16 (50.0, -11.1, 374.335, 7.622)
Code:
# First attempt: walk the rows and bump the counter `para1` each time a line's
# bbox[0] is greater than the bbox[0] of the widest line, then store the
# counter in the new column `new_para_num`.
# NOTE(review): the indentation of this listing was lost, so the nesting of
# the statements inside the loop cannot be read off the text.
parafix_df = main_df[["line_text", "line_width", "para_num", "bbox" ]]
# Every row starts with new_para_num == 0.
parafix_df["new_para_num"] = 0
max_width = parafix_df['line_width'].max()
# bbox of the first row whose line_width equals the maximum.
# NOTE(review): `selected` is not defined anywhere in this snippet; presumably
# `parafix_df` was meant -- confirm.
bbox_max_width = parafix_df.loc[selected['line_width'] == max_width].iloc[0]["bbox"]
previous = None
para1 = 1
# Pairs each (index, row) tuple with the tuple of the following row.
# NOTE(review): `izip` is presumably itertools.izip (Python 2) -- confirm.
for current, next in izip(parafix_df.iterrows(), parafix_df.iloc[1:].iterrows()):
if previous==None:
# current[1] is the row object yielded by iterrows(). The output listed after
# this code still shows new_para_num == 0 in every row, i.e. writes made
# through current[1] do not reach parafix_df.
current[1]["new_para_num"] = para1
else:
bbox_current = current[1]["bbox"]
# bbox_next and bbox_previous are assigned but never read in the visible code.
bbox_next = next[1]["bbox"]
bbox_previous = previous[1]["bbox"]
# A bbox[0] greater than that of the widest line starts a new paragraph number.
if bbox_current[0]>bbox_max_width[0]:
para1 += 1
print "para1:", para1
current[1]["new_para_num"] = para1
previous = current
Output of above code:
bbox new_para_num
0 (50.0, 579.3, 288.546, 598.022) 0
1 (64.0, 564.9, 382.0, 583.622) 0
2 (50.0, 550.5, 382.0, 569.222) 0
3 (50.0, 536.1, 382.0, 554.822) 0
4 (50.0, 521.7, 378.977, 540.422) 0
5 (64.0, 507.3, 382.0, 526.022) 0
6 (50.0, 492.9, 382.0, 511.622) 0
7 (50.0, 478.5, 382.0, 497.222) 0
8 (50.0, 464.1, 382.0, 482.822) 0
9 (50.0, 449.7, 382.0, 468.422) 0
10 (50.0, 435.3, 109.04, 454.022) 0
11 (64.0, 420.9, 368.007, 439.622) 0
12 (64.0, 406.5, 382.0, 425.222) 0
13 (50.0, 392.1, 382.0, 410.822) 0
14 (50.0, 377.7, 382.0, 396.422) 0
15 (50.0, 363.3, 382.0, 382.022) 0
16 (50.0, 348.9, 93.252, 367.622) 0
17 (64.0, 334.5, 382.0, 353.222) 0
18 (50.0, 320.1, 382.0, 338.822) 0
19 (50.0, 305.7, 382.0, 324.422) 0
20 (50.0, 291.3, 382.0, 310.022) 0
21 (50.0, 276.9, 382.0, 295.622) 0
22 (50.0, 262.5, 367.02, 281.222) 0
23 (64.0, 248.1, 382.0, 266.822) 0
24 (50.0, 233.7, 382.0, 252.422) 0
25 (50.0, 219.3, 97.014, 238.022) 0
26 (64.0, 204.9, 382.0, 223.622) 0
27 (50.0, 190.5, 366.723, 209.222) 0
28 (64.0, 176.1, 382.0, 194.822) 0
29 (50.0, 161.7, 376.766, 180.422) 0
30 (64.0, 147.3, 382.0, 166.022) 0
31 (50.0, 132.9, 382.0, 151.622) 0
32 (50.0, 118.5, 382.0, 137.222) 0
33 (64.0, 104.1, 369.393, 122.822) 0
34 (64.0, 89.7, 382.0, 108.422) 0
35 (64.0, 75.3, 382.0, 94.022) 0
36 (50.0, 60.9, 369.165, 79.622) 0
37 (64.0, 46.5, 372.165, 65.222) 0
38 (64.0, 32.1, 382.0, 50.822) 0
39 (50.0, 17.7, 379.153, 36.422) 0
40 (64.0, 3.3, 382.0, 22.022) 0
41 (50.0, -11.1, 374.335, 7.622) 0
But I want the new paragraph numbers (`new_para_num`) to take these values:
para1: 2
para1: 3
para1: 4
para1: 5
para1: 6
para1: 7
para1: 8
para1: 9
para1: 10
para1: 11
para1: 12
para1: 13
para1: 14
para1: 15
para1: 16
Can you help me?
Following is my final working code:
# Final version: same counting rule as before, but the counter is written back
# through parafix_df.iloc instead of through the row yielded by iterrows().
# NOTE(review): the indentation of this listing was lost; in particular it is
# not visible whether the last line sits inside `if indx!=0` or at loop level.
parafix_df = main_df[["line_text", "line_width", "para_num", "bbox" ]]
# Every row starts with new_para_num == 0.
parafix_df["new_para_num"] = 0
max_width = parafix_df['line_width'].max()
# bbox of the first row whose line_width equals the maximum.
# NOTE(review): `selected` is not defined anywhere in this snippet; presumably
# `parafix_df` was meant -- confirm.
bbox_max_width = parafix_df.loc[selected['line_width'] == max_width].iloc[0]["bbox"]
para1 = 1
# indx is the 0-based position of the row; current is the (index, row) tuple.
for indx, current in enumerate(parafix_df.iterrows(), start=0):
# The first row never increments the counter.
if indx!=0:
bbox_current = current[1]["bbox"]
# A bbox[0] greater than that of the widest line starts a new paragraph number.
if bbox_current[0]>bbox_max_width[0]:
para1 += 1
# Column position 4 is "new_para_num": it was added after the four selected
# columns (positions 0-3).
parafix_df.iloc[indx, 4] = para1
Can this be optimized further?
Upvotes: 1
Views: 170
Reputation: 210842
UPDATE:
IIUC, you can do it this way:
df.new_para_num = 1
In [210]: df.loc[df.line_width == df.line_width.max(), 'new_para_num'].cumsum() + 1
Out[210]:
2 2
3 3
6 4
7 5
8 6
9 7
13 8
14 9
15 10
18 11
19 12
20 13
21 14
24 15
31 16
32 17
Name: new_para_num, dtype: int64
If you want to conditionally update the new_para_num
column in your original DataFrame:
In [223]: df.new_para_num = 1
In [224]: selected = df.loc[df.line_width == df.line_width.max()].copy()
In [226]: selected.new_para_num = selected.new_para_num.cumsum() + 1
In [227]: selected
Out[227]:
line_width para_num bbox new_para_num
2 332.0 2.0 [50.0, 550.5, 382.0, 569.222] 2
3 332.0 2.0 [50.0, 536.1, 382.0, 554.822] 3
6 332.0 3.0 [50.0, 492.9, 382.0, 511.622] 4
7 332.0 3.0 [50.0, 478.5, 382.0, 497.222] 5
8 332.0 3.0 [50.0, 464.1, 382.0, 482.822] 6
9 332.0 3.0 [50.0, 449.7, 382.0, 468.422] 7
13 332.0 5.0 [50.0, 392.1, 382.0, 410.822] 8
14 332.0 5.0 [50.0, 377.7, 382.0, 396.422] 9
15 332.0 5.0 [50.0, 363.3, 382.0, 382.022] 10
18 332.0 6.0 [50.0, 320.1, 382.0, 338.822] 11
19 332.0 6.0 [50.0, 305.7, 382.0, 324.422] 12
20 332.0 6.0 [50.0, 291.3, 382.0, 310.022] 13
21 332.0 6.0 [50.0, 276.9, 382.0, 295.622] 14
24 332.0 7.0 [50.0, 233.7, 382.0, 252.422] 15
31 332.0 10.0 [50.0, 132.9, 382.0, 151.622] 16
32 332.0 10.0 [50.0, 118.5, 382.0, 137.222] 17
In [228]: df.loc[df.line_width == df.line_width.max(), 'new_para_num'] = selected
In [229]: df
Out[229]:
line_width para_num bbox new_para_num
0 238.546 NaN [50.0, 579.3, 288.546, 598.022] 1
1 318.000 1.0 [64.0, 564.9, 382.0, 583.622] 1
2 332.000 2.0 [50.0, 550.5, 382.0, 569.222] 2
3 332.000 2.0 [50.0, 536.1, 382.0, 554.822] 3
4 328.977 2.0 [50.0, 521.7, 378.977, 540.422] 1
5 318.000 3.0 [64.0, 507.3, 382.0, 526.022] 1
6 332.000 3.0 [50.0, 492.9, 382.0, 511.622] 4
7 332.000 3.0 [50.0, 478.5, 382.0, 497.222] 5
8 332.000 3.0 [50.0, 464.1, 382.0, 482.822] 6
9 332.000 3.0 [50.0, 449.7, 382.0, 468.422] 7
10 59.040 3.0 [50.0, 435.3, 109.04, 454.022] 1
11 304.007 4.0 [64.0, 420.9, 368.007, 439.622] 1
12 318.000 5.0 [64.0, 406.5, 382.0, 425.222] 1
13 332.000 5.0 [50.0, 392.1, 382.0, 410.822] 8
14 332.000 5.0 [50.0, 377.7, 382.0, 396.422] 9
15 332.000 5.0 [50.0, 363.3, 382.0, 382.022] 10
16 43.252 5.0 [50.0, 348.9, 93.252, 367.622] 1
17 318.000 6.0 [64.0, 334.5, 382.0, 353.222] 1
18 332.000 6.0 [50.0, 320.1, 382.0, 338.822] 11
19 332.000 6.0 [50.0, 305.7, 382.0, 324.422] 12
20 332.000 6.0 [50.0, 291.3, 382.0, 310.022] 13
21 332.000 6.0 [50.0, 276.9, 382.0, 295.622] 14
22 317.020 6.0 [50.0, 262.5, 367.02, 281.222] 1
23 318.000 7.0 [64.0, 248.1, 382.0, 266.822] 1
24 332.000 7.0 [50.0, 233.7, 382.0, 252.422] 15
25 47.014 7.0 [50.0, 219.3, 97.014, 238.022] 1
26 318.000 8.0 [64.0, 204.9, 382.0, 223.622] 1
27 316.723 8.0 [50.0, 190.5, 366.723, 209.222] 1
28 318.000 9.0 [64.0, 176.1, 382.0, 194.822] 1
29 326.766 9.0 [50.0, 161.7, 376.766, 180.422] 1
30 318.000 10.0 [64.0, 147.3, 382.0, 166.022] 1
31 332.000 10.0 [50.0, 132.9, 382.0, 151.622] 16
32 332.000 10.0 [50.0, 118.5, 382.0, 137.222] 17
33 305.393 11.0 [64.0, 104.1, 369.393, 122.822] 1
34 318.000 12.0 [64.0, 89.7, 382.0, 108.422] 1
35 318.000 13.0 [64.0, 75.3, 382.0, 94.022] 1
36 319.165 13.0 [50.0, 60.9, 369.165, 79.622] 1
37 308.165 14.0 [64.0, 46.5, 372.165, 65.222] 1
38 318.000 15.0 [64.0, 32.1, 382.0, 50.822] 1
39 329.153 15.0 [50.0, 17.7, 379.153, 36.422] 1
40 318.000 16.0 [64.0, 3.3, 382.0, 22.022] 1
41 324.335 16.0 [50.0, -11.1, 374.335, 7.622] 1
PS: I'm still not sure that I understood your goal correctly.
OLD answer:
You can use the `shift` function to access the previous and next rows:
df.shift(-1) # df will be shifted one row backwards (will show `next` row)
df.shift(1) # df will be shifted one row forwards (will show `prev` row)
Example:
In [142]: df
Out[142]:
a b c
0 8 3 0
1 8 3 4
2 9 4 1
3 2 1 8
4 5 6 3
In [147]: df['prev_a'] = df.a.shift(1)
In [148]: df['next_a'] = df.a.shift(-1)
In [149]: df
Out[149]:
a b c prev_a next_a
0 8 3 0 NaN 8.0
1 8 3 4 8.0 9.0
2 9 4 1 8.0 2.0
3 2 1 8 9.0 5.0
4 5 6 3 2.0 NaN
Upvotes: 1