get the index of previous occurrence of a value in a pandas Series

Question

Let's say I have a dataframe

>> df = pd.DataFrame({'code': ['A', 'A', 'B', 'B', 'C', 'C', 'C', 'B', 'A', 'C', 'C', 'B' ]})
    code
0   A
1   A
2   B
3   B
4   C
5   C
6   C
7   B
8   A
9   C
10  C
11  B

I want to create another column which gives the items in code before the next consecutive occurrence of the same value, as well as the target index of the previous occurrence. For example:

    code target prev_idx
0   A [] NaN
1   A [] 0
2   B [] NaN
3   B [] 2
4   C [] NaN
5   C [] 4
6   C [] 5
7   B [C, C, C] 3
8   A [B, B, C, C, C, B] 1
9   C [B, A] 6
10  C [] 9
11  B [A, C, C] 7

What's the fastest way to go about this?

irene · Accepted Answer

Found another way. It's faster than the answer posted by @tozCSS for my actual application (thousands of rows), probably because we don't search the entire dataframe every time (target = (df.code[:row.name]==row.code)). As a bonus, it also works even if the indexing is not from 0 to len(df).

def get_target(row, data, col):
    if not row['first_of_seq']:
        return []
    else:
        return data.iloc[data.index.get_loc(row['prev_idx'] + 1): data.index.get_loc(row.name)][col].tolist()

def get_elements_in_between(data, col):
    data = data.copy()

    # get the index of the previous occurrence of the item
    old_idx_name = data.index.name
    data = data.rename_axis('dummy_idx').reset_index()
    data['prev_idx'] = data.groupby(col)['dummy_idx'].shift()

    # check if the item is the first of consecutive occurrences or not
    data['first_of_seq'] = ((data['prev_idx'] - data['dummy_idx']) != -1) & (~data['prev_idx'].isna())

    # convert floats to integers (and NaN to )
    data['prev_idx'] = data['prev_idx'].astype('Int64')

    # revert to the original indexing
    data = data.set_index('dummy_idx')
    data = data.rename_axis(old_idx_name)

    # get the targets using the target indices
    data['target'] = data.apply(get_target, args=(data, col), axis=1)
    
    data.drop(columns=['first_of_seq'], inplace=True)
    
    return data

We get:

>> get_elements_in_between(df, 'code')
    code    prev_idx    target
0   A       []
1   A   0   []
2   B       []
3   B   2   []
4   C       []
5   C   4   []
6   C   5   []
7   B   3   [C, C, C]
8   A   1   [B, B, C, C, C, B]
9   C   6   [B, A]
10  C   9   []
11  B   7   [A, C, C]

get the index of previous occurrence of a value in a pandas Series

Answers (2)

Related Questions

	code	target	prev_idx
0	A	[]	-1
1	A	[]	0
2	B	[]	1
3	B	[]	2
4	C	[]	3
5	C	[]	4
6	C	[]	5
7	B	['C', 'C', 'C']	3
8	A	['B', 'B', 'C', 'C', 'C', 'B']	1
9	C	['B', 'A']	6
10	C	[]	9
11	B	['A', 'C', 'C']	7