Oprah W.
Oprah W.

Reputation: 75

How to aggregate two rows with the same value in another column?

I have a dataframe like this:

name cluster
'sock' 1
'graceful' 2
'disgrace' 2
'fixture' 3
'winnow' 4
'window' 4

and a function common_substring that takes in two strings and returns the longest common substring.

I want to return

name cluster
'sock' 1
'grace' 2
'fixture' 3
'win' 4

I know I'm supposed to group by cluster, but I'm struggling with how to aggregate and apply a function that takes two values.

Upvotes: 0

Views: 969

Answers (2)

jezrael
jezrael

Reputation: 862761

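Use GroupBy.agg with a custom aggregate function: it receives all of a group's values in the column, converts them to a list, and returns their longest common substring. A group that contains a single value is returned unchanged: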

#https://stackoverflow.com/questions/40556491
from functools import partial, reduce
from itertools import chain
from typing import Iterator

def ngram(seq: str, n: int) -> Iterator[str]:
    """Lazily produce every contiguous substring of ``seq`` of length ``n``.

    Substrings are produced in order of their start position. Nothing is
    produced when ``n`` exceeds ``len(seq)``; ``n == 0`` produces
    ``len(seq) + 1`` empty strings.
    """
    last_start = len(seq) - n
    return (seq[start:start + n] for start in range(last_start + 1))

def allngram(seq: str) -> set:
    """Collect every distinct substring of ``seq`` into a set.

    Lengths 0 through ``len(seq)`` inclusive are covered, so the result
    contains both the empty string and ``seq`` itself.
    """
    per_length = (ngram(seq, size) for size in range(len(seq) + 1))
    return set(chain.from_iterable(per_length))

def func(x):
    """Return the longest substring shared by every string in ``x``.

    ``x`` must offer ``tolist()`` (it is used here as a ``GroupBy.agg``
    callback, so presumably a pandas Series of names). Each name is expanded
    into its full substring set, the sets are intersected, and the longest
    surviving element is returned. When several candidates tie for longest,
    the winner is whichever one ``max`` meets first while iterating the set.
    """
    names = x.tolist()
    shared = reduce(set.intersection, (allngram(name) for name in names))
    return max(shared, key=len)

# Drop the literal single quotes wrapped around each name.
df['name'] = df['name'].str.strip("'")

# Collapse every cluster to one row holding its longest common substring.
df = (
    df.groupby('cluster')['name']
    .agg(func)
    .reset_index()
)
print(df)
   cluster     name
0        1     sock
1        2    grace
2        3  fixture
3        4      win

Upvotes: 1

keramat
keramat

Reputation: 4543

Based on @jezrael's answer and the linked question:

from functools import partial, reduce
from itertools import chain
from typing import Iterator


def ngram(seq: str, n: int) -> Iterator[str]:
    """Return a generator over the length-``n`` windows of ``seq``.

    Windows are produced by sliding one character at a time from the start
    of ``seq``; there are ``len(seq) - n + 1`` of them, or none when that
    count is not positive.
    """
    stop = len(seq) - n + 1
    windows = (slice(begin, begin + n) for begin in range(stop))
    return (seq[window] for window in windows)


def allngram(seq: str) -> set:
    """Return the set of every substring of ``seq``, lengths 0 to ``len(seq)``.

    The length range has to run through ``len(seq)`` inclusive. Stopping one
    short leaves ``seq`` itself out of the set, so two identical names (or a
    name wholly contained in another) lose their true longest common
    substring, and an empty ``seq`` produces an empty set.
    """
    lengths = range(len(seq) + 1)  # +1 so the full-length n-gram is included
    ngrams = map(partial(ngram, seq), lengths)
    return set(chain.from_iterable(ngrams))


def func(a, b):
    """Return the longest substring that ``a`` and ``b`` have in common.

    Both strings are expanded with ``allngram`` and the two sets are
    intersected; the longest element of the intersection is returned. Ties
    go to whichever candidate ``max`` meets first while iterating the set,
    and ``max`` raises ``ValueError`` if the intersection is empty.
    """
    shared = reduce(set.intersection, (allngram(text) for text in (a, b)))
    return max(shared, key=len)

def f(x):
    """Aggregate one group's names into their longest common substring.

    ``x`` is the group's values (it must support ``len`` and positional
    ``.iat`` access, as a pandas Series does). Only the first two entries
    are compared.

    Returns ``func`` applied to the first two names, or the placeholder
    string ``'Unique value'`` when the group holds fewer than two names.
    Errors raised inside ``func`` propagate to the caller.
    """
    # Explicit size check instead of a bare ``except:``: a catch-all would
    # also disguise genuine failures in ``func`` (and KeyboardInterrupt) as
    # the 'Unique value' placeholder.
    if len(x) < 2:
        return 'Unique value'
    return func(x.iat[0], x.iat[1])

# Reduce each cluster to a single row via ``f`` and display the result.
df = (
    df.groupby('cluster')['name']
    .agg(f)
    .reset_index()
)
print(df)

Demonstration:

# Two names that share cluster 4, aggregated with ``f``.
df = pd.DataFrame({
    'name': ['winnow', 'window'],
    'cluster': [4, 4],
})
df = (
    df.groupby('cluster')['name']
    .agg(f)
    .reset_index()
)

output:

enter image description here

For input of any length:

from functools import partial, reduce
from itertools import chain
from typing import Iterator


def ngram(seq: str, n: int) -> Iterator[str]:
    """Generate the substrings of ``seq`` that are exactly ``n`` characters long.

    The substrings are taken at consecutive start offsets, left to right.
    If ``n`` is larger than ``len(seq)`` the generator is empty.
    """
    total = len(seq)
    return (seq[left:left + n] for left in range(total - n + 1))


def allngram(seq: str) -> set:
    """Build the set of all distinct substrings of ``seq``.

    Every length from 0 through ``len(seq)`` is included, which means the
    empty string and ``seq`` itself are always members.
    """
    return {
        piece
        for size in range(len(seq) + 1)
        for piece in ngram(seq, size)
    }


def func(seq):
    """Return the longest substring common to every string in ``seq``.

    ``seq`` must expose a ``.values`` attribute that iterates over the
    strings (it is used here as a ``GroupBy.agg`` callback, so presumably a
    pandas Series). Each string's substring set is intersected with the
    rest, and the longest remaining element is returned. Ties go to
    whichever candidate ``max`` meets first while iterating the set.
    """
    names = seq.values
    common = reduce(set.intersection, (allngram(name) for name in names))
    return max(common, key=len)

def f(x):
    """Aggregation callback: hand the group's values straight to ``func``."""
    longest_common = func(x)
    return longest_common
# Sample frame: cluster 4 holds three names, cluster 2 holds two, and
# clusters 1 and 3 hold one each.
df = pd.DataFrame({
    'name': ['winnow', 'window', 'win', 'sock', 'graceful', 'disgrace', 'fixture'],
    'cluster': [4, 4, 4, 1, 2, 2, 3],
})

# Collapse each cluster to a single row using ``f``.
df = (
    df.groupby('cluster')['name']
    .agg(f)
    .reset_index()
)

Output:

enter image description here

Upvotes: 1

Related Questions