Reputation: 75
I have a dataframe like this:
name | cluster |
---|---|
'sock' | 1 |
'graceful' | 2 |
'disgrace' | 2 |
'fixture' | 3 |
'winnow' | 4 |
'window' | 4 |
and a function common_substring
that takes in two strings and returns the longest common substring.
I want to return
name | cluster |
---|---|
'sock' | 1 |
'grace' | 2 |
'fixture' | 3 |
'win' | 4 |
I know I'm supposed to group by cluster, but I'm struggling with how to aggregate and apply a function that takes two values.
Upvotes: 0
Views: 969
Reputation: 862761
Use GroupBy.agg
with a custom aggregate function, which is needed when the length of a group is not 1,
and pass all of the group's values in the column to it as a list:
#https://stackoverflow.com/questions/40556491
from functools import partial, reduce
from itertools import chain
from typing import Iterator
def ngram(seq: str, n: int) -> Iterator[str]:
    """Return a lazy iterator over the length-``n`` slices of ``seq``.

    Slices are produced left to right, one per start offset from 0 up to
    ``len(seq) - n``; repeated slices are not removed. Nothing is produced
    when ``n`` is larger than ``len(seq)``.
    """
    start_count = len(seq) - n + 1
    return (seq[start:start + n] for start in range(start_count))
def allngram(seq: str) -> set:
    """Collect every distinct substring of ``seq`` into a set.

    All lengths from 0 up to ``len(seq)`` are covered, so the result
    contains both the empty string and ``seq`` itself.
    """
    substrings = set()
    for size in range(len(seq) + 1):
        # ngram yields the slices of exactly this length.
        substrings.update(ngram(seq, size))
    return substrings
def func(x):
    """Return the longest substring shared by all strings in ``x``.

    Args:
        x: object exposing ``tolist()`` that returns the strings to compare
            (here: the ``name`` values of one group passed by ``GroupBy.agg``).

    Returns:
        The longest common substring. When several substrings tie for the
        longest length, the lexicographically smallest one is returned so
        the result is the same on every run.
    """
    seqs_ngrams = map(allngram, x.tolist())
    intersection = reduce(set.intersection, seqs_ngrams)
    # Set iteration order for strings changes between interpreter runs
    # (hash randomization), and max() keeps the first maximal item it sees.
    # Sorting first pins down which of several equally long candidates wins.
    return max(sorted(intersection), key=len)
# Strip the literal single quotes that surround each name so they do not
# take part in the substring comparison.
df['name'] = df['name'].str.strip("'")
# Collapse each cluster to one row: func receives all names of the cluster
# and its return value becomes that cluster's 'name'; reset_index turns
# 'cluster' back into a regular column.
df = df.groupby('cluster')['name'].agg(func).reset_index()
print (df)
cluster name
0 1 sock
1 2 grace
2 3 fixture
3 4 win
Upvotes: 1
Reputation: 4543
Based on @jezrael's answer and the linked question:
from functools import partial, reduce
from itertools import chain
from typing import Iterator
def ngram(seq: str, n: int) -> Iterator[str]:
    """Lazily produce each contiguous length-``n`` piece of ``seq``.

    Pieces come out in order of their start position; the last start
    position is ``len(seq) - n``. Duplicates are kept.
    """
    offsets = range(0, len(seq) - n + 1)
    return (seq[offset:offset + n] for offset in offsets)
def allngram(seq: str) -> set:
    """Return the set of all distinct substrings of ``seq``.

    The result includes the empty string (length 0) and ``seq`` itself
    (length ``len(seq)``).
    """
    # The upper bound must be len(seq) + 1: stopping at len(seq) leaves out
    # the full-length n-gram, so the whole string could never be reported as
    # a common substring (e.g. for identical names, or a name contained in
    # another), and an empty string would produce an empty set.
    lengths = range(len(seq) + 1)
    ngrams = map(partial(ngram, seq), lengths)
    return set(chain.from_iterable(ngrams))
def func(a, b):
    """Return the longest substring that occurs in both ``a`` and ``b``.

    Args:
        a: first string.
        b: second string.

    Returns:
        The longest common substring, or ``''`` when the two strings have
        no substring in common. Among several equally long candidates the
        lexicographically smallest one is returned, so repeated runs agree.
    """
    common = allngram(a) & allngram(b)
    # max() keeps the first maximal item and set order for strings varies
    # between interpreter runs; sorting first makes ties deterministic.
    # default='' avoids a ValueError if the intersection is empty.
    return max(sorted(common), key=len, default='')
def f(x):
    """Aggregate one group by comparing its first two values with ``func``.

    Args:
        x: the group's values; must support positional access via ``.iat``.

    Returns:
        ``func(x.iat[0], x.iat[1])``, or the string ``'Unique value'`` when
        the group has fewer than two rows. Rows beyond the second are not
        looked at.
    """
    try:
        first, second = x.iat[0], x.iat[1]
    except IndexError:
        # Fewer than two rows in the group: there is nothing to compare.
        # Only this case is caught, so genuine failures inside func are no
        # longer silently reported as 'Unique value'.
        return 'Unique value'
    return func(first, second)
# Aggregate each cluster's names with f, then turn the 'cluster' index back
# into a regular column.
df = df.groupby('cluster')['name'].agg(f).reset_index()
print (df)
Demonstration:
# Two-row example: both names are in cluster 4, so f is called once with
# that pair.
# NOTE: assumes pandas has already been imported as pd.
df = pd.DataFrame({'name':['winnow', 'window'], 'cluster':[4,4]})
df = df.groupby('cluster')['name'].agg(f).reset_index()
output:
For input groups of any length:
from functools import partial, reduce
from itertools import chain
from typing import Iterator
def ngram(seq: str, n: int) -> Iterator[str]:
    """Iterate lazily over the windows of width ``n`` in ``seq``.

    One window is produced per start index, from index 0 through
    ``len(seq) - n``, in that order. Identical windows are not merged.
    """
    window_count = len(seq) - n + 1
    return (seq[left:left + n] for left in range(window_count))
def allngram(seq: str) -> set:
    """Build the set of every distinct substring of ``seq``.

    Lengths 0 through ``len(seq)`` are all included, so the set always
    holds the empty string and the complete ``seq``.
    """
    per_length = (ngram(seq, width) for width in range(len(seq) + 1))
    # Merge the n-grams of every length into one de-duplicated set.
    return set().union(*per_length)
def func(seq):
    """Return the longest substring shared by all strings in ``seq``.

    Args:
        seq: object exposing ``.values`` that holds the strings to compare
            (here: the ``name`` values of one group passed by ``GroupBy.agg``).

    Returns:
        The longest common substring. When several substrings tie for the
        longest length, the lexicographically smallest one is returned so
        the result is the same on every run.
    """
    # Use a separate local name instead of rebinding the parameter.
    names = seq.values
    seqs_ngrams = map(allngram, names)
    common = reduce(set.intersection, seqs_ngrams)
    # Set iteration order for strings changes between interpreter runs
    # (hash randomization), and max() keeps the first maximal item it sees.
    # Sorting first pins down which of several equally long candidates wins.
    return max(sorted(common), key=len)
def f(x):
    """Aggregation callback: forward the group's values ``x`` to ``func`` unchanged and return its result."""
    return func(x)
# Example with groups of different sizes: cluster 4 has three names,
# cluster 2 has two, clusters 1 and 3 have one each.
# NOTE: assumes pandas has already been imported as pd.
df = pd.DataFrame({'name':['winnow', 'window', 'win', 'sock', 'graceful', 'disgrace','fixture'], 'cluster':[4,4,4,1,2,2,3]})
# Reduce every cluster to a single row whose 'name' is whatever f returns.
df = df.groupby('cluster')['name'].agg(f).reset_index()
Output:
Upvotes: 1