Reputation: 59
Traceback (most recent call last):
File "Users", line 50, in <module>
length = len_c / (len_a_b - len_c)
File "\venv\lib\site-packages\pandas\core\ops\common.py", line 65, in new_method
return method(self, other)
File "\venv\lib\site-packages\pandas\core\arraylike.py", line 97, in __sub__
return self._arith_method(other, operator.sub)
File "\venv\lib\site-packages\pandas\core\series.py", line 4994, in _arith_method
self, other = ops.align_method_SERIES(self, other)
File "\venv\lib\site-packages\pandas\core\ops\__init__.py", line 147, in align_method_SERIES
left, right = left.align(right, copy=False)
File "\lib\site-packages\pandas\core\series.py", line 4220, in align
return super().align(
File "\venv\lib\site-packages\pandas\core\generic.py", line 8825, in align
return self._align_series(
File "\venv\lib\site-packages\pandas\core\generic.py", line 8934, in _align_series
join_index, lidx, ridx = self.index.join(
File "\venv\lib\site-packages\pandas\core\indexes\range.py", line 690, in join
return self._int64index.join(other, how, level, return_indexers, sort)
File "\venv\lib\site-packages\pandas\core\indexes\base.py", line 3669, in join
return this.join(other, how=how, return_indexers=return_indexers)
File "\venv\lib\site-packages\pandas\core\indexes\base.py", line 3679, in join
return self._join_monotonic(
File "\venv\lib\site-packages\pandas\core\indexes\base.py", line 4014, in _join_monotonic
join_index, lidx, ridx = self._outer_indexer(sv, ov)
File "\venv\lib\site-packages\pandas\core\indexes\base.py", line 219, in _outer_indexer
return libjoin.outer_join_indexer(left, right)
File "pandas\_libs\join.pyx", line 556, in pandas._libs.join.outer_join_indexer
TypeError: '<' not supported between instances of 'int' and 'str'
Process finished with exit code 1
The problem is in the line starting with dict1=
# Build per-row token sets and an intersection-based 'length' score.
# NOTE(review): the printed samples look like Series of token lists, so
# apply(set) presumably turns each row's list into a set -- confirm.
b = df2.apply(set)
a = df1.apply(set)
#print('a', a.columns)
# For every s in a, intersect s with each entry of b; the per-s results are
# concatenated side by side (axis=1).
c = pd.concat([b.apply(lambda x : s.intersection(x)) for s in a], axis=1)
# len(a) is the number of entries in a, not the size of an individual set.
# NOTE(review): confirm this is what is intended.
len_a_b = b.apply(lambda x : len(x) + len(a))
# NOTE(review): c was built with axis=1; if it is a DataFrame, apply runs per
# column here, so len_c would be labelled by c's column labels rather than by
# b's row index -- verify.
len_c = c.apply(lambda x : len(x))
# The arithmetic below aligns len_c and len_a_b on their index labels; the
# TypeError in the traceback is raised while joining those two indexes.
dict1 = {'length' : len_c / (len_a_b - len_c) , 'b' : b , 'c' : c}
This is what the dataframes look like:
0 [Tom, eats, pineapple]
1 [Tom, eats, pineapple]
2 [Eva, eats, apple]
3 [Eva, eats, pineapple]
Name: all, dtype: object
0 [Tom, eats, pineapple]
1 [Tom, eats, pineapple]
2 [Eva, eats, apple]
3 [Eva, eats, pineapple]
Name: sentence, dtype: object
print(len_c): Length: 550, dtype: int64
print(len_a_b): Length: 6646, dtype: int64
As you can see, after the tokenization we have 100% integers here, and still Python says we don't. The same function worked with the data when it wasn't two whole dataframes.
Upvotes: 0
Views: 266
Reputation: 24304
Instead of this:
# Original line from the question: the result keeps whatever labels apply()
# assigns from c.
len_c = c.apply(lambda x : len(x))
Use this:
# reset_index(drop=True) discards the existing labels and replaces them with a
# default integer index, so the later arithmetic no longer has to join
# labels of mixed int/str type.
len_c =c.apply(lambda x : len(x)).reset_index(drop=True)
Finally:
# NOTE(review): the question reports len_c with 550 entries and len_a_b with
# 6646; label-aligned arithmetic yields NaN wherever a label is present in
# only one operand -- verify the resulting 'length' values.
dict1 = {'length' : len_c / (len_a_b - len_c) , 'b' : b , 'c' : c}
Upvotes: 1