lokalhangatt
lokalhangatt

Reputation: 79

(ERROR) Select one object and all float & int in pandas groupby

I have this dataframe.

import pandas as pd

# Sample records keyed by column name. Note that every value below is a
# quoted string, including the year and score entries.
x = {
  "year": ["2012", "2012", "2013", "2014", "2012", "2014", "2013", "2013", "2012", "2013", "2012", "2014", "2014", "2013", "2012", "2014"],
  "class": ["A", "B", "C", "A", "C", "B", "B", "C", "A", "C", "B", "C", "A", "C", "B", "A"],
  "gender": ["M", "F", "F", "M", "F", "M", "M", "F", "F", "F", "M", "M", "F", "M", "F", "F"],
  "score1": ["6", "6", "8", "10", "6", "7", "6", "7", "8", "7", "10", "9", "9", "9", "8", "9"],
  "score2": ["5", "9", "10", "5", "10", "9", "5", "7", "8", "9", "8", "8", "5", "5", "8", "5"],
  "score3": ["5", "9", "9", "7", "8", "5", "9", "5", "7", "6", "5", "10", "8", "8", "6", "8"],
  "score4": ["10", "8", "8", "10", "9", "8", "10", "9", "7", "8", "10", "9", "7", "7", "10", "7"]
}

# Build the DataFrame straight from the dict; no dtype conversion is applied here.
data = pd.DataFrame(x)

enter image description here

I want to find the median of every column with dtype 'int64', so I group my df by the class column:

# Group the rows by `class` and take the median of every remaining column;
# `numeric_only` is not passed, so the library default applies.
data.groupby('class').median()

But it raises an error.

---------------------------------------------------------------------------
NotImplementedError                       Traceback (most recent call last)
File c:\ProgramData\anaconda3\Lib\site-packages\pandas\core\groupby\groupby.py:1490, in GroupBy._cython_agg_general..array_func(values)
   1489 try:
-> 1490     result = self.grouper._cython_operation(
   1491         "aggregate",
   1492         values,
   1493         how,
   1494         axis=data.ndim - 1,
   1495         min_count=min_count,
   1496         **kwargs,
   1497     )
   1498 except NotImplementedError:
   1499     # generally if we have numeric_only=False
   1500     # and non-applicable functions
   1501     # try to python agg
   1502     # TODO: shouldn't min_count matter?

File c:\ProgramData\anaconda3\Lib\site-packages\pandas\core\groupby\ops.py:959, in BaseGrouper._cython_operation(self, kind, values, how, axis, min_count, **kwargs)
    958 ngroups = self.ngroups
--> 959 return cy_op.cython_operation(
    960     values=values,
    961     axis=axis,
    962     min_count=min_count,
    963     comp_ids=ids,
    964     ngroups=ngroups,
    965     **kwargs,
    966 )

File c:\ProgramData\anaconda3\Lib\site-packages\pandas\core\groupby\ops.py:657, in WrappedCythonOp.cython_operation(self, values, axis, min_count, comp_ids, ngroups, **kwargs)
    649     return self._ea_wrap_cython_operation(
    650         values,
    651         min_count=min_count,
   (...)
    654         **kwargs,
    655     )
--> 657 return self._cython_op_ndim_compat(
    658     values,
    659     min_count=min_count,
    660     ngroups=ngroups,
    661     comp_ids=comp_ids,
    662     mask=None,
    663     **kwargs,
    664 )

File c:\ProgramData\anaconda3\Lib\site-packages\pandas\core\groupby\ops.py:497, in WrappedCythonOp._cython_op_ndim_compat(self, values, min_count, ngroups, comp_ids, mask, result_mask, **kwargs)
    495     return res.T
--> 497 return self._call_cython_op(
    498     values,
    499     min_count=min_count,
    500     ngroups=ngroups,
    501     comp_ids=comp_ids,
    502     mask=mask,
    503     result_mask=result_mask,
    504     **kwargs,
    505 )

File c:\ProgramData\anaconda3\Lib\site-packages\pandas\core\groupby\ops.py:541, in WrappedCythonOp._call_cython_op(self, values, min_count, ngroups, comp_ids, mask, result_mask, **kwargs)
    540 out_shape = self._get_output_shape(ngroups, values)
--> 541 func = self._get_cython_function(self.kind, self.how, values.dtype, is_numeric)
    542 values = self._get_cython_vals(values)

File c:\ProgramData\anaconda3\Lib\site-packages\pandas\core\groupby\ops.py:167, in WrappedCythonOp._get_cython_function(cls, kind, how, dtype, is_numeric)
    165 if how in ["median", "cumprod"]:
    166     # no fused types -> no __signatures__
--> 167     raise NotImplementedError(
    168         f"function is not implemented for this dtype: "
    169         f"[how->{how},dtype->{dtype_str}]"
    170     )
    171 if "object" not in f.__signatures__:
    172     # raise NotImplementedError here rather than TypeError later

NotImplementedError: function is not implemented for this dtype: [how->median,dtype->object]

During handling of the above exception, another exception occurred:

ValueError                                Traceback (most recent call last)
File c:\ProgramData\anaconda3\Lib\site-packages\pandas\core\nanops.py:786, in nanmedian(values, axis, skipna, mask)
    785 try:
--> 786     values = values.astype("f8")
    787 except ValueError as err:
    788     # e.g. "could not convert string to float: 'a'"

ValueError: could not convert string to float: 'M'

The above exception was the direct cause of the following exception:

TypeError                                 Traceback (most recent call last)
Cell In[135], line 1
----> 1 data.groupby('class').median()

File c:\ProgramData\anaconda3\Lib\site-packages\pandas\core\groupby\groupby.py:1883, in GroupBy.median(self, numeric_only)
   1862 @final
   1863 def median(self, numeric_only: bool = False):
   1864     """
   1865     Compute median of groups, excluding missing values.
   1866 
   (...)
   1881         Median of values within each group.
   1882     """
-> 1883     result = self._cython_agg_general(
   1884         "median",
   1885         alt=lambda x: Series(x).median(numeric_only=numeric_only),
   1886         numeric_only=numeric_only,
   1887     )
   1888     return result.__finalize__(self.obj, method="groupby")

File c:\ProgramData\anaconda3\Lib\site-packages\pandas\core\groupby\groupby.py:1507, in GroupBy._cython_agg_general(self, how, alt, numeric_only, min_count, **kwargs)
   1503         result = self._agg_py_fallback(values, ndim=data.ndim, alt=alt)
   1505     return result
-> 1507 new_mgr = data.grouped_reduce(array_func)
   1508 res = self._wrap_agged_manager(new_mgr)
   1509 out = self._wrap_aggregated_output(res)

File c:\ProgramData\anaconda3\Lib\site-packages\pandas\core\internals\managers.py:1503, in BlockManager.grouped_reduce(self, func)
   1499 if blk.is_object:
   1500     # split on object-dtype blocks bc some columns may raise
   1501     #  while others do not.
   1502     for sb in blk._split():
-> 1503         applied = sb.apply(func)
   1504         result_blocks = extend_blocks(applied, result_blocks)
   1505 else:

File c:\ProgramData\anaconda3\Lib\site-packages\pandas\core\internals\blocks.py:329, in Block.apply(self, func, **kwargs)
    323 @final
    324 def apply(self, func, **kwargs) -> list[Block]:
    325     """
    326     apply the function to my values; return a block if we are not
    327     one
    328     """
--> 329     result = func(self.values, **kwargs)
    331     return self._split_op_result(result)

File c:\ProgramData\anaconda3\Lib\site-packages\pandas\core\groupby\groupby.py:1503, in GroupBy._cython_agg_general..array_func(values)
   1490     result = self.grouper._cython_operation(
   1491         "aggregate",
   1492         values,
   (...)
   1496         **kwargs,
   1497     )
   1498 except NotImplementedError:
   1499     # generally if we have numeric_only=False
   1500     # and non-applicable functions
   1501     # try to python agg
   1502     # TODO: shouldn't min_count matter?
-> 1503     result = self._agg_py_fallback(values, ndim=data.ndim, alt=alt)
   1505 return result

File c:\ProgramData\anaconda3\Lib\site-packages\pandas\core\groupby\groupby.py:1457, in GroupBy._agg_py_fallback(self, values, ndim, alt)
   1452     ser = df.iloc[:, 0]
   1454 # We do not get here with UDFs, so we know that our dtype
   1455 #  should always be preserved by the implemented aggregations
   1456 # TODO: Is this exactly right; see WrappedCythonOp get_result_dtype?
-> 1457 res_values = self.grouper.agg_series(ser, alt, preserve_dtype=True)
   1459 if isinstance(values, Categorical):
   1460     # Because we only get here with known dtype-preserving
   1461     #  reductions, we cast back to Categorical.
   1462     # TODO: if we ever get "rank" working, exclude it here.
   1463     res_values = type(values)._from_sequence(res_values, dtype=values.dtype)

File c:\ProgramData\anaconda3\Lib\site-packages\pandas\core\groupby\ops.py:994, in BaseGrouper.agg_series(self, obj, func, preserve_dtype)
    987 if len(obj) > 0 and not isinstance(obj._values, np.ndarray):
    988     # we can preserve a little bit more aggressively with EA dtype
    989     #  because maybe_cast_pointwise_result will do a try/except
    990     #  with _from_sequence.  NB we are assuming here that _from_sequence
    991     #  is sufficiently strict that it casts appropriately.
    992     preserve_dtype = True
--> 994 result = self._aggregate_series_pure_python(obj, func)
    996 npvalues = lib.maybe_convert_objects(result, try_float=False)
    997 if preserve_dtype:

File c:\ProgramData\anaconda3\Lib\site-packages\pandas\core\groupby\ops.py:1015, in BaseGrouper._aggregate_series_pure_python(self, obj, func)
   1012 splitter = self._get_splitter(obj, axis=0)
   1014 for i, group in enumerate(splitter):
-> 1015     res = func(group)
   1016     res = libreduction.extract_result(res)
   1018     if not initialized:
   1019         # We only do this validation on the first iteration

File c:\ProgramData\anaconda3\Lib\site-packages\pandas\core\groupby\groupby.py:1885, in GroupBy.median..(x)
   1862 @final
   1863 def median(self, numeric_only: bool = False):
   1864     """
   1865     Compute median of groups, excluding missing values.
   1866 
   (...)
   1881         Median of values within each group.
   1882     """
   1883     result = self._cython_agg_general(
   1884         "median",
-> 1885         alt=lambda x: Series(x).median(numeric_only=numeric_only),
   1886         numeric_only=numeric_only,
   1887     )
   1888     return result.__finalize__(self.obj, method="groupby")

File c:\ProgramData\anaconda3\Lib\site-packages\pandas\core\generic.py:11623, in NDFrame._add_numeric_operations..median(self, axis, skipna, numeric_only, **kwargs)
  11606 @doc(
  11607     _num_doc,
  11608     desc="Return the median of the values over the requested axis.",
   (...)
  11621     **kwargs,
  11622 ):
> 11623     return NDFrame.median(self, axis, skipna, numeric_only, **kwargs)

File c:\ProgramData\anaconda3\Lib\site-packages\pandas\core\generic.py:11212, in NDFrame.median(self, axis, skipna, numeric_only, **kwargs)
  11205 def median(
  11206     self,
  11207     axis: Axis | None = 0,
   (...)
  11210     **kwargs,
  11211 ) -> Series | float:
> 11212     return self._stat_function(
  11213         "median", nanops.nanmedian, axis, skipna, numeric_only, **kwargs
  11214     )

File c:\ProgramData\anaconda3\Lib\site-packages\pandas\core\generic.py:11158, in NDFrame._stat_function(self, name, func, axis, skipna, numeric_only, **kwargs)
  11154     nv.validate_stat_func((), kwargs, fname=name)
  11156 validate_bool_kwarg(skipna, "skipna", none_allowed=False)
> 11158 return self._reduce(
  11159     func, name=name, axis=axis, skipna=skipna, numeric_only=numeric_only
  11160 )

File c:\ProgramData\anaconda3\Lib\site-packages\pandas\core\series.py:4670, in Series._reduce(self, op, name, axis, skipna, numeric_only, filter_type, **kwds)
   4665     raise TypeError(
   4666         f"Series.{name} does not allow {kwd_name}={numeric_only} "
   4667         "with non-numeric dtypes."
   4668     )
   4669 with np.errstate(all="ignore"):
-> 4670     return op(delegate, skipna=skipna, **kwds)

File c:\ProgramData\anaconda3\Lib\site-packages\pandas\core\nanops.py:158, in bottleneck_switch.__call__..f(values, axis, skipna, **kwds)
    156         result = alt(values, axis=axis, skipna=skipna, **kwds)
    157 else:
--> 158     result = alt(values, axis=axis, skipna=skipna, **kwds)
    160 return result

File c:\ProgramData\anaconda3\Lib\site-packages\pandas\core\nanops.py:789, in nanmedian(values, axis, skipna, mask)
    786         values = values.astype("f8")
    787     except ValueError as err:
    788         # e.g. "could not convert string to float: 'a'"
--> 789         raise TypeError(str(err)) from err
    790 if mask is not None:
    791     values[mask] = np.nan

TypeError: could not convert string to float: 'M'

The traceback above shows that groupby also tries to aggregate the gender column. But when I watched someone on YouTube run the same code on the same dataframe, it worked fine and showed no error.

So the question is:

Upvotes: 1

Views: 45

Answers (2)

ouroboros1
ouroboros1

Reputation: 14354

Use groupby.median with numeric_only=True.

# Cast the year/score columns (currently `str`) to `int`
is_year_or_score = data.columns.str.contains('year|score')
cols = data.columns[is_year_or_score]
data[cols] = data[cols].astype(int)

# Per-class median, restricted to the numeric columns
grouped = data.groupby('class')
out = grouped.median(numeric_only=True)

Output:

         year  score1  score2  score3  score4
class                                        
A      2014.0     9.0     5.0     7.0     7.0
B      2012.0     7.0     8.0     6.0    10.0
C      2013.0     7.5     8.5     8.0     8.5

Alternatively, use df.drop to drop the non-numeric column first:

# Remove the non-numeric `gender` column up front, then aggregate as usual
without_gender = data.drop('gender', axis=1)
out2 = without_gender.groupby('class').median()

out2.equals(out) # True

In pandas <= 1.5, parameter numeric_only had True as the default. See here. Since 2.0.0, the default has become False (and None is no longer accepted).

Hence, the tutorial video you mention is doubtlessly using 1.5 or earlier, using groupby.median with default numeric_only=True implied.

Upvotes: 1

The issue is that the columns score1, score2, score3, and score4 in your DataFrame are stored as strings, not as numeric types. Do this:

import pandas as pd

# Raw records: every value, including the scores, is given as a string.
x = {
  "year": ["2012", "2012", "2013", "2014", "2012", "2014", "2013", "2013", "2012", "2013", "2012", "2014", "2014", "2013", "2012", "2014"],
  "class": ["A", "B", "C", "A", "C", "B", "B", "C", "A", "C", "B", "C", "A", "C", "B", "A"],
  "gender": ["M", "F", "F", "M", "F", "M", "M", "F", "F", "F", "M", "M", "F", "M", "F", "F"],
  "score1": ["6", "6", "8", "10", "6", "7", "6", "7", "8", "7", "10", "9", "9", "9", "8", "9"],
  "score2": ["5", "9", "10", "5", "10", "9", "5", "7", "8", "9", "8", "8", "5", "5", "8", "5"],
  "score3": ["5", "9", "9", "7", "8", "5", "9", "5", "7", "6", "5", "10", "8", "8", "6", "8"],
  "score4": ["10", "8", "8", "10", "9", "8", "10", "9", "7", "8", "10", "9", "7", "7", "10", "7"]
}

data = pd.DataFrame(x)

# Convert the four score columns from strings to numbers; `year` is left as-is.
score_cols = ["score1", "score2", "score3", "score4"]
data[score_cols] = data[score_cols].apply(pd.to_numeric)

# Keep only the numeric columns, re-attach the grouping key, then take the
# per-class median.
numeric_cols = data.select_dtypes(include='number')
keyed = numeric_cols.join(data[['class']])
result = keyed.groupby('class').median()
print(result)

which gives

      score1  score2  score3  score4
class                                
A         9.0     5.0     7.0     7.0
B         7.0     8.0     6.0    10.0
C         7.5     8.5     8.0     8.5

Upvotes: 1

Related Questions