How to make FunctionTransformer work in DataFrameMapper

I have a column in my pandas DataFrame that looks like this:

df = pd.DataFrame([
    ['26.6 km'],
    ['19.67 km'],
    ['18.2 km'],
    ['20.77 km'],
    ['15.2 km'],
], columns=['Mileage'])

I have a function that removes the ' km' from the column:

def remove_words(column):
    """Return the text before the first space of every entry in ``column``.

    Relies on the ``.str`` accessor, so ``column`` must provide one
    (for example a pandas Series of strings).
    """
    pieces = column.str.split(' ')
    return pieces.str[0]

When I put it in my DataFrameMapper as:

# Route the 'Mileage' column through remove_words via FunctionTransformer;
# df_out=True asks the mapper for DataFrame output.
mapper = DataFrameMapper([
     ('Mileage', [FunctionTransformer(remove_words)]),
     ], df_out=True)

...it raises the error "'numpy.ndarray' object has no attribute 'str'".

Help!

Upvotes: 1

Views: 823

Answers (2)

emehex
emehex

Reputation: 10538

As of sklearn-pandas 2.0.0, some existing answers no longer work.

Here's how I am now implementing this workflow:

import numpy as np
import pandas as pd
from sklearn.preprocessing import FunctionTransformer
from sklearn_pandas import DataFrameMapper

df = pd.DataFrame([
    ['26.6 km'],
    ['19.67 km'],
    ['18.2 km'],
    ['20.77 km'],
    ['15.2 km'],
], columns=['Mileage'])

def text_transform(text):
    """Return the part of ``text`` that precedes the first space."""
    return text.split(' ')[0]

def function_to_column(column):
    """Apply ``text_transform`` to every value of a column.

    Parameters
    ----------
    column : array-like of str
        Values handed over by the mapper; any shape is accepted because the
        input is flattened first.

    Returns
    -------
    numpy.ndarray
        One-dimensional array with one transformed string per input value.
    """
    # Flatten to 1-D. ``np.ravel`` replaces ``np.reshape(..., newshape=(-1))``,
    # whose ``newshape`` keyword is deprecated in recent NumPy releases.
    flat = np.ravel(column)
    # Transform element by element, then hand back a NumPy array again.
    return np.array([text_transform(single_text) for single_text in flat])

# Map the 'Mileage' column through function_to_column. validate=False is
# passed to FunctionTransformer, presumably so the string values are not
# rejected by input validation (confirm against the installed scikit-learn).
mapper = DataFrameMapper([
    ('Mileage', FunctionTransformer(function_to_column, validate=False))
 ], df_out=True)

# Fit the mapper and transform df in a single call.
mapper.fit_transform(df)

This should generate:

    Mileage
0   26.6
1   19.67
2   18.2
3   20.77
4   15.2

Upvotes: 0

E. Zeytinci
E. Zeytinci

Reputation: 2643

Use `str.extract` or `str.replace`:

# Raw string so the regex escapes (\d, \.) reach the regex engine untouched.
df['Mileage'] = df['Mileage'].str.extract(r'(\d*\.?\d*)', expand=False).astype(float)

or,

# regex=True is required: since pandas 2.0, str.replace treats the pattern
# as a literal string by default, which would leave ' km' in place.
df['Mileage'] = df['Mileage'].str.replace(r'[^\d.]', '', regex=True).astype(float)

Here is an example:

>>> import pandas as pd
>>> df = pd.DataFrame([
    ['26.6 km'],
    ['19.67 km'],
    ['18.2 km'],
    ['20.77 km'],
    ['15.2 km'],
], columns=['Mileage'])
>>> df['Mileage'].str.extract('(\d*\.?\d*)', expand=False).astype(float)
0    26.60
1    19.67
2    18.20
3    20.77
4    15.20
Name: Mileage, dtype: float64
>>> df['Mileage'].str.replace('[^\d.]', '').astype(float)
0    26.60
1    19.67
2    18.20
3    20.77
4    15.20
Name: Mileage, dtype: float64

Or if you want to use DataFrameMapper and FunctionTransformer from sklearn_pandas,

from sklearn_pandas import DataFrameMapper, FunctionTransformer

def remove_words(val):
    """Return the leading part of ``val``, up to the first space."""
    head, _, _ = val.partition(' ')
    return head

# FunctionTransformer here is the one imported from sklearn_pandas in this
# snippet. remove_words takes a single string, so this assumes the transformer
# applies it value by value (TODO: confirm against the installed version).
mapper = DataFrameMapper([
     ('Mileage', [FunctionTransformer(remove_words)]),
     ], df_out=True)

print(mapper.fit_transform(df))

  Mileage
0    26.6
1   19.67
2    18.2
3   20.77
4    15.2

For sklearn.preprocessing.FunctionTransformer,

from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import FunctionTransformer
import numpy as np

def remove_words(vals):
    """Return an array holding the first space-separated token of each row.

    Each element of ``vals`` is indexed with ``[0]`` to reach its string.
    """
    tokens = []
    for row in vals:
        cell = row[0]
        tokens.append(cell.split(' ', 1)[0])
    return np.array(tokens)

# The column selector is a list (['Mileage']). remove_words indexes v[0] on
# every element it receives, so each row is presumably delivered as a
# length-1 sequence (verify against the sklearn_pandas documentation).
mapper = DataFrameMapper([
     (['Mileage'], [FunctionTransformer(remove_words, validate=False)]),
     ], df_out=True)

print(mapper.fit_transform(df))

  Mileage
0    26.6
1   19.67
2    18.2
3   20.77
4    15.2

Or use numpy.vectorize

from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import FunctionTransformer
import numpy as np

# Element-wise helper: keeps only the text before the first space.
func = np.vectorize(lambda x: x.partition(' ')[0])

def remove_words(vals):
    """Apply the vectorized ``func`` to ``vals`` and return its result."""
    stripped = func(vals)
    return stripped

# Same list selector as before; remove_words hands whatever array the mapper
# passes in to the vectorized func, which processes it element by element.
mapper = DataFrameMapper([
     (['Mileage'], [FunctionTransformer(remove_words, validate=False)]),
     ], df_out=True)

print(mapper.fit_transform(df))

  Mileage
0    26.6
1   19.67
2    18.2
3   20.77
4    15.2

Upvotes: 1

Related Questions