Reputation: 57
I have a column in my pandas DataFrame that looks like this:
# Single-column frame of distance strings, each written as "<number> km".
df = pd.DataFrame({
    'Mileage': ['26.6 km', '19.67 km', '18.2 km', '20.77 km', '15.2 km'],
})
I have a function that removes the ' km' from the column:
def remove_words(column):
    """Return the text before the first space of each entry in *column*.

    Args:
        column: A pandas Series of strings such as ``'26.6 km'``. The
            ``.str`` accessor is used, so an object without that accessor
            (for example a plain ``numpy.ndarray``) raises
            ``AttributeError``.

    Returns:
        A Series holding the first space-separated token of every entry.
    """
    return column.str.split(' ').str[0]
when I put it in my DataFrameMapper as:
# Route the 'Mileage' column through remove_words, wrapped in a
# FunctionTransformer. This is the configuration reported to fail with
# "'numpy.ndarray' object has no attribute 'str'".
mapper = DataFrameMapper([
    ('Mileage', [FunctionTransformer(remove_words)]),
], df_out=True)
...it raises the error "'numpy.ndarray' object has no attribute 'str'".
Help!
Upvotes: 1
Views: 823
Reputation: 10538
As of sklearn-pandas=2.0.0
some existing answers no longer work.
Here's how I am now implementing this workflow:
import numpy as np
import pandas as pd
from sklearn.preprocessing import FunctionTransformer
from sklearn_pandas import DataFrameMapper
# Sample data: one 'Mileage' column of strings in the form "<number> km".
df = pd.DataFrame({
    'Mileage': ['26.6 km', '19.67 km', '18.2 km', '20.77 km', '15.2 km'],
})
def text_transform(text):
    """Return the part of *text* that precedes its first space."""
    return text.split(' ')[0]


def function_to_column(column):
    """Apply ``text_transform`` to every entry of *column*.

    Args:
        column: Array-like of strings. Any shape is accepted because the
            input is flattened before it is processed.

    Returns:
        A one-dimensional ``numpy.ndarray`` of the transformed strings.
    """
    # Flatten so that 1-D and (n, 1) inputs are both walked entry by entry.
    # ``asarray(...).reshape(-1)`` is used instead of
    # ``np.reshape(..., newshape=...)`` because newer NumPy releases
    # deprecate the ``newshape`` keyword.
    flat = np.asarray(column).reshape(-1)
    # Rebuild an array from the per-entry results.
    return np.array([text_transform(single_text) for single_text in flat])
# Apply function_to_column to the 'Mileage' column.
# NOTE(review): validate=False presumably disables scikit-learn's input
# validation so that string data is passed through untouched — confirm
# against the FunctionTransformer documentation.
mapper = DataFrameMapper([
    ('Mileage', FunctionTransformer(function_to_column, validate=False))
], df_out=True)
# Fit and transform in one step; the expected result is listed below.
mapper.fit_transform(df)
This should generate:
Mileage
0 26.6
1 19.67
2 18.2
3 20.77
4 15.2
Upvotes: 0
Reputation: 2643
# Capture the leading decimal number of each value and convert it to float.
# The pattern is a raw string so that \d and \. reach the regex engine as
# written instead of being parsed as (invalid) string escape sequences.
df['Mileage'] = df['Mileage'].str.extract(r'(\d*\.?\d*)', expand=False).astype(float)
or,
# Drop every character that is not a digit or a dot, then convert to float.
# regex=True is spelled out because pandas >= 2.0 treats the pattern as a
# literal string by default; the ' km' suffix would then survive and
# astype(float) would raise.
df['Mileage'] = df['Mileage'].str.replace(r'[^\d.]', '', regex=True).astype(float)
Here is an example:
>>> import pandas as pd
>>> df = pd.DataFrame([
['26.6 km'],
['19.67 km'],
['18.2 km'],
['20.77 km'],
['15.2 km'],
], columns=['Mileage'])
>>> df['Mileage'].str.extract(r'(\d*\.?\d*)', expand=False).astype(float)
0 26.60
1 19.67
2 18.20
3 20.77
4 15.20
Name: Mileage, dtype: float64
>>> df['Mileage'].str.replace(r'[^\d.]', '', regex=True).astype(float)
0 26.60
1 19.67
2 18.20
3 20.77
4 15.20
Name: Mileage, dtype: float64
Or, if you want to use DataFrameMapper and FunctionTransformer from sklearn_pandas:
from sklearn_pandas import DataFrameMapper, FunctionTransformer
def remove_words(val):
    """Return the part of the string *val* that precedes its first space.

    A value without any space is returned unchanged.
    """
    return val.split(' ')[0]
# The column is selected with a plain string, and remove_words is written
# for a single string (it calls val.split directly).
# NOTE(review): FunctionTransformer here is the name imported from
# sklearn_pandas, not sklearn.preprocessing — whether that import exists may
# depend on the installed sklearn-pandas version; verify.
mapper = DataFrameMapper([
    ('Mileage', [FunctionTransformer(remove_words)]),
], df_out=True)
# Fit and transform in one step and print the result.
print(mapper.fit_transform(df))
Mileage
0 26.6
1 19.67
2 18.2
3 20.77
4 15.2
For sklearn.preprocessing.FunctionTransformer:
from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import FunctionTransformer
import numpy as np
def remove_words(vals):
    """Cut the first element of every row of *vals* at its first space.

    Args:
        vals: Iterable of rows whose first element is a string, e.g. an
            ``(n, 1)`` array such as ``[['26.6 km'], ['19.67 km']]``.

    Returns:
        A one-dimensional ``numpy.ndarray`` with the leading token of each
        row.
    """
    return np.array([v[0].split(' ')[0] for v in vals])
# The column is selected with a list (['Mileage']); remove_words reads each
# row as v[0], i.e. it expects one-element rows rather than bare strings.
# NOTE(review): validate=False presumably keeps scikit-learn from validating
# (and rejecting) the string input — confirm in the scikit-learn docs.
mapper = DataFrameMapper([
    (['Mileage'], [FunctionTransformer(remove_words, validate=False)]),
], df_out=True)
# Fit and transform in one step and print the result.
print(mapper.fit_transform(df))
Mileage
0 26.6
1 19.67
2 18.2
3 20.77
4 15.2
Or use numpy.vectorize
from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import FunctionTransformer
import numpy as np
# Element-wise "text before the first space". otypes is given explicitly so
# the wrapper also accepts empty input; without it np.vectorize has to call
# the function on the first element to infer the output type and raises when
# there is none.
func = np.vectorize(lambda x: x.split(' ')[0], otypes=[str])


def remove_words(vals):
    """Return an array shaped like *vals* with each string cut at its first space."""
    return func(vals)
# The column is selected with a list (['Mileage']); remove_words hands the
# whole input to the np.vectorize-wrapped splitter, so no per-row indexing
# is needed here.
mapper = DataFrameMapper([
    (['Mileage'], [FunctionTransformer(remove_words, validate=False)]),
], df_out=True)
# Fit and transform in one step and print the result.
print(mapper.fit_transform(df))
Mileage
0 26.6
1 19.67
2 18.2
3 20.77
4 15.2
Upvotes: 1