Reputation: 1213
What is wrong with my code? I am using PySpark to convert the data type of a column.
# Cast the financial columns to DoubleType.
# Fix: a DataFrame column is accessed with square brackets (company_df["Revenue"])
# or F.col("Revenue") -- calling the DataFrame like a function, company_df("Revenue"),
# is what raised "'DataFrame' object has no attribute 'cast'".
company_df = (company_df.withColumn("Revenue", company_df["Revenue"].cast(DoubleType()))
              .withColumn("GROSS_PROFIT", company_df["GROSS_PROFIT"].cast(DoubleType()))
              .withColumn("Net_Income", company_df["Net_Income"].cast(DoubleType()))
              .withColumn("Enterprise_Value", company_df["Enterprise_Value"].cast(DoubleType()))
              )
I am getting this error:
AttributeError: 'DataFrame' object has no attribute 'cast'
Upvotes: 0
Views: 9562
Reputation: 44941
A short, clean, scalable solution
Change some columns, leave the rest untouched
import pyspark.sql.functions as F

# That's not part of the solution, just a creation of a sample dataframe
# df = spark.createDataFrame([(10, 1,2,3,4),(20, 5,6,7,8)],'Id int, Revenue int ,GROSS_PROFIT int ,Net_Income int ,Enterprise_Value int')

# Columns to convert to double; every other column is passed through unchanged.
cols_to_cast = ["Revenue" ,"GROSS_PROFIT" ,"Net_Income" ,"Enterprise_Value"]

# Build one projection entry per existing column: a cast expression for the
# listed columns, the bare column name for the rest.
projection = []
for column_name in df.columns:
    if column_name in cols_to_cast:
        projection.append(F.col(column_name).cast('double'))
    else:
        projection.append(column_name)
df = df.select(projection)
df.printSchema()
root
|-- Id: integer (nullable = true)
|-- Revenue: double (nullable = true)
|-- GROSS_PROFIT: double (nullable = true)
|-- Net_Income: double (nullable = true)
|-- Enterprise_Value: double (nullable = true)
Upvotes: 3
Reputation: 1739
Alternatively to @wwnde's answer, you could do something like the following -
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Add new *_cast columns containing the double-typed values of the originals.
# Fix: the first withColumn must read the existing "Revenue" column --
# col("Revenue_cast") referenced a column that does not exist yet and would
# fail with an unresolved-column analysis error.
company_df = (company_df.withColumn("Revenue_cast" , col("Revenue").cast(DoubleType()))
              .withColumn("GROSS_PROFIT_cast", col("GROSS_PROFIT").cast(DoubleType()))
              .withColumn("Net_Income_cast" , col("Net_Income").cast(DoubleType()))
              .withColumn("Enterprise_Value_cast", col("Enterprise_Value").cast(DoubleType()))
              )
Or,
# Same result using bracket indexing on the DataFrame instead of col().
# Fix: the GROSS_PROFIT and Net_Income lines were missing the closing "]" --
# company_df["GROSS_PROFIT".cast(...)] is a syntax/attribute error because
# .cast was being applied to the string literal, not the Column.
company_df = (company_df.withColumn("Revenue_cast" , company_df["Revenue"].cast(DoubleType()))
              .withColumn("GROSS_PROFIT_cast", company_df["GROSS_PROFIT"].cast(DoubleType()))
              .withColumn("Net_Income_cast" , company_df["Net_Income"].cast(DoubleType()))
              .withColumn("Enterprise_Value_cast", company_df["Enterprise_Value"].cast(DoubleType()))
              )
Upvotes: 1
Reputation: 26676
In case this helps:
# Build a small (Time, Tag1) demo frame: Time counts 1..12, Tag1 is a flag.
tag_values = [0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0]
rows = [(time, tag) for time, tag in enumerate(tag_values, start=1)]
df = spark.createDataFrame(rows, ('Time', 'Tag1'))

# Derive 'a' as Time cast to integer and 'a1' as Tag1 cast to double.
df = (df.withColumn('a', col('Time').cast('integer'))
        .withColumn('a1', col('Tag1').cast('double')))
df.printSchema()
df.show()
Upvotes: 1