Reputation: 831
I have a dataframe with "Week" and "Year" columns and need to calculate the month for it, as below:
Input:
+----+----+
|Week|Year|
+----+----+
|  50|2012|
|  50|2012|
|  50|2012|
+----+----+
Expected output:
+----+----+-----+
|Week|Year|Month|
+----+----+-----+
|  50|2012|   12|
|  50|2012|   12|
|  50|2012|   12|
+----+----+-----+
Any help would be appreciated. Thanks
Upvotes: 1
Views: 2781
Reputation: 40360
Thanks to @zero323, who pointed me to the sqlContext.sql query, I converted it into the following:
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import static org.apache.spark.sql.functions.*;

public class MonthFromWeekSparkSQL {

    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("MonthFromWeekSparkSQL").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);
        SQLContext sqlContext = new SQLContext(sc);

        List<Row> myList = Arrays.asList(
                RowFactory.create(50, 2012),
                RowFactory.create(50, 2012),
                RowFactory.create(50, 2012));
        JavaRDD<Row> myRDD = sc.parallelize(myList);

        List<StructField> structFields = new ArrayList<StructField>();

        // Create StructFields
        StructField structField1 = DataTypes.createStructField("week", DataTypes.IntegerType, true);
        StructField structField2 = DataTypes.createStructField("year", DataTypes.IntegerType, true);

        // Add StructFields into list
        structFields.add(structField1);
        structFields.add(structField2);

        // Create StructType from StructFields. This will be used to create the DataFrame
        StructType schema = DataTypes.createStructType(structFields);
        DataFrame df = sqlContext.createDataFrame(myRDD, schema);

        // Build a "yyyy w" string, parse it with unix_timestamp, cast to timestamp,
        // extract the month, then drop the helper column
        DataFrame df2 = df.withColumn("yearAndWeek", concat(col("year"), lit(" "), col("week")))
                .withColumn("month", month(unix_timestamp(col("yearAndWeek"), "yyyy w").cast("timestamp")))
                .drop("yearAndWeek");

        df2.show();
    }
}
You actually create a new column with year and week formatted as "yyyy w", then parse it with unix_timestamp and cast the result to a timestamp, from which you can pull the month as shown above.
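For reference, a minimal sketch of what the equivalent raw sqlContext.sql version might look like (the temporary table name "df" is my own choice for illustration, not something from @zero323's original query):

// Register the DataFrame so it can be queried with SQL (Spark 1.x API);
// the temp table name "df" is assumed for illustration
df.registerTempTable("df");
DataFrame withMonth = sqlContext.sql(
        "SELECT week, year, "
        + "month(CAST(unix_timestamp(concat(year, ' ', week), 'yyyy w') AS timestamp)) AS month "
        + "FROM df");
withMonth.show();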
PS: It seems that the cast behavior was incorrect in Spark 1.5 - https://issues.apache.org/jira/browse/SPARK-11724
So in that case, it's safer to do .cast("double").cast("timestamp")
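Applied to the snippet above, that workaround is just one extra cast in the chain (same logic otherwise):

// Workaround for SPARK-11724: cast to double first, then to timestamp
DataFrame df2 = df.withColumn("yearAndWeek", concat(col("year"), lit(" "), col("week")))
        .withColumn("month", month(unix_timestamp(col("yearAndWeek"), "yyyy w")
                .cast("double")
                .cast("timestamp")))
        .drop("yearAndWeek");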
Upvotes: 2