Reputation: 511
I have a CSV file that I would like to load into a Parquet file on my hard drive, and then run SQL queries against it using the spark-sql CLI. Is there a spark-sql command or two that would do this?
If spark-sql is not the right tool, what would you suggest as the simplest way to load CSV into Parquet? After that step, my next task is to run SQL queries on the data.
Upvotes: 2
Views: 322
Reputation: 1214
package spark

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, trim}

object csv2parquet extends App {

  val spark = SparkSession.builder()
    .master("local")
    .appName("CSV-Parquet")
    .getOrCreate()

  import spark.implicits._

  val sourceFile = "/<path file>/test.csv" // bad data in file
  val targetFile = "/<path file>/testResult.parquet"

  // read csv file; with no header the columns come out as _c0.._c3
  val df1 = spark.read.option("header", false).csv(sourceFile)
  df1.show(false)
  // +-------+-------+----------+-----------+
  // |_c0    |_c1    |_c2       |_c3        |
  // +-------+-------+----------+-----------+
  // |Header |TestApp|2020-01-01|null       |
  // |name   | dept  | age      | batchDate |
  // |john   | dept1 | 33       | 2020-01-01|
  // |john   | dept1 | 33       | 2020-01-01|
  // |john   | dept1 | 33       | 2020-01-01|
  // |john   | dept1 | 33       | 2020-01-01|
  // |Trailer|count  |4         |null       |
  // +-------+-------+----------+-----------+
  // write data to parquet, then read it back to verify
  df1.write.mode("append").parquet(targetFile)
  val resDF = spark.read.parquet(targetFile)
  resDF.show(false)
  // +-------+-------+----------+-----------+
  // |_c0    |_c1    |_c2       |_c3        |
  // +-------+-------+----------+-----------+
  // |Header |TestApp|2020-01-01|null       |
  // |name   | dept  | age      | batchDate |
  // |john   | dept1 | 33       | 2020-01-01|
  // |john   | dept1 | 33       | 2020-01-01|
  // |john   | dept1 | 33       | 2020-01-01|
  // |john   | dept1 | 33       | 2020-01-01|
  // |Trailer|count  |4         |null       |
  // +-------+-------+----------+-----------+
  // filter with the DataFrame API (a plain-SQL version follows below)
  resDF
    .filter(trim(col("_c2")).equalTo(33))
    .select(col("_c2"))
    .show(false)
  // +---+
  // |_c2|
  // +---+
  // | 33|
  // | 33|
  // | 33|
  // | 33|
  // +---+
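
  // The question also asks about running SQL against the data. A minimal sketch,
  // using the Parquet-backed DataFrame above: register it as a temporary view
  // (the name "test_data" is just an illustration) and query it with spark.sql.
  resDF.createOrReplaceTempView("test_data")
  spark.sql("SELECT _c2 FROM test_data WHERE trim(_c2) = '33'").show(false)
  // returns the same four rows as the DataFrame filter above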
}
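If you would rather stay entirely in SQL, as the question about the spark-sql CLI suggests, the same flow can be sketched with data-source DDL. This is only a sketch under a few assumptions: the table names csv_raw and parquet_out are arbitrary, it assumes an active SparkSession named spark (e.g. the one built above), and a CREATE TABLE ... AS SELECT without an explicit location writes under the warehouse directory rather than a path you choose. The bare SQL statements inside the spark.sql(...) calls can also be pasted into the spark-sql shell.
// Sketch only: CSV -> SQL table -> Parquet table, all via SQL statements.
spark.sql(
  """CREATE TABLE csv_raw
    |USING csv
    |OPTIONS (path '/<path file>/test.csv', header 'false')""".stripMargin)

spark.sql("CREATE TABLE parquet_out USING parquet AS SELECT * FROM csv_raw")

spark.sql("SELECT _c2 FROM parquet_out WHERE trim(_c2) = '33'").show(false)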
Upvotes: 1