Reputation: 129
I have a DataFrame like this.
Name City Name_index City_index
Ali lhr 2.0 0.0
abc swl 0.0 2.0
xyz khi 1.0 1.0
I want to drop the columns whose names don't contain the string "index".
The expected output is:
Name_index City_index
2.0 0.0
0.0 2.0
1.0 1.0
I have tried this.
// All column names of the input DataFrame.
val cols = newDF.columns
// Negative lookahead at every position: matches only names that do NOT contain "_indexed".
// NOTE(review): the sample columns shown in the question end in "_index", not "_indexed",
// so presumably every column name matches this pattern.
val regex = """^((?!_indexed).)*$""".r
// Keeps the names the regex matches, i.e. the ones without "_indexed".
val selection = cols.filter(s => regex.findFirstIn(s).isDefined)
// NOTE(review): the result of this diff is discarded; it is never assigned or used below.
cols.diff(selection)
// Selects the names held in `selection`, not the complement computed on the previous line.
val res =newDF.select(selection.head, selection.tail : _*)
res.show()
But I am getting this:
Name City
Ali lhr
abc swl
xyz khi
Upvotes: 2
Views: 1894
Reputation: 4133
import org.apache.spark.sql.functions.col
// Negative lookahead at every position: a name matches only if "_indexed" occurs nowhere in it.
val withoutIndexed = """^((?!_indexed).)*$""".r

// Four non-nullable fields: two plain string columns and their "_indexed" counterparts.
val fields = Seq(
  StructField("Name", StringType, false),
  StructField("City", StringType, false),
  StructField("Name_indexed", IntegerType, false),
  StructField("City_indexed", LongType, false))
val tableSchema = StructType(fields)

// A DataFrame with the schema above and no rows; only the column names matter here.
val noRows: DataFrame = spark.createDataFrame(spark.sparkContext.emptyRDD[Row], tableSchema)

// Keep the field names the pattern matches in full, then project onto exactly those columns.
val keptNames = tableSchema
  .map(field => field.name)
  .filter(name => withoutIndexed.pattern.matcher(name).matches())
noRows.select(keptNames.map(col): _*).show()
It gives
+----+----+
|Name|City|
+----+----+
+----+----+
Upvotes: 0
Reputation: 4045
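There is a typo in your regex: it looks for `_indexed`, but your column names end in `_index`. I fixed that in the code below, which also selects the columns that the regex does not match: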
import org.apache.spark.sql.SparkSession
/** Demonstrates keeping only the DataFrame columns whose name contains "_index". */
object FilterColumn {

  /**
   * Builds a one-row DataFrame from `PersonCity`, prints it, and then prints the
   * projection that keeps only the columns whose name contains "_index".
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    import org.apache.spark.sql.functions.col

    val spark = SparkSession.builder().master("local[*]").getOrCreate()
    import spark.implicits._

    try {
      val newDF = List(PersonCity("Ali", "lhr", 2.0, 0.0)).toDF()
      newDF.show()

      // Negative lookahead at every position: the pattern matches only names that do
      // NOT contain "_index", so the columns to keep are the ones it fails to match.
      val regex = """^((?!_index).)*$""".r
      val finalCols = newDF.columns.filter(name => regex.findFirstIn(name).isEmpty)

      // select(Column*) accepts an empty list, whereas select(head, tail: _*) would
      // throw NoSuchElementException when no column name contains "_index".
      val res = newDF.select(finalCols.map(col): _*)
      res.show()
    } finally {
      // Release the local Spark session even if one of the steps above fails.
      spark.stop()
    }
  }
}
/**
 * One row of the sample data set. The field names become the DataFrame column
 * names when a collection of these is converted with `toDF()`.
 *
 * @param Name       value of the `Name` column
 * @param City       value of the `City` column
 * @param Name_index numeric value of the `Name_index` column
 * @param City_index numeric value of the `City_index` column
 */
case class PersonCity(
  Name: String,
  City: String,
  Name_index: Double,
  City_index: Double
)
Upvotes: 1