PySpark: TypeError: unsupported operand type(s) for +: 'datetime.datetime' and 'str'

Question

I have a DataFrame in PySpark that has the following schema:

root
 |-- id: string (nullable = true)
 |-- date: timestamp (nullable = true)
 |-- time: string (nullable = true)
 |-- start: timestamp (nullable = true)
 |-- end: timestamp (nullable = true)

I want to add one more column, date_time, of type timestamp, built by combining the date and time columns:

import datetime

to_datetime_func =  udf (lambda d, t: datetime.strptime(d+" "+t, "%Y-%m-%d %H:%M:%S"), TimestampType())
df = df.withColumn("date_time", to_datetime_func("date","time"))

This code runs without errors. However, when I run a simple filter operation that uses the date_time column, I get an error (the schema, the query, and the error are shown below):

root
 |-- id: string (nullable = true)
 |-- date_time: timestamp (nullable = true)
 |-- start: timestamp (nullable = true)
 |-- end: timestamp (nullable = true)


from pyspark.sql import functions as func

df \
    .filter(func.col("date_time")>=func.col("start"))
    .select("id","date_time","start") \
    .show()

Error:

Py4JJavaError: An error occurred while calling o2966.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 2 in stage 30.0 failed 4 times, most recent failure: Lost task 2.3 in stage 30.0 (TID 765, 10.139.64.4, executor 0): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/databricks/spark/python/pyspark/worker.py", line 403, in main
    process()
  File "/databricks/spark/python/pyspark/worker.py", line 398, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/databricks/spark/python/pyspark/serializers.py", line 365, in dump_stream
    self.serializer.dump_stream(self._batched(iterator), stream)
  File "/databricks/spark/python/pyspark/serializers.py", line 147, in dump_stream
    for obj in iterator:
  File "/databricks/spark/python/pyspark/serializers.py", line 354, in _batched
    for item in iterator:
  File "", line 1, in 
  File "/databricks/spark/python/pyspark/worker.py", line 83, in 
    return lambda *a: toInternal(f(*a))
  File "/databricks/spark/python/pyspark/util.py", line 99, in wrapper
    return f(*args, **kwargs)
  File "", line 1, in 
TypeError: unsupported operand type(s) for +: 'datetime.datetime' and 'str'

    at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:490)
    at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:81)
    at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:64)
    at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:444)
    at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
    at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
    at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
    at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
    at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage2.processNext(Unknown Source)
    at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
    at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:638)
    at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
    at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
    at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
    at scala.collection.Iterator$GroupedIterator.fill(Iterator.scala:1124)
    at scala.collection.Iterator$GroupedIterator.hasNext(Iterator.scala:1130)
    at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
    at scala.collection.Iterator$class.foreach(Iterator.scala:891)
    at scala.collection.AbstractIterator.foreach(Iterator.scala:1334)
    at org.apache.spark.api.python.PythonRDD$.writeIteratorToStream(PythonRDD.scala:299)
    at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$2.writeIteratorToStream(PythonUDFRunner.scala:50)
    at org.apache.spark.api.python.BasePythonRunner$WriterThread$$anonfun$run$1.apply(PythonRunner.scala:383)
    at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:2076)
    at org.apache.spark.api.python.BasePythonRunner$WriterThread.run(PythonRunner.scala:223)

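UPDATE: I tried casting the date column to a string before concatenating it with time, but now I get a different error: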

my_concat_func =  udf (lambda d, t: datetime.strptime(d+" "+t, "%Y-%m-%d %H:%M:%S"), StringType())
df = df.withColumn("date", df["date"].cast(StringType()))
df = df.withColumn("date_time", my_concat_func("date","time"))


df.select("date","time","date_time").printSchema()

root
 |-- date: string (nullable = true)
 |-- time: string (nullable = true)
 |-- date_time: string (nullable = true)


df.select("date","time","date_time").show()

ValueError: unconverted data remains: 03:34:26

PySpark: TypeError: unsupported operand type(s) for +: 'datetime.datetime' and 'str'

Answers (1)

Related Questions

PySpark: TypeError: unsupported operand type(s) for +: 'datetime.datetime' and 'str'

Answers (1)

Related Questions

PySpark: TypeError: unsupported operand type(s) for +: 'datetime.datetime' and 'str'