Reputation: 1430
I have a test Spark environment (single node) running on AWS. I ran a few ad-hoc queries in the PySpark shell and everything went as expected; however, when I run the application using spark-submit, I get an error.
Below is the code:
from __future__ import print_function
from pyspark import SparkContext, SparkConf
from pyspark.sql.session import SparkSession
from pyspark.sql import SQLContext as sql
conf = SparkConf().setAppName("myapp")
sc = SparkContext(conf=conf)
spark = SparkSession(sc)
if __name__ == "__main__":
    # inp_data = loaded data from db
    df = inp_data.select('Id', 'DueDate', 'Principal', 'delay', 'unpaid_emi', 'future_payment')
    filterd_unpaid_emi = df.filter(df.unpaid_emi == 1)
    par = filterd_unpaid_emi.groupBy('Id').sum('Principal').withColumnRenamed('sum(Principal)', 'par')
    temp_df = df.filter(df.unpaid_emi == 1)
    temp_df_1 = temp_df.filter(temp_df.future_payment == 0)
    temp_df_1.registerTempTable("mytable")
    bucket_df_1 = sql("""select *, case
                             when delay<0 and delay ==0 then '9999'
                             when delay>0 and delay<7 then '9'
                             when delay>=7 and delay<=14 then '8'
                             when delay>=15 and delay<=29 then '7'
                             when delay>=30 and delay<=59 then '6'
                             when delay>=60 and delay<=89 then '5'
                             when delay>=90 and delay<=119 then '4'
                             when delay>=120 and delay<=149 then '3'
                             when delay>=150 and delay<=179 then '2'
                             else '1'
                         end as bucket
                         from mytable""")
    bucket_df_1 = bucket_df_1.select(bucket_df_1.Id, bucket_df_1.Principal, bucket_df_1.delay,
                                     bucket_df_1.unpaid_emi, bucket_df_1.future_payment,
                                     bucket_df_1.bucket.cast("int").alias('buckets'))
    min_bucket = bucket_df_1.groupBy('Id').min('buckets').withColumnRenamed('min(buckets)', 'max_delay')
    joinedDf = par.join(min_bucket, ["Id"])
    # joinedDf.printSchema()
And below is the command to submit the application:
spark-submit \
--master yarn \
--driver-class-path /path to/mysql-connector-java-5.0.8-bin.jar \
--jars /path to/mysql-connector-java-5.0.8-bin.jar \
/path to/mycode.py
ERROR:
17/11/10 10:00:34 INFO SparkSqlParser: Parsing command: mytable
Traceback (most recent call last):
  File "/path to/mycode.py", line 36, in <module>
    from mytable""")
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/sql/context.py", line 73, in __init__
AttributeError: 'str' object has no attribute '_jsc'
17/11/10 10:00:34 INFO SparkContext: Invoking stop() from shutdown hook
17/11/10 10:00:34 INFO SparkUI: Stopped Spark web UI at ........
I'm quite new to Spark, so can someone please point out the mistake(s) I'm making? Also, any feedback on improving my coding style would be appreciated!
Spark version: 2.2
Upvotes: 0
Views: 336
Reputation: 2708
You are calling the imported SQLContext class (aliased as sql) to query your temp table, instead of spark.sql on the initialized SparkSession. Calling the class constructs a new SQLContext, with your query string taken as its sparkContext argument, which is exactly why __init__ fails with 'str' object has no attribute '_jsc'. I also changed some of your imports and code.
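To make the failure mode concrete, here is a minimal sketch (assuming nothing beyond a stock PySpark 2.x install):

from pyspark.sql import SQLContext as sql

# "calling" the class constructs a new SQLContext; the query string is
# bound to the sparkContext parameter, whose _jsc attribute __init__ reads
sql("select 1")  # AttributeError: 'str' object has no attribute '_jsc'

Here is the revised code: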
from __future__ import print_function
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
if __name__ == "__main__":
    # move the initializations within the main
    conf = SparkConf().setAppName("myapp")
    # create the session
    spark = SparkSession.builder.config(conf=conf) \
        .getOrCreate()
    # load your data and do what you need to do
    # inp_data = loaded data from db
    df = inp_data.select('Id', 'DueDate', 'Principal', 'delay', 'unpaid_emi', 'future_payment')
    filterd_unpaid_emi = df.filter(df.unpaid_emi == 1)
    par = filterd_unpaid_emi.groupBy('Id').sum('Principal').withColumnRenamed('sum(Principal)', 'par')
    temp_df = df.filter(df.unpaid_emi == 1)
    temp_df_1 = temp_df.filter(temp_df.future_payment == 0)
    temp_df_1.registerTempTable("mytable")
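    # note: registerTempTable is deprecated since Spark 2.0;
    # temp_df_1.createOrReplaceTempView("mytable") is the current equivalent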
    # use spark.sql to query your table
    bucket_df_1 = spark.sql("""select *, case
                                   when delay<0 and delay ==0 then '9999'
                                   when delay>0 and delay<7 then '9'
                                   when delay>=7 and delay<=14 then '8'
                                   when delay>=15 and delay<=29 then '7'
                                   when delay>=30 and delay<=59 then '6'
                                   when delay>=60 and delay<=89 then '5'
                                   when delay>=90 and delay<=119 then '4'
                                   when delay>=120 and delay<=149 then '3'
                                   when delay>=150 and delay<=179 then '2'
                                   else '1'
                               end as bucket
                               from mytable""")
    bucket_df_1 = bucket_df_1.select(bucket_df_1.Id, bucket_df_1.Principal, bucket_df_1.delay,
                                     bucket_df_1.unpaid_emi, bucket_df_1.future_payment,
                                     bucket_df_1.bucket.cast("int").alias('buckets'))
    min_bucket = bucket_df_1.groupBy('Id').min('buckets').withColumnRenamed('min(buckets)', 'max_delay')
    joinedDf = par.join(min_bucket, ["Id"])
    # joinedDf.printSchema()
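Since you asked for style feedback: the first case branch (when delay<0 and delay ==0) can never match, because delay cannot be both negative and zero, so you probably meant delay<=0. You could also skip the temp table and the string-to-int cast entirely with the DataFrame API. A sketch, assuming the same column names and that delay<=0 was the intended first branch:

from pyspark.sql import functions as F

bucket_df_1 = temp_df_1.withColumn(
    "buckets",
    F.when(F.col("delay") <= 0, 9999)   # assumed intent of the first branch
     .when(F.col("delay") < 7, 9)       # reached only for 0 < delay < 7
     .when(F.col("delay") <= 14, 8)
     .when(F.col("delay") <= 29, 7)
     .when(F.col("delay") <= 59, 6)
     .when(F.col("delay") <= 89, 5)
     .when(F.col("delay") <= 119, 4)
     .when(F.col("delay") <= 149, 3)
     .when(F.col("delay") <= 179, 2)
     .otherwise(1))                     # delay >= 180, like your else '1'

Each chained when only fires if the earlier ones did not, so the lower bounds from the SQL version are implicit.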
Hope this helps, good luck!
Upvotes: 1