Reputation: 569
I am trying to extract data from a BigQuery table using the Apache Spark BigQuery Storage connector in a Jupyter notebook. I need to extract this data and write it to a GCS bucket with some partitioning. I have created a Dataproc cluster in GCP and am running the Jupyter notebook on it. Below is my code.
Based on the Scala version, I am using the matching spark-bigquery-connector jar from the two below:
Scala version 2.11 - 'gs://spark-lib/bigquery/spark-bigquery-latest.jar'.
Scala version 2.12 - 'gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar'.
!scala -version
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName('1.2. BigQuery Storage & Spark SQL - Python') \
    .config('spark.jars', 'gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar') \
    .config('spark.jars.packages', 'com.google.cloud.spark:spark-bigquery-with-dependencies_2.12:0.15.1-beta') \
    .config("viewsEnabled", "true") \
    .getOrCreate()

# Enable eager evaluation so a bare DataFrame expression renders in the notebook.
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)
Here, printSchema() works fine for the table I am looking at, but for some reason evaluating df_wiki_pageviews fails. I don't need to call df.show() since I have enabled repl.eagerEval.
table = "table_name"

df_wiki_pageviews = spark.read \
    .format("bigquery") \
    .option("table", table) \
    .load()

df_wiki_pageviews.printSchema()
# df_wiki_pageviews.write.format("parquet").save("gs://rdl-stage-raw/store_dim")
df_wiki_pageviews
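Eventually I want a partitioned write to GCS along these lines (a rough sketch; date_col stands in for whichever column I end up partitioning on):

# Intended output step (sketch): partitioned parquet write to GCS.
# "date_col" is a placeholder for the actual partition column.
df_wiki_pageviews.write \
    .partitionBy("date_col") \
    .mode("overwrite") \
    .format("parquet") \
    .save("gs://rdl-stage-raw/store_dim")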
The error I am getting is below. Can someone please help me figure out what's going wrong?
I am using the Python 3 kernel (not PySpark) to configure the SparkSession in the notebook and to include the spark-bigquery-connector required to use the BigQuery Storage API.
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
/opt/conda/miniconda3/lib/python3.8/site-packages/IPython/core/formatters.py in __call__(self, obj)
700 type_pprinters=self.type_printers,
701 deferred_pprinters=self.deferred_printers)
--> 702 printer.pretty(obj)
703 printer.flush()
704 return stream.getvalue()
/opt/conda/miniconda3/lib/python3.8/site-packages/IPython/lib/pretty.py in pretty(self, obj)
392 if cls is not object \
393 and callable(cls.__dict__.get('__repr__')):
--> 394 return _repr_pprint(obj, self, cycle)
395
396 return _default_pprint(obj, self, cycle)
/opt/conda/miniconda3/lib/python3.8/site-packages/IPython/lib/pretty.py in _repr_pprint(obj, p, cycle)
698 """A pprint that just redirects to the normal repr function."""
699 # Find newlines and replace them with p.break_()
--> 700 output = repr(obj)
701 lines = output.splitlines()
702 with p.group():
/usr/lib/spark/python/pyspark/sql/dataframe.py in __repr__(self)
489 if not self._support_repr_html and self.sql_ctx._conf.isReplEagerEvalEnabled():
490 vertical = False
--> 491 return self._jdf.showString(
492 self.sql_ctx._conf.replEagerEvalMaxNumRows(),
493 self.sql_ctx._conf.replEagerEvalTruncate(), vertical)
/opt/conda/miniconda3/lib/python3.8/site-packages/py4j/java_gateway.py in __call__(self, *args)
1302
1303 answer = self.gateway_client.send_command(command)
-> 1304 return_value = get_return_value(
1305 answer, self.gateway_client, self.target_id, self.name)
1306
/usr/lib/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
109 def deco(*a, **kw):
110 try:
--> 111 return f(*a, **kw)
112 except py4j.protocol.Py4JJavaError as e:
113 converted = convert_exception(e.java_exception)
/opt/conda/miniconda3/lib/python3.8/site-packages/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
324 value = OUTPUT_CONVERTER[type](answer[2:], gateway_client)
325 if answer[1] == REFERENCE_TYPE:
--> 326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
328 format(target_id, ".", name), value)
Py4JJavaError: An error occurred while calling o242.showString.
: java.lang.NoSuchMethodError: com.google.cloud.spark.bigquery.repackaged.io.grpc.internal.AbstractManagedChannelImplBuilder.<init>(Ljava/lang/String;)V
at com.google.cloud.spark.bigquery.repackaged.io.grpc.netty.shaded.io.grpc.netty.NettyChannelBuilder.<init>(NettyChannelBuilder.java:136)
at com.google.cloud.spark.bigquery.repackaged.io.grpc.netty.shaded.io.grpc.netty.NettyChannelBuilder.<init>(NettyChannelBuilder.java:131)
at com.google.cloud.spark.bigquery.repackaged.io.grpc.netty.shaded.io.grpc.netty.NettyChannelBuilder.forAddress(NettyChannelBuilder.java:117)
at com.google.cloud.spark.bigquery.repackaged.io.grpc.netty.shaded.io.grpc.netty.NettyChannelProvider.builderForAddress(NettyChannelProvider.java:37)
at com.google.cloud.spark.bigquery.repackaged.io.grpc.netty.shaded.io.grpc.netty.NettyChannelProvider.builderForAddress(NettyChannelProvider.java:23)
at com.google.cloud.spark.bigquery.repackaged.io.grpc.ManagedChannelBuilder.forAddress(ManagedChannelBuilder.java:39)
at com.google.cloud.spark.bigquery.repackaged.com.google.api.gax.grpc.InstantiatingGrpcChannelProvider.createSingleChannel(InstantiatingGrpcChannelProvider.java:348)
at com.google.cloud.spark.bigquery.repackaged.com.google.api.gax.grpc.InstantiatingGrpcChannelProvider.access$1900(InstantiatingGrpcChannelProvider.java:82)
at com.google.cloud.spark.bigquery.repackaged.com.google.api.gax.grpc.InstantiatingGrpcChannelProvider$1.createSingleChannel(InstantiatingGrpcChannelProvider.java:239)
at com.google.cloud.spark.bigquery.repackaged.com.google.api.gax.grpc.ChannelPool.create(ChannelPool.java:72)
at com.google.cloud.spark.bigquery.repackaged.com.google.api.gax.grpc.InstantiatingGrpcChannelProvider.createChannel(InstantiatingGrpcChannelProvider.java:249)
at com.google.cloud.spark.bigquery.repackaged.com.google.api.gax.grpc.InstantiatingGrpcChannelProvider.getTransportChannel(InstantiatingGrpcChannelProvider.java:227)
at com.google.cloud.spark.bigquery.repackaged.com.google.api.gax.rpc.ClientContext.create(ClientContext.java:205)
at com.google.cloud.spark.bigquery.repackaged.com.google.cloud.bigquery.storage.v1.stub.EnhancedBigQueryReadStub.create(EnhancedBigQueryReadStub.java:98)
at com.google.cloud.spark.bigquery.repackaged.com.google.cloud.bigquery.storage.v1.BigQueryReadClient.<init>(BigQueryReadClient.java:130)
at com.google.cloud.spark.bigquery.repackaged.com.google.cloud.bigquery.storage.v1.BigQueryReadClient.create(BigQueryReadClient.java:110)
at com.google.cloud.spark.bigquery.direct.DirectBigQueryRelation$.createReadClient(DirectBigQueryRelation.scala:381)
at com.google.cloud.spark.bigquery.direct.DirectBigQueryRelation$.$anonfun$$lessinit$greater$default$3$1(DirectBigQueryRelation.scala:43)
at com.google.cloud.spark.bigquery.direct.DirectBigQueryRelation.buildScan(DirectBigQueryRelation.scala:126)
at org.apache.spark.sql.execution.datasources.DataSourceStrategy$.$anonfun$apply$4(DataSourceStrategy.scala:332)
at org.apache.spark.sql.execution.datasources.DataSourceStrategy$.$anonfun$pruneFilterProject$1(DataSourceStrategy.scala:365)
at org.apache.spark.sql.execution.datasources.DataSourceStrategy$.pruneFilterProjectRaw(DataSourceStrategy.scala:442)
at org.apache.spark.sql.execution.datasources.DataSourceStrategy$.pruneFilterProject(DataSourceStrategy.scala:364)
at org.apache.spark.sql.execution.datasources.DataSourceStrategy$.apply(DataSourceStrategy.scala:332)
at org.apache.spark.sql.catalyst.planning.QueryPlanner.$anonfun$plan$1(QueryPlanner.scala:63)
at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:486)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:492)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:491)
at org.apache.spark.sql.catalyst.planning.QueryPlanner.plan(QueryPlanner.scala:93)
at org.apache.spark.sql.execution.SparkStrategies.plan(SparkStrategies.scala:67)
at org.apache.spark.sql.catalyst.planning.QueryPlanner.$anonfun$plan$3(QueryPlanner.scala:78)
at scala.collection.TraversableOnce$folder$1.apply(TraversableOnce.scala:196)
at scala.collection.TraversableOnce$folder$1.apply(TraversableOnce.scala:194)
at scala.collection.Iterator.foreach(Iterator.scala:943)
at scala.collection.Iterator.foreach$(Iterator.scala:943)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
at scala.collection.TraversableOnce.foldLeft(TraversableOnce.scala:199)
at scala.collection.TraversableOnce.foldLeft$(TraversableOnce.scala:192)
at scala.collection.AbstractIterator.foldLeft(Iterator.scala:1431)
at org.apache.spark.sql.catalyst.planning.QueryPlanner.$anonfun$plan$2(QueryPlanner.scala:75)
at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:486)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:492)
at org.apache.spark.sql.catalyst.planning.QueryPlanner.plan(QueryPlanner.scala:93)
at org.apache.spark.sql.execution.SparkStrategies.plan(SparkStrategies.scala:67)
at org.apache.spark.sql.execution.QueryExecution$.createSparkPlan(QueryExecution.scala:391)
at org.apache.spark.sql.execution.QueryExecution.$anonfun$sparkPlan$1(QueryExecution.scala:104)
at org.apache.spark.sql.catalyst.QueryPlanningTracker.measurePhase(QueryPlanningTracker.scala:111)
at org.apache.spark.sql.execution.QueryExecution.$anonfun$executePhase$1(QueryExecution.scala:143)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
at org.apache.spark.sql.execution.QueryExecution.executePhase(QueryExecution.scala:143)
at org.apache.spark.sql.execution.QueryExecution.sparkPlan$lzycompute(QueryExecution.scala:104)
at org.apache.spark.sql.execution.QueryExecution.sparkPlan(QueryExecution.scala:97)
at org.apache.spark.sql.execution.QueryExecution.$anonfun$executedPlan$1(QueryExecution.scala:117)
at org.apache.spark.sql.catalyst.QueryPlanningTracker.measurePhase(QueryPlanningTracker.scala:111)
at org.apache.spark.sql.execution.QueryExecution.$anonfun$executePhase$1(QueryExecution.scala:143)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
at org.apache.spark.sql.execution.QueryExecution.executePhase(QueryExecution.scala:143)
at org.apache.spark.sql.execution.QueryExecution.executedPlan$lzycompute(QueryExecution.scala:117)
at org.apache.spark.sql.execution.QueryExecution.executedPlan(QueryExecution.scala:110)
at org.apache.spark.sql.execution.QueryExecution.$anonfun$simpleString$2(QueryExecution.scala:161)
at org.apache.spark.sql.execution.ExplainUtils$.processPlan(ExplainUtils.scala:115)
at org.apache.spark.sql.execution.QueryExecution.simpleString(QueryExecution.scala:161)
at org.apache.spark.sql.execution.QueryExecution.org$apache$spark$sql$execution$QueryExecution$$explainString(QueryExecution.scala:206)
at org.apache.spark.sql.execution.QueryExecution.explainString(QueryExecution.scala:175)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:98)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:163)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:90)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3685)
at org.apache.spark.sql.Dataset.head(Dataset.scala:2722)
at org.apache.spark.sql.Dataset.take(Dataset.scala:2929)
at org.apache.spark.sql.Dataset.getRows(Dataset.scala:301)
at org.apache.spark.sql.Dataset.showString(Dataset.scala:338)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
/opt/conda/miniconda3/lib/python3.8/site-packages/IPython/core/formatters.py in __call__(self, obj)
343 method = get_real_method(obj, self.print_method)
344 if method is not None:
--> 345 return method()
346 return None
347 else:
/usr/lib/spark/python/pyspark/sql/dataframe.py in _repr_html_(self)
504 if self.sql_ctx._conf.isReplEagerEvalEnabled():
505 max_num_rows = max(self.sql_ctx._conf.replEagerEvalMaxNumRows(), 0)
--> 506 sock_info = self._jdf.getRowsToPython(
507 max_num_rows, self.sql_ctx._conf.replEagerEvalTruncate())
508 rows = list(_load_from_socket(sock_info, BatchedSerializer(PickleSerializer())))
/opt/conda/miniconda3/lib/python3.8/site-packages/py4j/java_gateway.py in __call__(self, *args)
1302
1303 answer = self.gateway_client.send_command(command)
-> 1304 return_value = get_return_value(
1305 answer, self.gateway_client, self.target_id, self.name)
1306
/usr/lib/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
109 def deco(*a, **kw):
110 try:
--> 111 return f(*a, **kw)
112 except py4j.protocol.Py4JJavaError as e:
113 converted = convert_exception(e.java_exception)
/opt/conda/miniconda3/lib/python3.8/site-packages/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
324 value = OUTPUT_CONVERTER[type](answer[2:], gateway_client)
325 if answer[1] == REFERENCE_TYPE:
--> 326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
328 format(target_id, ".", name), value)
Py4JJavaError: An error occurred while calling o242.getRowsToPython.
: java.lang.NoSuchMethodError: com.google.cloud.spark.bigquery.repackaged.io.grpc.internal.AbstractManagedChannelImplBuilder.<init>(Ljava/lang/String;)V
[... Java stack trace identical to the one above, except that the final Spark frame is at org.apache.spark.sql.Dataset.getRowsToPython(Dataset.scala:3539) rather than Dataset.showString(Dataset.scala:338) ...]
Upvotes: 3
Views: 2202
Reputation: 30448
There is no need to configure both spark.jars and spark.jars.packages. The error occurs because you've added two connectors: one in a very old version (0.15) and the recent one (0.23). It's best to stick to a single configuration:

.config('spark.jars', 'gs://spark-lib/bigquery/spark-bigquery-with-dependencies_2.12-0.23.1.jar')
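For example, here is a minimal sketch of the whole flow with just the one connector jar, including the partitioned write to GCS from your question (the table name, bucket path, and the partition column date_col are placeholders to adapt):

from pyspark.sql import SparkSession

# One connector jar only; no competing spark.jars.packages entry.
spark = SparkSession.builder \
    .appName('BigQuery Storage & Spark SQL - Python') \
    .config('spark.jars', 'gs://spark-lib/bigquery/spark-bigquery-with-dependencies_2.12-0.23.1.jar') \
    .config("viewsEnabled", "true") \
    .getOrCreate()

spark.conf.set("spark.sql.repl.eagerEval.enabled", True)

df = spark.read \
    .format("bigquery") \
    .option("table", "table_name") \
    .load()

# Partitioned parquet write to GCS; "date_col" is a placeholder column.
df.write \
    .partitionBy("date_col") \
    .mode("overwrite") \
    .format("parquet") \
    .save("gs://your-bucket/your-prefix")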
Upvotes: 1