Reputation: 127
I created the Dockerfile as follows:
# Copyright (c) Jupyter Development Team.
# Distributed under the terms of the Modified BSD License
ARG BASE_CONTAINER=jupyter/scipy-notebook
FROM $BASE_CONTAINER
LABEL maintainer="Jupyter Project <[email protected]>"
USER root
# Spark dependencies
ENV SPARK_VERSION 2.3.2
ENV SPARK_HADOOP_PROFILE 2.7
ENV SPARK_SRC_URL https://www.apache.org/dist/spark/spark-$SPARK_VERSION/spark-${SPARK_VERSION}-bin-hadoop${SPARK_HADOOP_PROFILE}.tgz
ENV SPARK_HOME=/opt/spark
ENV PATH $PATH:$SPARK_HOME/bin
RUN apt-get update && \
apt-get install -y openjdk-8-jdk-headless \
postgresql && \
rm -rf /var/lib/apt/lists/*
ENV JAVA_HOME /usr/lib/jvm/java-8-openjdk-amd64/
ENV PATH $PATH:$JAVA_HOME/bin
RUN wget ${SPARK_SRC_URL}
RUN tar -xzf spark-${SPARK_VERSION}-bin-hadoop${SPARK_HADOOP_PROFILE}.tgz
RUN mv spark-${SPARK_VERSION}-bin-hadoop${SPARK_HADOOP_PROFILE} /opt/spark
RUN rm -f spark-${SPARK_VERSION}-bin-hadoop${SPARK_HADOOP_PROFILE}.tgz
USER $NB_UID
ENV POST_URL https://jdbc.postgresql.org/download/postgresql-42.2.5.jar
RUN wget ${POST_URL}
RUN mv postgresql-42.2.5.jar $SPARK_HOME/jars
# Install pyarrow
RUN conda install --quiet -y 'pyarrow' && \
conda install pyspark==2.3.2 && \
conda clean -tipsy && \
fix-permissions $CONDA_DIR && \
fix-permissions /home/$NB_USER
WORKDIR $SPARK_HOME
Then I built the my_notebook image with: docker build -t my_notebook .
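After the build, a quick sanity check that Spark and Java are on the PATH (optional; these commands just print versions and exit):
docker run --rm my_notebook spark-submit --version
docker run --rm my_notebook java -version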
Then I created three containers, Master, Worker, and Notebook, as follows:
Master, using this docker-compose file:
master:
  image: my_notebook
  command: bin/spark-class org.apache.spark.deploy.master.Master -h master
  hostname: master
  environment:
    MASTER: spark://master:7077
    SPARK_CONF_DIR: /conf
    SPARK_PUBLIC_DNS: 192.168.XXX.XXX
  expose:
    - 7001
    - 7002
    - 7003
    - 7004
    - 7005
    - 7077
    - 6066
  ports:
    - 4040:4040
    - 6066:6066
    - 7077:7077
    - 8080:8080
  volumes:
    - ./conf/master:/conf
    - ./data:/tmp/data
Worker, using this docker-compose file:
worker:
  image: my_notebook
  command: bin/spark-class org.apache.spark.deploy.worker.Worker spark://192.168.XXX.XXX:7077
  hostname: worker
  environment:
    SPARK_CONF_DIR: /conf
    SPARK_WORKER_CORES: 4
    SPARK_WORKER_MEMORY: 4g
    SPARK_WORKER_PORT: 8881
    SPARK_WORKER_WEBUI_PORT: 8081
    SPARK_BLOCKMGR_PORT: 5003
    SPARK_PUBLIC_DNS: localhost
  expose:
    - 7012
    - 7013
    - 7014
    - 7015
    - 8881
    - 5001
    - 5003
  ports:
    - 8081:8081
  volumes:
    - ./conf/worker:/conf
    - ./data:/tmp/data
Notebook, using this docker-compose file:
notebook:
  image: my_notebook
  command: jupyter notebook
  hostname: notebook
  environment:
    SPARK_PUBLIC_DNS: 192.168.XXX.XXX
  expose:
    - 7012
    - 7013
    - 7014
    - 7015
    - 8881
    - 8888
  ports:
    - 8888:8888
First I started the Master container on one machine with docker-compose up, then started the Worker on another machine with docker-compose up, and then started the Notebook on yet another machine with docker-compose up.
The Spark cluster was set up: the Spark UI was accessible, the workers were registered in the cluster, and the Jupyter notebook also started successfully. But when I run a PySpark application through Jupyter, the worker executors fail to connect back to the Spark driver. The error logs are:
Spark Executor Command: "/usr/lib/jvm/java-8-openjdk-amd64//bin/java" "-cp" "/conf/:/opt/spark/jars/*" "-Xmx1024M" "-Dspark.driver.port=35147" "org.apache.spark.executor.CoarseGrainedExecutorBackend" "--driver-url" "spark://CoarseGrainedScheduler@notebook:35147" "--executor-id" "31" "--hostname" "172.17.0.3" "--cores" "2" "--app-id" "app-20190101134023-0001" "--worker-url" "spark://[email protected]:8881"
========================================
Exception in thread "main" java.lang.reflect.UndeclaredThrowableException
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1713)
at org.apache.spark.deploy.SparkHadoopUtil.runAsSparkUser(SparkHadoopUtil.scala:63)
at org.apache.spark.executor.CoarseGrainedExecutorBackend$.run(CoarseGrainedExecutorBackend.scala:188)
at org.apache.spark.executor.CoarseGrainedExecutorBackend$.main(CoarseGrainedExecutorBackend.scala:293)
at org.apache.spark.executor.CoarseGrainedExecutorBackend.main(CoarseGrainedExecutorBackend.scala)
Caused by: org.apache.spark.SparkException: Exception thrown in awaitResult:
at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:205)
at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:101)
at org.apache.spark.executor.CoarseGrainedExecutorBackend$$anonfun$run$1.apply$mcV$sp(CoarseGrainedExecutorBackend.scala:201)
at org.apache.spark.deploy.SparkHadoopUtil$$anon$2.run(SparkHadoopUtil.scala:64)
at org.apache.spark.deploy.SparkHadoopUtil$$anon$2.run(SparkHadoopUtil.scala:63)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:422)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1698)
... 4 more
Caused by: java.io.IOException: Failed to connect to notebook:35147
at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:245)
at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:187)
at org.apache.spark.rpc.netty.NettyRpcEnv.createClient(NettyRpcEnv.scala:198)
at org.apache.spark.rpc.netty.Outbox$$anon$1.call(Outbox.scala:194)
at org.apache.spark.rpc.netty.Outbox$$anon$1.call(Outbox.scala:190)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.net.UnknownHostException: notebook
at java.net.InetAddress.getAllByName0(InetAddress.java:1281)
at java.net.InetAddress.getAllByName(InetAddress.java:1193)
at java.net.InetAddress.getAllByName(InetAddress.java:1127)
at java.net.InetAddress.getByName(InetAddress.java:1077)
at io.netty.util.internal.SocketUtils$8.run(SocketUtils.java:146)
at io.netty.util.internal.SocketUtils$8.run(SocketUtils.java:143)
at java.security.AccessController.doPrivileged(Native Method)
at io.netty.util.internal.SocketUtils.addressByName(SocketUtils.java:143)
at io.netty.resolver.DefaultNameResolver.doResolve(DefaultNameResolver.java:43)
at io.netty.resolver.SimpleNameResolver.resolve(SimpleNameResolver.java:63)
at io.netty.resolver.SimpleNameResolver.resolve(SimpleNameResolver.java:55)
at io.netty.resolver.InetSocketAddressResolver.doResolve(InetSocketAddressResolver.java:57)
at io.netty.resolver.InetSocketAddressResolver.doResolve(InetSocketAddressResolver.java:32)
at io.netty.resolver.AbstractAddressResolver.resolve(AbstractAddressResolver.java:108)
at io.netty.bootstrap.Bootstrap.doResolveAndConnect0(Bootstrap.java:208)
at io.netty.bootstrap.Bootstrap.access$000(Bootstrap.java:49)
at io.netty.bootstrap.Bootstrap$1.operationComplete(Bootstrap.java:188)
at io.netty.bootstrap.Bootstrap$1.operationComplete(Bootstrap.java:174)
at io.netty.util.concurrent.DefaultPromise.notifyListener0(DefaultPromise.java:507)
at io.netty.util.concurrent.DefaultPromise.notifyListenersNow(DefaultPromise.java:481)
at io.netty.util.concurrent.DefaultPromise.notifyListeners(DefaultPromise.java:420)
at io.netty.util.concurrent.DefaultPromise.trySuccess(DefaultPromise.java:104)
at io.netty.channel.DefaultChannelPromise.trySuccess(DefaultChannelPromise.java:82)
at io.netty.channel.AbstractChannel$AbstractUnsafe.safeSetSuccess(AbstractChannel.java:978)
at io.netty.channel.AbstractChannel$AbstractUnsafe.register0(AbstractChannel.java:512)
at io.netty.channel.AbstractChannel$AbstractUnsafe.access$200(AbstractChannel.java:423)
at io.netty.channel.AbstractChannel$AbstractUnsafe$1.run(AbstractChannel.java:482)
at io.netty.util.concurrent.AbstractEventExecutor.safeExecute(AbstractEventExecutor.java:163)
at io.netty.util.concurrent.SingleThreadEventExecutor.runAllTasks(SingleThreadEventExecutor.java:403)
at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:463)
at io.netty.util.concurrent.SingleThreadEventExecutor$5.run(SingleThreadEventExecutor.java:858)
at io.netty.util.concurrent.DefaultThreadFactory$DefaultRunnableDecorator.run(DefaultThreadFactory.java:138)
... 1 more
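For reference, the Spark session in the notebook is created along these lines (a minimal sketch; the app name is a placeholder):
from pyspark.sql import SparkSession

# connect to the standalone master from the compose files above
spark = (SparkSession.builder
    .appName("test-app")
    .master("spark://192.168.XXX.XXX:7077")
    .getOrCreate())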
Can anyone help me?
Upvotes: 1
Views: 2068
Reputation: 71
In order for a URL like notebook:35147 to work, the containers have to be in the same network. In your case, you are launching the containers on different machines, so that network has to be an overlay network.
The best solution is to use docker swarm and docker stack instead of docker-compose, but for the sake of not encumbering you with new things, let's just stick to compose for now.
First, create such a network. We need a little help from swarm mode:
On one machine (the manager), run docker swarm init and then docker network create --driver=overlay --attachable my-network.
On the other machines, run docker swarm join with the token you got on the manager.
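Concretely, the sequence is roughly this (a sketch; <token> is printed by your own swarm init, and the manager address is taken from your setup above):
# on the manager machine
docker swarm init --advertise-addr 192.168.XXX.XXX
docker network create --driver=overlay --attachable my-network
# on each of the other machines, use the join command printed by swarm init
docker swarm join --token <token> 192.168.XXX.XXX:2377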
Then modify all your compose files to have this at the end:
networks:
  my-network:
    external: true
and add this to the services in question:
networks:
  - my-network
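Put together for the notebook service, it would look something like this (a sketch based on your compose file above; the top-level networks key requires compose file format 2 or later, hence the version and services keys):
version: "3"
services:
  notebook:
    image: my_notebook
    command: jupyter notebook
    hostname: notebook
    ports:
      - 8888:8888
    networks:
      - my-network
networks:
  my-network:
    external: true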
Upvotes: 2