docker-compose.yml
services:
  namenode:
    image: bde2020/hadoop-namenode:2.0.0-hadoop3.2.1-java8
    container_name: namenode
    restart: always
    ports:
      - 9870:9870
      - 9000:9000
    environment:
      - CLUSTER_NAME=hadoop_cluster
      - CORE_CONF_fs_defaultFS=hdfs://namenode:9000
    volumes:
      - hadoop_namenode:/hadoop/dfs/name
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:9870"]
      interval: 30s
      timeout: 10s
      retries: 3

  datanode:
    image: bde2020/hadoop-datanode:2.0.0-hadoop3.2.1-java8
    container_name: datanode
    restart: always
    ports:
      - 9864:9864
    environment:
      - CORE_CONF_fs_defaultFS=hdfs://namenode:9000
    volumes:
      - hadoop_datanode:/hadoop/dfs/data
    depends_on:
      - namenode

  resourcemanager:
    image: bde2020/hadoop-resourcemanager:2.0.0-hadoop3.2.1-java8
    container_name: resourcemanager
    restart: always
    ports:
      - 8088:8088
    environment:
      - CORE_CONF_fs_defaultFS=hdfs://namenode:9000
      - YARN_CONF_yarn_resourcemanager_hostname=resourcemanager
    depends_on:
      - namenode

  nodemanager:
    image: bde2020/hadoop-nodemanager:2.0.0-hadoop3.2.1-java8
    container_name: nodemanager
    restart: always
    ports:
      - 8042:8042
    environment:
      - CORE_CONF_fs_defaultFS=hdfs://namenode:9000
      - YARN_CONF_yarn_resourcemanager_hostname=resourcemanager
      - YARN_CONF_yarn_nodemanager_aux___services=mapreduce_shuffle
    depends_on:
      - resourcemanager

  spark-master:
    image: bitnami/spark:3.5
    container_name: spark-master
    environment:
      - SPARK_MODE=master
      - SPARK_RPC_AUTHENTICATION_ENABLED=no
      - SPARK_RPC_ENCRYPTION_ENABLED=no
      - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
      - SPARK_SSL_ENABLED=no
    ports:
      - "8080:8080"
      - "7077:7077"
    volumes:
      - spark_data:/bitnami/spark

  spark-worker:
    image: bitnami/spark:3.5
    environment:
      - SPARK_MODE=worker
      - SPARK_MASTER_URL=spark://spark-master:7077
      - SPARK_WORKER_MEMORY=1G
      - SPARK_WORKER_CORES=1
      - SPARK_RPC_AUTHENTICATION_ENABLED=no
      - SPARK_RPC_ENCRYPTION_ENABLED=no
      - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
      - SPARK_SSL_ENABLED=no
    # ports:
    #   - "8081-8084:8081"
    volumes:
      - spark_data:/bitnami/spark
    depends_on:
      - spark-master
    deploy:
      replicas: 2

  jupyter:
    image: jupyter/pyspark-notebook:latest
    container_name: jupyter
    ports:
      - "8888:8888"
    environment:
      - JUPYTER_ENABLE_LAB=yes
      - SPARK_MASTER=spark://spark-master:7077
    volumes:
      - jupyter_data:/home/jovyan/work
      - ./notebooks:/home/jovyan/work/notebooks
    depends_on:
      - spark-master

volumes:
  hadoop_namenode:
  hadoop_datanode:
  spark_data:
  jupyter_data:
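For reference, the stack can be brought up and checked roughly like this (a sketch; the service names and ports come from the compose file above):

# Start the whole stack in the background
docker compose up -d

# Confirm all containers are running (namenode, datanode, resourcemanager,
# nodemanager, spark-master, two spark-worker replicas, jupyter)
docker compose ps

# Web UIs exposed by the compose file:
#   NameNode        http://localhost:9870
#   ResourceManager http://localhost:8088
#   Spark master    http://localhost:8080
#   Jupyter         http://localhost:8888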
Everything works fine when I create a Spark DataFrame, write it to HDFS, and read it back via spark-shell.
First of all, I create a test directory in HDFS and set its permissions to 777.
hdfs commands
hdfs dfs -mkdir -p /test/employees
hdfs dfs -chmod -R 777 /test
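To confirm the directory and permissions took effect, something like this can be run against the NameNode container (a sketch; the container name is taken from the compose file):

# Open a shell inside the NameNode container, where the hdfs CLI is available
docker exec -it namenode bash

# /test should show up as drwxrwxrwx
hdfs dfs -ls /
hdfs dfs -ls /test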
spark-shell
// First, create a dummy DataFrame
val data = Seq(
  (1, "John", 30, "New York"),
  (2, "Alice", 25, "San Francisco"),
  (3, "Bob", 35, "Chicago"),
  (4, "Carol", 28, "Boston"),
  (5, "David", 40, "Seattle")
)

// Define the column names
val columns = Seq("id", "name", "age", "city")

// Create the DataFrame
val df = spark.createDataFrame(data).toDF(columns: _*)

// Show the DataFrame
df.show()

// Write to HDFS
df.write
  .mode("overwrite")
  .parquet("hdfs://namenode:9000/test/employees")

// Read back from HDFS
val dfRead = spark.read
  .parquet("hdfs://namenode:9000/test/employees")

// Show the read DataFrame
println("\nData read back from HDFS:")
dfRead.show()

// Perform some basic analysis
println("\nBasic statistics:")
dfRead.describe("age").show()

println("\nCount by city:")
dfRead.groupBy("city").count().show()
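For context, the spark-shell session above was presumably started against the standalone master from inside the spark-master container, along these lines (an assumption, not stated above):

# Attach to the Spark master container and launch spark-shell against the cluster
docker exec -it spark-master spark-shell --master spark://spark-master:7077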
But when I try to do the same thing in Jupyter, it cannot create a Spark DataFrame, even though the Spark session connects successfully.
Install the pyspark package
pip install pyspark
jupyter notebook
from pyspark.sql import SparkSession

# Create Spark session with proper configuration
spark = SparkSession.builder \
    .appName("JupyterTest") \
    .master("spark://spark-master:7077") \
    .config("spark.driver.host", "jupyter") \
    .config("spark.hadoop.fs.defaultFS", "hdfs://namenode:9000") \
    .getOrCreate()

# Create test DataFrame
data = [("John", 30), ("Alice", 25), ("Bob", 35)]
df = spark.createDataFrame(data, ["name", "age"])

# Show DataFrame
print("Original DataFrame:")
df.show()

# Write to HDFS
df.write.mode("overwrite").parquet("hdfs://namenode:9000/test/people")

# Read back from HDFS
df_read = spark.read.parquet("hdfs://namenode:9000/test/people")
print("\nData read from HDFS:")
df_read.show()
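One diagnostic that might help narrow this down (a sketch, not part of the original notebook): the pip-installed pyspark on the driver has to match the Spark 3.5 running on the bitnami workers, and the executors have to be able to connect back to the jupyter container. The following cell checks both.

# Sanity checks inside the notebook (diagnostic only, not a fix)
import pyspark
print("PySpark on the driver:", pyspark.__version__)   # should be 3.5.x to match bitnami/spark:3.5
print("Spark session version:", spark.version)
print("Master:", spark.sparkContext.master)

# A trivial job that forces work onto the executors; if this hangs or fails,
# the workers cannot reach the driver (spark.driver.host / networking issue)
print(spark.sparkContext.parallelize(range(10)).sum())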
Is something missing in my Spark session config?