Gesang Wibawono

Reputation: 1

Connect spark session on Jupyter

docker-compose.yml

services:
  namenode:
    image: bde2020/hadoop-namenode:2.0.0-hadoop3.2.1-java8
    container_name: namenode
    restart: always
    ports:
      - 9870:9870
      - 9000:9000
    environment:
      - CLUSTER_NAME=hadoop_cluster
      - CORE_CONF_fs_defaultFS=hdfs://namenode:9000
    volumes:
      - hadoop_namenode:/hadoop/dfs/name
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:9870"]
      interval: 30s
      timeout: 10s
      retries: 3

  datanode:
    image: bde2020/hadoop-datanode:2.0.0-hadoop3.2.1-java8
    container_name: datanode
    restart: always
    ports:
      - 9864:9864
    environment:
      - CORE_CONF_fs_defaultFS=hdfs://namenode:9000
    volumes:
      - hadoop_datanode:/hadoop/dfs/data
    depends_on:
      - namenode

  resourcemanager:
    image: bde2020/hadoop-resourcemanager:2.0.0-hadoop3.2.1-java8
    container_name: resourcemanager
    restart: always
    ports:
      - 8088:8088
    environment:
      - CORE_CONF_fs_defaultFS=hdfs://namenode:9000
      - YARN_CONF_yarn_resourcemanager_hostname=resourcemanager
    depends_on:
      - namenode

  nodemanager:
    image: bde2020/hadoop-nodemanager:2.0.0-hadoop3.2.1-java8
    container_name: nodemanager
    restart: always
    ports:
      - 8042:8042
    environment:
      - CORE_CONF_fs_defaultFS=hdfs://namenode:9000
      - YARN_CONF_yarn_resourcemanager_hostname=resourcemanager
      - YARN_CONF_yarn_nodemanager_aux___services=mapreduce_shuffle
    depends_on:
      - resourcemanager

  spark-master:
    image: bitnami/spark:3.5
    container_name: spark-master
    environment:
      - SPARK_MODE=master
      - SPARK_RPC_AUTHENTICATION_ENABLED=no
      - SPARK_RPC_ENCRYPTION_ENABLED=no
      - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
      - SPARK_SSL_ENABLED=no
    ports:
      - "8080:8080"
      - "7077:7077"
    volumes:
      - spark_data:/bitnami/spark

  spark-worker:
    image: bitnami/spark:3.5
    environment:
      - SPARK_MODE=worker
      - SPARK_MASTER_URL=spark://spark-master:7077
      - SPARK_WORKER_MEMORY=1G
      - SPARK_WORKER_CORES=1
      - SPARK_RPC_AUTHENTICATION_ENABLED=no
      - SPARK_RPC_ENCRYPTION_ENABLED=no
      - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
      - SPARK_SSL_ENABLED=no
    # ports:
    #   - "8081-8084:8081"
    volumes:
      - spark_data:/bitnami/spark
    depends_on:
      - spark-master
    deploy:
      replicas: 2

  jupyter:
    image: jupyter/pyspark-notebook:latest
    container_name: jupyter
    ports:
      - "8888:8888"
    environment:
      - JUPYTER_ENABLE_LAB=yes
      - SPARK_MASTER=spark://spark-master:7077
    volumes:
      - jupyter_data:/home/jovyan/work
      - ./notebooks:/home/jovyan/work/notebooks
    depends_on:
      - spark-master

volumes:
  hadoop_namenode:
  hadoop_datanode:
  spark_data:
  jupyter_data:

It works fine when I create a Spark DataFrame, write it to HDFS, and read it back from HDFS via spark-shell.

First of all, I create a test directory in HDFS and set its permissions to 777:

hdfs commands

hdfs dfs -mkdir -p /test/employees
hdfs dfs -chmod -R 777 /test

spark-shell

// First, create a dummy dataframe
val data = Seq(
  (1, "John", 30, "New York"),
  (2, "Alice", 25, "San Francisco"),
  (3, "Bob", 35, "Chicago"),
  (4, "Carol", 28, "Boston"),
  (5, "David", 40, "Seattle")
)

// Define the schema
val columns = Seq("id", "name", "age", "city")

// Create the DataFrame
val df = spark.createDataFrame(data).toDF(columns: _*)

// Show the DataFrame
df.show()

// Write to HDFS
df.write
  .mode("overwrite")
  .parquet("hdfs://namenode:9000/test/employees")

// Read back from HDFS
val dfRead = spark.read
  .parquet("hdfs://namenode:9000/test/employees")

// Show the read DataFrame
println("\nData read back from HDFS:")
dfRead.show()

// Perform some basic analysis
println("\nBasic statistics:")
dfRead.describe("age").show()

println("\nCount by city:")
dfRead.groupBy("city").count().show()

But when I try to do the same thing in Jupyter, it cannot create the Spark DataFrame, even though the Spark session connects successfully.

Install the pyspark package with pip install pyspark
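
I assume the pip-installed pyspark version has to match the Spark 3.5 cluster from the bitnami images, so a quick sanity check from the notebook would look roughly like this (just my assumption, not part of my original setup):

import pyspark

# Driver-side pyspark version; it should match the cluster's Spark 3.5.x
# (bitnami/spark:3.5). A mismatch can make jobs fail even though the
# session itself connects fine.
print(pyspark.__version__)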

jupyter notebook

from pyspark.sql import SparkSession

# Create Spark session with proper configuration
spark = SparkSession.builder \
    .appName("JupyterTest") \
    .master("spark://spark-master:7077") \
    .config("spark.driver.host", "jupyter") \
    .config("spark.hadoop.fs.defaultFS", "hdfs://namenode:9000") \
    .getOrCreate()

# Create test DataFrame
data = [("John", 30), ("Alice", 25), ("Bob", 35)]
df = spark.createDataFrame(data, ["name", "age"])

# Show DataFrame
print("Original DataFrame:")
df.show()

# Write to HDFS
df.write.mode("overwrite").parquet("hdfs://namenode:9000/test/people")

# Read back from HDFS
df_read = spark.read.parquet("hdfs://namenode:9000/test/people")
print("\nData read from HDFS:")
df_read.show()

Is something missing in my Spark session config?
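
For example, I am not sure whether the driver running inside the jupyter container also needs explicit network settings so the workers can reach back to it; something along these lines is what I mean (just a guess, these extra options are not in my current setup):

from pyspark.sql import SparkSession

# Just a guess: pin the driver's bind address and RPC/block-manager ports so
# the Spark workers can connect back to the driver inside the jupyter container.
spark = SparkSession.builder \
    .appName("JupyterTest") \
    .master("spark://spark-master:7077") \
    .config("spark.driver.host", "jupyter") \
    .config("spark.driver.bindAddress", "0.0.0.0") \
    .config("spark.driver.port", "7078") \
    .config("spark.blockManager.port", "7079") \
    .config("spark.hadoop.fs.defaultFS", "hdfs://namenode:9000") \
    .getOrCreate()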

Upvotes: 0

Views: 24

Answers (0)
