From 4113cbb4eeda25f815fb11cc03bc10ca78969d9a Mon Sep 17 00:00:00 2001
From: Guillem Borrell Nogueras
Date: Sun, 19 Feb 2023 08:44:16 +0100
Subject: [PATCH] Add 'The simplest and smallest spark cluster you can build'

---
 ...nd-smallest-spark-cluster-you-can-build.md | 80 +++++++++++++++++++
 1 file changed, 80 insertions(+)
 create mode 100644 The-simplest-and-smallest-spark-cluster-you-can-build.md

diff --git a/The-simplest-and-smallest-spark-cluster-you-can-build.md b/The-simplest-and-smallest-spark-cluster-you-can-build.md
new file mode 100644
index 0000000..36a50dd
--- /dev/null
+++ b/The-simplest-and-smallest-spark-cluster-you-can-build.md
@@ -0,0 +1,80 @@
If you want to get up and running with Spark in almost no time, you can let the Spark context create a local cluster for you. But deploying a minimal standalone cluster is also worth doing: it helps you understand the most common issues of real environments, and it exposes the performance problems you may run into.

Spark is a service, so you have to connect something to it. In my case, that something is usually Jupyter.

The fastest and simplest way I've found to deploy a Spark cluster is Docker Compose.

```
version: "3"

networks:
  jupyterhub:
    external: false

services:
  jupyterhub:
    restart: always
    build:
      context: .
      dockerfile: Dockerfile
    container_name: jupyterhub
    extra_hosts:
      - host.docker.internal:host-gateway
    volumes:
      - "/var/run/docker.sock:/var/run/docker.sock:rw"
      - "./data:/data"
      - "./jupyterhub_config.py:/srv/jupyterhub/jupyterhub_config.py"
    networks:
      - jupyterhub
    ports:
      - "8000:8000"
    environment:
      # Compose prefixes the network name with the project name ("lab" here),
      # and the spawner needs the prefixed name.
      DOCKER_NETWORK_NAME: lab_jupyterhub
    command: >
      jupyterhub -f /srv/jupyterhub/jupyterhub_config.py
  spark:
    image: docker.io/bitnami/spark:3.3
    extra_hosts:
      - host.docker.internal:host-gateway
    environment:
      - SPARK_MODE=master
      - SPARK_RPC_AUTHENTICATION_ENABLED=no
      - SPARK_RPC_ENCRYPTION_ENABLED=no
      - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
      - SPARK_SSL_ENABLED=no
    networks:
      - jupyterhub
    ports:
      - '8080:8080'
  spark-worker:
    image: docker.io/bitnami/spark:3.3
    extra_hosts:
      - host.docker.internal:host-gateway
    deploy:
      replicas: 4
    environment:
      - SPARK_MODE=worker
      - SPARK_MASTER_URL=spark://spark:7077
      - SPARK_WORKER_MEMORY=1G
      - SPARK_WORKER_CORES=1
      - SPARK_RPC_AUTHENTICATION_ENABLED=no
      - SPARK_RPC_ENCRYPTION_ENABLED=no
      - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
      - SPARK_SSL_ENABLED=no
    networks:
      - jupyterhub
```

Note that the only tricky piece is the name of the network that connects the single-user Jupyter servers to the hub: Docker Compose prefixes it with the project name, so the jupyterhub network defined above has to be referenced as lab_jupyterhub. You can see how this is wired into the spawner in the jupyterhub_config.py sketch at the end of this post.

I'm running JupyterLab with the Docker spawner, and I'd recommend you do the same. This means you have to build a container image for the single-user Jupyter server and make sure you use the same version of Spark everywhere.

Multi-stage builds to the rescue!

```
# First stage: only used as a source for the Spark distribution.
FROM docker.io/bitnami/spark:3.3 as spark

FROM jupyter/all-spark-notebook
USER root
# Copy the exact Spark build the cluster runs over the notebook image's
# Spark installation, so driver and workers always match.
COPY --from=spark /opt/bitnami/spark /usr/local/spark
USER jovyan
```
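
For completeness, this is roughly what the network-related part of jupyterhub_config.py looks like. It is a sketch rather than the full file: authentication is left out, the image tag is illustrative, and it assumes the compose project is called "lab" so the prefixed network name matches the DOCKER_NETWORK_NAME variable above.

```
# jupyterhub_config.py -- sketch of the parts relevant to this post.
# `c` is the config object JupyterHub injects when it loads this file.
import os

c.JupyterHub.spawner_class = "dockerspawner.DockerSpawner"

# Bind the hub inside its container and tell spawned servers how to reach it
# (the compose file sets container_name: jupyterhub).
c.JupyterHub.hub_ip = "0.0.0.0"
c.JupyterHub.hub_connect_ip = "jupyterhub"

# Attach single-user containers to the compose network, using the
# project-prefixed name passed in through the environment.
c.DockerSpawner.network_name = os.environ.get("DOCKER_NETWORK_NAME", "lab_jupyterhub")
c.DockerSpawner.use_internal_ip = True

# The image built from the multi-stage Dockerfile above (tag is illustrative).
c.DockerSpawner.image = "lab-notebook:latest"
```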
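
And once everything is up, a quick way to check that a notebook really talks to the cluster is to open a session against the standalone master. Again, a minimal sketch: it assumes the single-user container is attached to the same network, so the master is reachable by its service name, spark.

```
# Run this from a notebook spawned by the hub.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .master("spark://spark:7077")
    .appName("smallest-cluster-smoke-test")
    # Stay within what a 1 core / 1 GB worker can actually grant.
    .config("spark.executor.cores", "1")
    .config("spark.executor.memory", "512m")
    .getOrCreate()
)

# Trivial job: if this returns, executors were scheduled on the workers.
print(spark.range(1_000_000).count())
```

If the job hangs because executors never register, the usual suspect is the driver address: the workers have to be able to connect back to the notebook container, so spark.driver.host is the first thing to check.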