From 4113cbb4eeda25f815fb11cc03bc10ca78969d9a Mon Sep 17 00:00:00 2001
From: Guillem Borrell Nogueras
Date: Sun, 19 Feb 2023 08:44:16 +0100
Subject: [PATCH] Add 'The simplest and smallest spark cluster you can build'

---
 ...nd-smallest-spark-cluster-you-can-build.md | 80 +++++++++++++++++++
 1 file changed, 80 insertions(+)
 create mode 100644 The-simplest-and-smallest-spark-cluster-you-can-build.md

diff --git a/The-simplest-and-smallest-spark-cluster-you-can-build.md b/The-simplest-and-smallest-spark-cluster-you-can-build.md
new file mode 100644
index 0000000..36a50dd
--- /dev/null
+++ b/The-simplest-and-smallest-spark-cluster-you-can-build.md
@@ -0,0 +1,80 @@
If you want to get up and running with Spark in almost no time, you can let the Spark context create a local cluster for you. But deploying a minimal standalone cluster is also worth doing: it helps you understand the most common issues of real environments, and it exposes the performance problems you may run into.

Spark is a service, so you have to connect something to it. In my case, that something is usually Jupyter.

The fastest and simplest way I've found to deploy a Spark cluster is Docker Compose.

```
version: "3"

networks:
  jupyterhub:
    external: false

services:
  jupyterhub:
    restart: always
    build:
      context: .
      dockerfile: Dockerfile
    container_name: jupyterhub
    extra_hosts:
      - host.docker.internal:host-gateway
    volumes:
      - "/var/run/docker.sock:/var/run/docker.sock:rw"
      - "./data:/data"
      - "./jupyterhub_config.py:/srv/jupyterhub/jupyterhub_config.py"
    networks:
      - jupyterhub
    ports:
      - "8000:8000"
    environment:
      # Compose prefixes the network name with the project name ("lab" here),
      # and the spawner needs the prefixed name.
      DOCKER_NETWORK_NAME: lab_jupyterhub
    command: >
      jupyterhub -f /srv/jupyterhub/jupyterhub_config.py
  spark:
    image: docker.io/bitnami/spark:3.3
    extra_hosts:
      - host.docker.internal:host-gateway
    environment:
      - SPARK_MODE=master
      - SPARK_RPC_AUTHENTICATION_ENABLED=no
      - SPARK_RPC_ENCRYPTION_ENABLED=no
      - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
      - SPARK_SSL_ENABLED=no
    networks:
      - jupyterhub
    ports:
      - '8080:8080'
  spark-worker:
    image: docker.io/bitnami/spark:3.3
    extra_hosts:
      - host.docker.internal:host-gateway
    deploy:
      replicas: 4
    environment:
      - SPARK_MODE=worker
      - SPARK_MASTER_URL=spark://spark:7077
      - SPARK_WORKER_MEMORY=1G
      - SPARK_WORKER_CORES=1
      - SPARK_RPC_AUTHENTICATION_ENABLED=no
      - SPARK_RPC_ENCRYPTION_ENABLED=no
      - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
      - SPARK_SSL_ENABLED=no
    networks:
      - jupyterhub
```

Note that the only tricky piece is the name of the network that connects the single-user Jupyter servers to the hub: Docker Compose prefixes it with the project name, so the jupyterhub network defined above has to be referenced as lab_jupyterhub. You can see how this is wired into the spawner in the jupyterhub_config.py sketch at the end of this post.

I'm running JupyterLab with the Docker spawner, and I'd recommend you do the same. This means you have to build a container image for the single-user Jupyter server and make sure you use the same version of Spark everywhere.

Multi-stage builds to the rescue!

```
# First stage: only used as a source for the Spark distribution.
FROM docker.io/bitnami/spark:3.3 as spark

FROM jupyter/all-spark-notebook
USER root
# Copy the exact Spark build the cluster runs over the notebook image's
# Spark installation, so driver and workers always match.
COPY --from=spark /opt/bitnami/spark /usr/local/spark
USER jovyan
```
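
For completeness, this is roughly what the network-related part of jupyterhub_config.py looks like. It is a sketch rather than the full file: authentication is left out, the image tag is illustrative, and it assumes the compose project is called "lab" so the prefixed network name matches the DOCKER_NETWORK_NAME variable above.

```
# jupyterhub_config.py -- sketch of the parts relevant to this post.
# `c` is the config object JupyterHub injects when it loads this file.
import os

c.JupyterHub.spawner_class = "dockerspawner.DockerSpawner"

# Bind the hub inside its container and tell spawned servers how to reach it
# (the compose file sets container_name: jupyterhub).
c.JupyterHub.hub_ip = "0.0.0.0"
c.JupyterHub.hub_connect_ip = "jupyterhub"

# Attach single-user containers to the compose network, using the
# project-prefixed name passed in through the environment.
c.DockerSpawner.network_name = os.environ.get("DOCKER_NETWORK_NAME", "lab_jupyterhub")
c.DockerSpawner.use_internal_ip = True

# The image built from the multi-stage Dockerfile above (tag is illustrative).
c.DockerSpawner.image = "lab-notebook:latest"
```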
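
And once everything is up, a quick way to check that a notebook really talks to the cluster is to open a session against the standalone master. Again, a minimal sketch: it assumes the single-user container is attached to the same network, so the master is reachable by its service name, spark.

```
# Run this from a notebook spawned by the hub.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .master("spark://spark:7077")
    .appName("smallest-cluster-smoke-test")
    # Stay within what a 1 core / 1 GB worker can actually grant.
    .config("spark.executor.cores", "1")
    .config("spark.executor.memory", "512m")
    .getOrCreate()
)

# Trivial job: if this returns, executors were scheduled on the workers.
print(spark.range(1_000_000).count())
```

If the job hangs because executors never register, the usual suspect is the driver address: the workers have to be able to connect back to the notebook container, so spark.driver.host is the first thing to check.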