Update 'The simplest and smallest spark cluster you can build'

Guillem Borrell Nogueras 2023-02-19 09:01:47 +01:00
parent 4113cbb4ee
commit c655fd590b

@ -78,3 +78,47 @@ USER root
COPY --from=spark /opt/bitnami/spark /usr/local/spark
USER jovyan
```
You can also run MinIO in the same Docker Compose project and connect Spark to it.
```
from pyspark import SparkConf
from pyspark.sql import SparkSession
import os
# Spark configuration that points the S3A filesystem connector at the MinIO
# endpoint (http://minio:9000) and takes its credentials from the environment.
conf = (
    SparkConf()
    .setAppName("Spark minIO Test")
    .set("spark.hadoop.fs.s3a.endpoint", "http://minio:9000")
    # Index os.environ directly so that an unset variable raises KeyError
    # right here, instead of a None credential being handed on to Spark.
    .set("spark.hadoop.fs.s3a.access.key", os.environ["AWS_ACCESS_KEY_ID"])
    .set("spark.hadoop.fs.s3a.secret.key", os.environ["AWS_SECRET_ACCESS_KEY"])
    .set("spark.hadoop.fs.s3a.path.style.access", True)
    .set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
)
# Get (or create) a session against the master at spark://spark:7077, applying
# the settings collected in `conf`.
spark = SparkSession.builder.master("spark://spark:7077").config(conf=conf).getOrCreate()
# Read every CSV file matching the glob, with the header and schema-inference
# options enabled, and register the result as the temporary view "boards".
spark.read.csv(
    "s3a://datawarehouse/boards/*/*.csv",
    inferSchema=True,
    header=True,
).createOrReplaceTempView("boards")
# Count the rows of "boards" and fetch the result as a pandas DataFrame.
# Left as a bare expression so a notebook cell displays the value.
spark.sql("""
select count(*)
from boards
""").toPandas()
```