diff --git a/The-simplest-and-smallest-spark-cluster-you-can-build.md b/The-simplest-and-smallest-spark-cluster-you-can-build.md
index 36a50dd..aca8ac2 100644
--- a/The-simplest-and-smallest-spark-cluster-you-can-build.md
+++ b/The-simplest-and-smallest-spark-cluster-you-can-build.md
@@ -78,3 +78,47 @@ USER root
 COPY --from=spark /opt/bitnami/spark /usr/local/spark
 USER jovyan
 ```
+
+You can also run MinIO in the same docker compose project, and connect it to Spark.
+
+```python
+from pyspark import SparkConf
+from pyspark.sql import SparkSession
+import os
+
+conf = (
+    SparkConf()
+    .setAppName("Spark minIO Test")
+    .set("spark.hadoop.fs.s3a.endpoint", "http://minio:9000")
+    .set("spark.hadoop.fs.s3a.access.key", os.environ.get("AWS_ACCESS_KEY_ID"))
+    .set("spark.hadoop.fs.s3a.secret.key", os.environ.get("AWS_SECRET_ACCESS_KEY"))
+    .set("spark.hadoop.fs.s3a.path.style.access", "true")  # Hadoop config values are strings; Python True would become "True"
+    .set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
+)
+
+spark = (
+    SparkSession
+    .builder
+    .master("spark://spark:7077")
+    .config(conf=conf)
+    .getOrCreate()
+)
+
+(
+    spark
+    .read
+    .options(inferSchema=True, header=True)
+    .csv("s3a://datawarehouse/boards/*/*.csv")
+    .createOrReplaceTempView("boards")
+)
+
+(
+    spark
+    .sql("""
+select count(*)
+from boards
+"""
+    )
+    .toPandas()
+)
+```
\ No newline at end of file