Update 'The simplest and smallest spark cluster you can build'
parent 4113cbb4ee
commit c655fd590b
@@ -78,3 +78,47 @@ USER root
COPY --from=spark /opt/bitnami/spark /usr/local/spark
USER jovyan
```
You can also run MinIO in the same Docker Compose project and connect it to Spark.
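
A minimal sketch of such a MinIO service is below; it is not taken from the post's compose file, so the image tag, console port, and volume name are assumptions. The service name `minio` and port 9000 match the `http://minio:9000` endpoint the Spark session uses, and `MINIO_ROOT_USER`/`MINIO_ROOT_PASSWORD` have to line up with the `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` that the notebook container passes to the snippet further down.

```
services:
  minio:
    image: minio/minio
    command: server /data --console-address ":9001"
    environment:
      # These become the access key and secret key that Spark sends over s3a.
      MINIO_ROOT_USER: ${AWS_ACCESS_KEY_ID}
      MINIO_ROOT_PASSWORD: ${AWS_SECRET_ACCESS_KEY}
    ports:
      - "9000:9000"   # S3 API (published for the host; other services reach it at http://minio:9000)
      - "9001:9001"   # web console
    volumes:
      - minio-data:/data

volumes:
  minio-data:
```

A bucket named `datawarehouse` (matching the s3a path used below) then needs to exist, for example created through the MinIO console.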
```
import os

from pyspark import SparkConf
from pyspark.sql import SparkSession

# s3a settings pointing Spark at the MinIO container. The credentials are read from
# the environment and have to match the ones the MinIO server was started with, and
# path-style access is required for MinIO. The S3A connector (hadoop-aws) must be on
# the cluster's classpath for s3a:// URLs to resolve.
conf = (
    SparkConf()
    .setAppName("Spark minIO Test")
    .set("spark.hadoop.fs.s3a.endpoint", "http://minio:9000")
    .set("spark.hadoop.fs.s3a.access.key", os.environ.get("AWS_ACCESS_KEY_ID"))
    .set("spark.hadoop.fs.s3a.secret.key", os.environ.get("AWS_SECRET_ACCESS_KEY"))
    .set("spark.hadoop.fs.s3a.path.style.access", "true")
    .set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
)

# Connect to the standalone master running in the `spark` service.
spark = (
    SparkSession
    .builder
    .master("spark://spark:7077")
    .config(conf=conf)
    .getOrCreate()
)

# Read every CSV under the bucket prefix and register it as a temporary view.
(
    spark
    .read
    .options(inferSchema=True, header=True)
    .csv("s3a://datawarehouse/boards/*/*.csv")
    .createOrReplaceTempView("boards")
)

# Query the view and pull the result back to the driver as a pandas DataFrame.
(
    spark
    .sql("""
        select count(*)
        from boards
    """)
    .toPandas()
)
```