Update 'The simplest and smallest spark cluster you can build'
parent
4113cbb4ee
commit
c655fd590b

@@ -78,3 +78,47 @@ USER root
COPY --from=spark /opt/bitnami/spark /usr/local/spark

USER jovyan
```
You can also run MinIO in the same Docker Compose project, and connect it to Spark.

```python
import os

from pyspark import SparkConf
from pyspark.sql import SparkSession

# Configure the S3A connector to point at the MinIO service from the same
# Docker Compose project. Credentials come from the environment so they are
# never hard-coded in the notebook/script.
conf = (
    SparkConf()
    .setAppName("Spark minIO Test")
    .set("spark.hadoop.fs.s3a.endpoint", "http://minio:9000")
    .set("spark.hadoop.fs.s3a.access.key", os.environ.get("AWS_ACCESS_KEY_ID"))
    .set("spark.hadoop.fs.s3a.secret.key", os.environ.get("AWS_SECRET_ACCESS_KEY"))
    # SparkConf values are strings; "true" is the documented boolean form.
    # Path-style access is required for MinIO (no virtual-host buckets).
    .set("spark.hadoop.fs.s3a.path.style.access", "true")
    .set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
)

# Connect to the standalone master advertised by the "spark" compose service.
spark = (
    SparkSession
    .builder
    .master("spark://spark:7077")
    .config(conf=conf)
    .getOrCreate()
)

# Register every per-board CSV under the bucket as one temp view.
(
    spark
    .read
    .options(inferSchema=True, header=True)
    .csv("s3a://datawarehouse/boards/*/*.csv")
    .createOrReplaceTempView("boards")
)

# Sanity check: count the rows and pull the result back as a pandas DataFrame.
(
    spark
    .sql("""
        select count(*)
        from boards
    """)
    .toPandas()
)
```
Loading…
Reference in a new issue