Got to dump the file locally
This commit is contained in:
parent
ea14f8c87e
commit
ff781b6b9c
4
.gitignore
vendored
4
.gitignore
vendored
|
@ -160,4 +160,6 @@ cython_debug/
|
|||
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
||||
#.idea/
|
||||
|
||||
.DS_Store
|
||||
.DS_Store
|
||||
|
||||
test/data/output/*
|
|
@ -7,4 +7,5 @@ s3fs
|
|||
aiofiles
|
||||
duckdb
|
||||
polars
|
||||
pyarrow
|
||||
pyarrow
|
||||
xlsx2csv
|
75
src/hellocomputer/analytics.py
Normal file
75
src/hellocomputer/analytics.py
Normal file
|
@ -0,0 +1,75 @@
|
|||
import duckdb
|
||||
|
||||
|
||||
class DDB:
    """Thin wrapper around an in-memory DuckDB connection.

    Installs and loads the ``spatial`` extension (so Excel workbooks can be
    read via ``st_read``) and the ``httpfs`` extension (so results can be
    copied to GCS).  Every mutating method returns ``self`` so calls can be
    chained fluently.
    """

    def __init__(self):
        self.db = duckdb.connect()
        self.db.install_extension("spatial")
        self.db.install_extension("httpfs")
        self.db.load_extension("spatial")
        self.db.load_extension("httpfs")
        # Populated by load_metadata(): sheet names and source workbook path.
        self.sheets: tuple = tuple()
        self.path: str = ""

    def gcs_secret(self, gcs_access: str, gcs_secret: str):
        """Register a GCS HMAC key pair with DuckDB for gcs:// writes.

        NOTE(review): the credentials are interpolated directly into SQL,
        so they must not contain single quotes — confirm whether DuckDB
        offers a parameterized form for CREATE SECRET.
        """
        self.db.sql(f"""
            CREATE SECRET (
                TYPE GCS,
                KEY_ID '{gcs_access}',
                SECRET '{gcs_secret}')
            """)
        return self

    def load_metadata(self, path: str = ""):
        """Read the ``metadata`` layer of the workbook at *path*.

        Creates a ``metadata`` table, remembers *path* for later sheet
        reads, and parses the comma-separated sheet list stored in the row
        whose ``Field1`` is ``'Sheets'``.

        Raises:
            ValueError: if the metadata sheet has no 'Sheets' row, instead
                of the opaque IndexError the bare ``fetchall()[0][0]``
                produced.
        """
        # st_read does not parse a header row here, hence the generated
        # Field1/Field2 column names below.
        self.db.sql(f"""
            create table metadata as (
                select
                    *
                from
                    st_read('{path}',
                            layer='metadata'
                    )
            )""")

        rows = self.db.query(
            "select Field2 from metadata where Field1 = 'Sheets'"
        ).fetchall()
        if not rows:
            raise ValueError(f"No 'Sheets' entry in the metadata layer of {path}")

        # Strip whitespace so "a, b" yields ("a", "b") rather than ("a", " b"),
        # which would break the layer='{sheet}' reads in dump_local().
        self.sheets = tuple(name.strip() for name in rows[0][0].split(","))
        self.path = str(path)

        return self

    def dump_local(self, path):
        """Write the metadata table and every sheet to CSV files under *path*."""
        # TODO: Port to fsspec and have a single dump file
        self.db.query(f"copy (select * from metadata) to '{path}/metadata.csv'")

        for sheet in self.sheets:
            self.db.query(f"""
                copy
                    (
                    select
                        *
                    from
                        st_read
                        (
                            '{self.path}',
                            layer = '{sheet}'
                        )
                    )
                to '{path}/{sheet}.csv'
                """)
        return self

    def dump_gcs(self, bucketname, sid):
        """Copy the ``data`` table to gcs://<bucketname>/<sid>/data.csv.

        Requires gcs_secret() to have been called first, and a ``data``
        table to exist on this connection.
        """
        self.db.sql(f"""
            copy
                data
            to
                'gcs://{bucketname}/{sid}/data.csv';
            """)
        return self

    def query(self, sql):
        """Run *sql* on the underlying connection and return the relation."""
        return self.db.query(sql)
|
|
@ -1,11 +1,10 @@
|
|||
import aiofiles
|
||||
import duckdb
|
||||
import polars as pl
|
||||
import s3fs
|
||||
from fastapi import APIRouter, File, UploadFile
|
||||
from fastapi.responses import JSONResponse
|
||||
|
||||
from ..config import settings
|
||||
from ..analytics import DDB
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
@ -28,57 +27,14 @@ async def upload_file(file: UploadFile = File(...), sid: str = ""):
|
|||
|
||||
gcs.makedir(f"{settings.gcs_bucketname}/{sid}")
|
||||
|
||||
db = duckdb.connect()
|
||||
db.install_extension("spatial")
|
||||
db.install_extension("httpfs")
|
||||
db.load_extension("httpfs")
|
||||
db.load_extension("spatial")
|
||||
|
||||
db.sql(f"""
|
||||
CREATE SECRET (
|
||||
TYPE GCS,
|
||||
KEY_ID '{settings.gcs_access}',
|
||||
SECRET '{settings.gcs_secret}')
|
||||
""")
|
||||
|
||||
db.sql(f"""
|
||||
create table metadata as (
|
||||
select
|
||||
*
|
||||
from
|
||||
st_read('{f.name}',
|
||||
layer='metadata',
|
||||
open_options=['HEADERS_FORCE', 'FIELD_TYPES=auto']
|
||||
)
|
||||
)""")
|
||||
|
||||
metadata = db.query("select * from metadata").pl()
|
||||
sheets = metadata.select(pl.col("Key") == "Sheets")
|
||||
print(sheets)
|
||||
|
||||
for sheet in sheets.to_dict():
|
||||
print(sheet)
|
||||
|
||||
db.sql(
|
||||
f"""
|
||||
create table data as (
|
||||
select
|
||||
*
|
||||
from
|
||||
st_read('{f.name}',
|
||||
layer='data',
|
||||
open_options=['HEADERS_FORCE', 'FIELD_TYPES=auto']
|
||||
)
|
||||
)"""
|
||||
(
|
||||
DDB()
|
||||
.gcs_secret(settings.gcs_secret, settings.gcs_secret)
|
||||
.load_metadata(f.name)
|
||||
.load_data()
|
||||
.save_gcs(settings.gcs_bucketname, sid)
|
||||
)
|
||||
|
||||
db.sql(f"""
|
||||
copy
|
||||
data
|
||||
to
|
||||
'gcs://{settings.gcs_bucketname}/{sid}/data.csv';
|
||||
""")
|
||||
|
||||
return JSONResponse(
|
||||
content={"message": "File uploaded successfully"}, status_code=200
|
||||
)
|
||||
|
|
BIN
test/data/TestExcelHelloComputer.xlsx
Normal file
BIN
test/data/TestExcelHelloComputer.xlsx
Normal file
Binary file not shown.
1
test/output/.gitignore
vendored
Normal file
1
test/output/.gitignore
vendored
Normal file
|
@ -0,0 +1 @@
|
|||
*.csv
|
17
test/test_load.py
Normal file
17
test/test_load.py
Normal file
|
@ -0,0 +1,17 @@
|
|||
import hellocomputer
|
||||
from hellocomputer.analytics import DDB
|
||||
from pathlib import Path
|
||||
|
||||
TEST_DATA_FOLDER = Path(hellocomputer.__file__).parents[2] / "test" / "data"
|
||||
TEST_OUTPUT_FOLDER = Path(hellocomputer.__file__).parents[2] / "test" / "output"
|
||||
|
||||
|
||||
def test_load_data():
    """Loading the sample workbook exposes its sheet names and dumps CSVs."""
    database = DDB()
    database.load_metadata(TEST_DATA_FOLDER / "TestExcelHelloComputer.xlsx")
    database.dump_local(TEST_OUTPUT_FOLDER)

    assert database.sheets == ("answers",)
    assert (TEST_OUTPUT_FOLDER / "answers.csv").exists()
|
Loading…
Reference in a new issue