class DDB:
    """Fluent wrapper around an in-memory DuckDB connection for workbook ingestion.

    Every public method except :meth:`query` returns ``self`` so calls can be
    chained, e.g. ``DDB().load_metadata(path).dump_local(folder)``.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self):
        """Open a DuckDB connection and enable the ``spatial`` and ``httpfs`` extensions."""
        # Imported here so the class definition does not depend on a
        # module-level import being present next to it.
        import duckdb

        self.db = duckdb.connect()
        self.db.install_extension("spatial")
        self.db.install_extension("httpfs")
        self.db.load_extension("spatial")
        self.db.load_extension("httpfs")
        # Sheet names listed in the workbook's ``metadata`` layer.
        self.sheets = tuple()
        # Path of the workbook most recently passed to ``load_metadata``.
        self.path = ""

    @staticmethod
    def _sql_literal(value) -> str:
        """Render *value* as a single-quoted SQL string literal.

        Embedded single quotes are doubled so paths, sheet names and
        credentials containing ``'`` cannot terminate the literal early.
        """
        return "'" + str(value).replace("'", "''") + "'"

    @staticmethod
    def _parse_sheets(raw) -> tuple:
        """Split a comma-separated sheet list, trimming blanks and dropping empty items."""
        return tuple(name.strip() for name in str(raw).split(",") if name.strip())

    def gcs_secret(self, gcs_access: str, gcs_secret: str):
        """Register GCS credentials on the connection.

        Args:
            gcs_access: Value used as ``KEY_ID`` of the secret.
            gcs_secret: Value used as ``SECRET`` of the secret.

        Returns:
            ``self``, for chaining.
        """
        self.db.sql(f"""
            CREATE SECRET (
                TYPE GCS,
                KEY_ID {self._sql_literal(gcs_access)},
                SECRET {self._sql_literal(gcs_secret)})
            """)

        return self

    def load_metadata(self, path: str = ""):
        """Load the workbook's ``metadata`` layer and record its sheet list.

        The layer is read without its header row (cause not yet understood),
        so the columns are addressed by the default names ``Field1`` and
        ``Field2``.

        Args:
            path: Location of the workbook; anything ``str()`` can render.

        Returns:
            ``self``, for chaining.

        Raises:
            ValueError: If the metadata layer has no ``Sheets`` entry.
        """
        # ``or replace`` lets the same instance load a second workbook.
        self.db.sql(f"""
            create or replace table metadata as (
                select
                    *
                from
                    st_read({self._sql_literal(path)},
                        layer='metadata'
                    )
            )""")
        rows = self.db.query(
            "select Field2 from metadata where Field1 = 'Sheets'"
        ).fetchall()
        if not rows or rows[0][0] is None:
            raise ValueError(f"No 'Sheets' entry found in the metadata layer of {path}")
        self.sheets = self._parse_sheets(rows[0][0])
        self.path = path

        return self

    def load_data(self, layer: str = "data"):
        """Load one workbook layer into the ``data`` table read by :meth:`dump_gcs`.

        Mirrors the inline ``create table data`` step the upload endpoint used
        before it switched to this class.
        NOTE(review): the default layer name ``data`` comes from that earlier
        code; confirm it matches the uploaded workbooks.

        Args:
            layer: Name of the layer (sheet) to load.

        Returns:
            ``self``, for chaining.

        Raises:
            ValueError: If no workbook has been registered via ``load_metadata``.
        """
        if not self.path:
            raise ValueError("load_metadata() must be called before load_data()")
        self.db.sql(f"""
            create or replace table data as (
                select
                    *
                from
                    st_read({self._sql_literal(self.path)},
                        layer={self._sql_literal(layer)}
                    )
            )""")

        return self

    def dump_local(self, path):
        """Write ``metadata.csv`` and one ``<sheet>.csv`` per known sheet into *path*.

        Args:
            path: Destination folder; anything ``str()`` can render.

        Returns:
            ``self``, for chaining.
        """
        # TODO: Port to fsspec and have a single dump file
        metadata_target = self._sql_literal(f"{path}/metadata.csv")
        self.db.query(f"copy (select * from metadata) to {metadata_target}")

        source = self._sql_literal(self.path)
        for sheet in self.sheets:
            sheet_target = self._sql_literal(f"{path}/{sheet}.csv")
            self.db.query(f"""
                copy
                (
                    select
                        *
                    from
                        st_read
                        (
                            {source},
                            layer = {self._sql_literal(sheet)}
                        )
                )
                to {sheet_target}
                """)
        return self

    def dump_gcs(self, bucketname, sid):
        """Copy the ``data`` table to ``gcs://<bucketname>/<sid>/data.csv``.

        The ``data`` table must already exist (see :meth:`load_data`).

        Returns:
            ``self``, for chaining.
        """
        target = self._sql_literal(f"gcs://{bucketname}/{sid}/data.csv")
        self.db.sql(f"""
            copy
                data
            to
                {target};
            """)
        return self

    def save_gcs(self, bucketname, sid):
        """Alias of :meth:`dump_gcs`, the name the upload endpoint calls."""
        return self.dump_gcs(bucketname, sid)

    def query(self, sql):
        """Run *sql* on the underlying connection and return DuckDB's result object."""
        return self.db.query(sql)
from pathlib import Path

from hellocomputer.analytics import DDB

# Fixtures live next to this module, so resolve them from the test file itself
# rather than from wherever the ``hellocomputer`` package happens to be installed.
TEST_ROOT = Path(__file__).resolve().parent
TEST_DATA_FOLDER = TEST_ROOT / "data"
TEST_OUTPUT_FOLDER = TEST_ROOT / "output"


def test_load_data():
    """Load the sample workbook and check that its sheets are dumped as CSV files."""
    exported = (
        TEST_OUTPUT_FOLDER / "metadata.csv",
        TEST_OUTPUT_FOLDER / "answers.csv",
    )
    # Remove leftovers from earlier runs; otherwise the existence checks below
    # would pass even if this run wrote nothing.
    for stale in exported:
        if stale.exists():
            stale.unlink()

    db = (
        DDB()
        .load_metadata(TEST_DATA_FOLDER / "TestExcelHelloComputer.xlsx")
        .dump_local(TEST_OUTPUT_FOLDER)
    )

    assert db.sheets == ("answers",)
    for written in exported:
        assert written.exists()