Got to dump the file locally
This commit is contained in:
		
							parent
							
								
									ea14f8c87e
								
							
						
					
					
						commit
						ff781b6b9c
					
				
							
								
								
									
										4
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										4
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							|  | @ -160,4 +160,6 @@ cython_debug/ | |||
| #  option (not recommended) you can uncomment the following to ignore the entire idea folder. | ||||
| #.idea/ | ||||
| 
 | ||||
| .DS_Store | ||||
| .DS_Store | ||||
| 
 | ||||
| test/data/output/* | ||||
|  | @ -7,4 +7,5 @@ s3fs | |||
| aiofiles | ||||
| duckdb | ||||
| polars | ||||
| pyarrow | ||||
| pyarrow | ||||
| xlsx2csv | ||||
							
								
								
									
										75
									
								
								src/hellocomputer/analytics.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										75
									
								
								src/hellocomputer/analytics.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,75 @@ | |||
| import duckdb | ||||
| 
 | ||||
| 
 | ||||
| class DDB: | ||||
|     def __init__(self): | ||||
|         self.db = duckdb.connect() | ||||
|         self.db.install_extension("spatial") | ||||
|         self.db.install_extension("httpfs") | ||||
|         self.db.load_extension("spatial") | ||||
|         self.db.load_extension("httpfs") | ||||
|         self.sheets = tuple() | ||||
|         self.path = "" | ||||
| 
 | ||||
|     def gcs_secret(self, gcs_access: str, gcs_secret: str): | ||||
|         self.db.sql(f""" | ||||
|             CREATE SECRET ( | ||||
|                TYPE GCS, | ||||
|                KEY_ID '{gcs_access}', | ||||
|                SECRET '{gcs_secret}') | ||||
|                """) | ||||
| 
 | ||||
|         return self | ||||
| 
 | ||||
|     def load_metadata(self, path: str = ""): | ||||
|         """For some reason, the header is not loaded""" | ||||
|         self.db.sql(f""" | ||||
|             create table metadata as ( | ||||
|             select | ||||
|                 * | ||||
|             from | ||||
|                 st_read('{path}',  | ||||
|                         layer='metadata' | ||||
|                         ) | ||||
|             )""") | ||||
|         self.sheets = tuple( | ||||
|             self.db.query("select Field2 from metadata where Field1 = 'Sheets'") | ||||
|             .fetchall()[0][0] | ||||
|             .split(",") | ||||
|         ) | ||||
|         self.path = path | ||||
| 
 | ||||
|         return self | ||||
| 
 | ||||
|     def dump_local(self, path): | ||||
|         # TODO: Port to fsspec and have a single dump file | ||||
|         self.db.query(f"copy (select * from metadata) to '{path}/metadata.csv'") | ||||
| 
 | ||||
|         for sheet in self.sheets: | ||||
|             self.db.query(f""" | ||||
|             copy  | ||||
|                 ( | ||||
|                 select | ||||
|                     * | ||||
|                 from | ||||
|                     st_read | ||||
|                         ( | ||||
|                         '{self.path}', | ||||
|                         layer = '{sheet}' | ||||
|                         ) | ||||
|                 ) | ||||
|             to '{path}/{sheet}.csv' | ||||
|                           """) | ||||
|         return self | ||||
| 
 | ||||
|     def dump_gcs(self, bucketname, sid): | ||||
|         self.db.sql(f""" | ||||
|             copy | ||||
|                 data | ||||
|             to | ||||
|                 'gcs://{bucketname}/{sid}/data.csv'; | ||||
|             """) | ||||
|         return self | ||||
| 
 | ||||
|     def query(self, sql): | ||||
|         return self.db.query(sql) | ||||
|  | @ -1,11 +1,10 @@ | |||
| import aiofiles | ||||
| import duckdb | ||||
| import polars as pl | ||||
| import s3fs | ||||
| from fastapi import APIRouter, File, UploadFile | ||||
| from fastapi.responses import JSONResponse | ||||
| 
 | ||||
| from ..config import settings | ||||
| from ..analytics import DDB | ||||
| 
 | ||||
| router = APIRouter() | ||||
| 
 | ||||
|  | @ -28,57 +27,14 @@ async def upload_file(file: UploadFile = File(...), sid: str = ""): | |||
| 
 | ||||
|         gcs.makedir(f"{settings.gcs_bucketname}/{sid}") | ||||
| 
 | ||||
|         db = duckdb.connect() | ||||
|         db.install_extension("spatial") | ||||
|         db.install_extension("httpfs") | ||||
|         db.load_extension("httpfs") | ||||
|         db.load_extension("spatial") | ||||
| 
 | ||||
|         db.sql(f""" | ||||
|             CREATE SECRET ( | ||||
|                TYPE GCS, | ||||
|                KEY_ID '{settings.gcs_access}', | ||||
|                SECRET '{settings.gcs_secret}') | ||||
|                """) | ||||
| 
 | ||||
|         db.sql(f""" | ||||
|             create table metadata as ( | ||||
|             select | ||||
|                 * | ||||
|             from | ||||
|                 st_read('{f.name}',  | ||||
|                         layer='metadata', | ||||
|                         open_options=['HEADERS_FORCE', 'FIELD_TYPES=auto'] | ||||
|                         ) | ||||
|             )""") | ||||
| 
 | ||||
|         metadata = db.query("select * from metadata").pl() | ||||
|         sheets = metadata.select(pl.col("Key") == "Sheets") | ||||
|         print(sheets) | ||||
| 
 | ||||
|         for sheet in sheets.to_dict(): | ||||
|             print(sheet) | ||||
| 
 | ||||
|         db.sql( | ||||
|             f""" | ||||
|             create table data as ( | ||||
|             select | ||||
|                 * | ||||
|             from | ||||
|                 st_read('{f.name}',  | ||||
|                         layer='data', | ||||
|                         open_options=['HEADERS_FORCE', 'FIELD_TYPES=auto'] | ||||
|                         ) | ||||
|             )""" | ||||
|         ( | ||||
|             DDB() | ||||
|             .gcs_secret(settings.gcs_access, settings.gcs_secret) | ||||
|             .load_metadata(f.name) | ||||
|             .load_data() | ||||
|             .dump_gcs(settings.gcs_bucketname, sid) | ||||
|         ) | ||||
| 
 | ||||
|         db.sql(f""" | ||||
|             copy | ||||
|                 data | ||||
|             to | ||||
|                 'gcs://{settings.gcs_bucketname}/{sid}/data.csv'; | ||||
|             """) | ||||
| 
 | ||||
|         return JSONResponse( | ||||
|             content={"message": "File uploaded successfully"}, status_code=200 | ||||
|         ) | ||||
|  |  | |||
							
								
								
									
										
											BIN
										
									
								
								test/data/TestExcelHelloComputer.xlsx
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								test/data/TestExcelHelloComputer.xlsx
									
									
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							
							
								
								
									
										1
									
								
								test/output/.gitignore
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										1
									
								
								test/output/.gitignore
									
									
									
									
										vendored
									
									
										Normal file
									
								
							|  | @ -0,0 +1 @@ | |||
| *.csv | ||||
							
								
								
									
										17
									
								
								test/test_load.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										17
									
								
								test/test_load.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,17 @@ | |||
| import hellocomputer | ||||
| from hellocomputer.analytics import DDB | ||||
| from pathlib import Path | ||||
| 
 | ||||
| TEST_DATA_FOLDER = Path(hellocomputer.__file__).parents[2] / "test" / "data" | ||||
| TEST_OUTPUT_FOLDER = Path(hellocomputer.__file__).parents[2] / "test" / "output" | ||||
| 
 | ||||
| 
 | ||||
| def test_load_data(): | ||||
|     db = ( | ||||
|         DDB() | ||||
|         .load_metadata(TEST_DATA_FOLDER / "TestExcelHelloComputer.xlsx") | ||||
|         .dump_local(TEST_OUTPUT_FOLDER) | ||||
|     ) | ||||
| 
 | ||||
|     assert db.sheets == ("answers",) | ||||
|     assert (TEST_OUTPUT_FOLDER / "answers.csv").exists() | ||||
		Loading…
	
		Reference in a new issue