From ff781b6b9c1663ae631ee841b7200f59a295df86 Mon Sep 17 00:00:00 2001 From: Guillem Borrell Date: Tue, 21 May 2024 20:50:17 +0100 Subject: [PATCH] Got to dump the file locally --- .gitignore | 4 +- requirements.in | 3 +- src/hellocomputer/analytics.py | 75 ++++++++++++++++++++++++++ src/hellocomputer/routers/files.py | 58 +++----------------- test/data/TestExcelHelloComputer.xlsx | Bin 0 -> 10809 bytes test/output/.gitignore | 1 + test/test_load.py | 17 ++++++ 7 files changed, 105 insertions(+), 53 deletions(-) create mode 100644 src/hellocomputer/analytics.py create mode 100644 test/data/TestExcelHelloComputer.xlsx create mode 100644 test/output/.gitignore create mode 100644 test/test_load.py diff --git a/.gitignore b/.gitignore index f22ff58..7df86d7 100644 --- a/.gitignore +++ b/.gitignore @@ -160,4 +160,6 @@ cython_debug/ # option (not recommended) you can uncomment the following to ignore the entire idea folder. #.idea/ -.DS_Store \ No newline at end of file +.DS_Store + +test/data/output/* \ No newline at end of file diff --git a/requirements.in b/requirements.in index 5395014..ea4a93b 100644 --- a/requirements.in +++ b/requirements.in @@ -7,4 +7,5 @@ s3fs aiofiles duckdb polars -pyarrow \ No newline at end of file +pyarrow +xlsx2csv \ No newline at end of file diff --git a/src/hellocomputer/analytics.py b/src/hellocomputer/analytics.py new file mode 100644 index 0000000..4ecfc8e --- /dev/null +++ b/src/hellocomputer/analytics.py @@ -0,0 +1,75 @@ +import duckdb + + +class DDB: + def __init__(self): + self.db = duckdb.connect() + self.db.install_extension("spatial") + self.db.install_extension("httpfs") + self.db.load_extension("spatial") + self.db.load_extension("httpfs") + self.sheets = tuple() + self.path = "" + + def gcs_secret(self, gcs_access: str, gcs_secret: str): + self.db.sql(f""" + CREATE SECRET ( + TYPE GCS, + KEY_ID '{gcs_access}', + SECRET '{gcs_secret}') + """) + + return self + + def load_metadata(self, path: str = ""): + """For some reason, the header is not loaded""" + self.db.sql(f""" + create table metadata as ( + select + * + from + st_read('{path}', + layer='metadata' + ) + )""") + self.sheets = tuple( + self.db.query("select Field2 from metadata where Field1 = 'Sheets'") + .fetchall()[0][0] + .split(",") + ) + self.path = path + + return self + + def dump_local(self, path): + # TODO: Port to fsspec and have a single dump file + self.db.query(f"copy (select * from metadata) to '{path}/metadata.csv'") + + for sheet in self.sheets: + self.db.query(f""" + copy + ( + select + * + from + st_read + ( + '{self.path}', + layer = '{sheet}' + ) + ) + to '{path}/{sheet}.csv' + """) + return self + + def dump_gcs(self, bucketname, sid): + self.db.sql(f""" + copy + data + to + 'gcs://{bucketname}/{sid}/data.csv'; + """) + return self + + def query(self, sql): + return self.db.query(sql) diff --git a/src/hellocomputer/routers/files.py b/src/hellocomputer/routers/files.py index 39999e7..cdc7327 100644 --- a/src/hellocomputer/routers/files.py +++ b/src/hellocomputer/routers/files.py @@ -1,11 +1,10 @@ import aiofiles -import duckdb -import polars as pl import s3fs from fastapi import APIRouter, File, UploadFile from fastapi.responses import JSONResponse from ..config import settings +from ..analytics import DDB router = APIRouter() @@ -28,57 +27,14 @@ async def upload_file(file: UploadFile = File(...), sid: str = ""): gcs.makedir(f"{settings.gcs_bucketname}/{sid}") - db = duckdb.connect() - db.install_extension("spatial") - db.install_extension("httpfs") - db.load_extension("httpfs") - db.load_extension("spatial") - - db.sql(f""" - CREATE SECRET ( - TYPE GCS, - KEY_ID '{settings.gcs_access}', - SECRET '{settings.gcs_secret}') - """) - - db.sql(f""" - create table metadata as ( - select - * - from - st_read('{f.name}', - layer='metadata', - open_options=['HEADERS_FORCE', 'FIELD_TYPES=auto'] - ) - )""") - - metadata = db.query("select * from metadata").pl() - sheets = metadata.select(pl.col("Key") == "Sheets") - print(sheets) - - for sheet in sheets.to_dict(): - print(sheet) - - db.sql( - f""" - create table data as ( - select - * - from - st_read('{f.name}', - layer='data', - open_options=['HEADERS_FORCE', 'FIELD_TYPES=auto'] - ) - )""" + ( + DDB() + .gcs_secret(settings.gcs_secret, settings.gcs_secret) + .load_metadata(f.name) + .load_data() + .save_gcs(settings.gcs_bucketname, sid) ) - db.sql(f""" - copy - data - to - 'gcs://{settings.gcs_bucketname}/{sid}/data.csv'; - """) - return JSONResponse( content={"message": "File uploaded successfully"}, status_code=200 ) diff --git a/test/data/TestExcelHelloComputer.xlsx b/test/data/TestExcelHelloComputer.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..87c760f4204351b60b797befaae005f59ae10190 GIT binary patch literal 10809 zcmeHtWmH_-(rx4J!JPm>g9Zs6Bsc_Vf(LgAF2SAP?!kk*yA!PO;O+@&5*!+NopbJY zlfymV8}HZqbIOIhungG%Sm^KnIsS`Bpf6#>wuc>C@*?pXE4ESH z@XafAd_PIb5rX1VI2}(a0`DOsh8wRZh$SVmV#xWnhICoY7jB!W8*VlV)&8N2t!Zba z6wF;Rm38TJhhEn@tJSda_Oe7r*`}nTJQTY67$4ICuaz5m)1EN~d4gW?yg>j8`{re) zMgwU*A{{9Lbwc^Si&eUg6u6@3FsYVyOZC1?2`iB4pH36Dlc6Cp?(gNPIx9&JXY0X5)F-_=*xKxcNIJjH z511M^)$%`fuqcMb4@Q1r02fnMe5|FJvyuhz^}&?NO1=vcTRbt@z~e`}?~C1Eg&7|| zk&88nA|sm~0O(y`m`xgiB15xRRq`?0`MQ+8iIX|`JefqwSB$dvz&|fvG)?vkq@u@I z8(VZf=p?s5&Cqo*-;02CE@{*;JU+LEl*@LWOvL4+l{Gi%jwKI!s`pGF^kJ z5Dh9*eFqb3N4BRw&;Q8tzgUyMJ$hN3f*c$NcJPt^J@sh2z!DS9lr@{=zJl>B3`gqi z8{X8I{Fd7bDKV#<1`3aiO4ls+)HBnXJhwb-PZ!)INfiGK)9F@VkKAoj6B16mgH+_9 zbdMj6b;;M#Gp@SbFebBv$Ogc+x|yE zFwg)8mDhjw(H1vm*~g9^coftTFxmb#?oN65xhG>#d&3eEdo^x|g0)exh&v=L0nAgC zpFELA%YDKzY6{^$-6VhPyvoQ>Lk8cnm=3B()w@vAxS$HH=W4mY#E+1!Du2qxGd^#n zsASwKN>j|I@+P%6AVE!=jN3#wTP}HEP|T6Uh&iAFt3^_C_bM}@^I3M9MR?{bY}Z|q zK5z7)D?6JtZr#Y}4sX(lAH--wVQBH$QLHuo7B_kXny?7&SvflK8@+>Yni9fq$uwv` z$~D8Q-f)c?ayDA^#LVK1p$)1DKQG|YW}|bgWkQek`f^>(%snP@?f|_1g7$aJq4kT$ z&i%<66aWAVN(m@)e&z*LY#7z||hGQ)HH+1`f+1sg$*g}0Lf^!&Yr-HR_0+I0JO=dMOk zCV4OB%qZ4Iw%#VKc0^s_2@ zcQ^;6PJ1TG;x;yW0*=8Fn573c0pS{%F$xe2tf z$Z_Rf!S)A8O8~(;rn6b5+HgVGqd&-i!Y?MH|ShVS06!& zO!(Wqjc>fgMwbBv~gaw%DLr@uA5u%9_OP8>j6 z^f?(Jb0#?7+q@^V90MHv2dhf#qWfnpDPmu3w5au*7c zLoXv5ZAj@PW~SJZEnKkQi4JAAc!z?xln0)>CNArdja%=d*(`TFEVEVWRf;bnVAY8h z-_MiDFnm<=n9lG^C8dfO8uqM6LfJg2>l5|%V7DaFzy|!LC>w>9QBb=~*3N;l)~!~U z_u$5JZYA!HO74+x*=NahZh?)ArTRqyTL%=-58vAF52_@$-o5AJstO$gWHfnd~%!R|uy zx}knos966q5Hu5X8W%&?O&W**0LGtkc64&LHgWtJ0P@vstd`iZ?}X=lu&=?xE?X=p z4RX^n%KO9Ft>r4h5$~nn& z1Bp1)zl4{n2X~6#cf&vaT2xXr5pLo($kH(^;v-wWgmT_wI!f)U%+qVNmqtw8pu;8e zEyy;-KU0fThMZ+Ae13pkOVdXh6*pufMS2(?jAL4JLy0Xjc2bBU%S!s?G7b&wLQWu6 zld)2sz~jW|Ye^bu9=2$1Z8(!A9tVQZM{gHQ0_VW6_F)xPdNqwsAB|{1O@AuXjP&fXk>G0`9pxUL1M@$~fPJ(>s%X@h58MAqt$QdbusTFnN;KA*5d-j}| z_Jyr8v}h5Duhk;x?-#GhD6x_1#-%BzAG3J`Q{JTG+k2g3n>TOtygfte@zt#VND!z8 zv$v){mtjP(FVSz=f&)HiO5KRRO;msPc*|&pw_~;<;M4b^jtf~yqYnY~;MJZpf3`)U ztPTCf6SnDo8yuu~dtnr*WEPadp?5g6dFfY%ujG6A+OC9BU@22zRwjm3-E_H_->9T` zRYPzol{(O|2!w9IQLx{rFDwKxE8`i^!$;UrrHr1V&~J^pTDUSYeE75aAL^gIHI{PV)J%1$D79r_o0FgQfV9E;q7;#afYsJTr)>Pcf0L_ zgS^p>7xASjIcPMrJ2G=<;7bacp@?ydnnMpI`YqWdag`0@C}Jh)Ic>ZHFNdS8Xz(T+ zeryU=jM6p?*dtcuK&+n0j6Gt9JSz^~TiYJT2%*tXGZXC{iiZ&p!>dy>l0!i-VV!73 zSc_#pB0CItu(HHJ$lC)hwzvBp4cGX?9j>WcZ4g}^-E)9?LjPj0!}>wV})xZyWe3E zCaWT!3ibOxvD7c?za%C!0+>_vhSN)ztJZXJg>LGVG)agvsRqwc$iD5NLx&h(i|)_f z+C%JEqOg#k%2?@jNr2HUgr!073hHy70Ihc1?H%5=Hn+R8_j~v5Kl%yTvhLByCr$D> z)svQ~6_ZSwDevwE?{Cjs_dDWhfOV!rho%G^ni6V$o%lAIqr-ko zs4Hw|eUd5YR92S>38&gTi!+0vjTYsV^!>m%%3Hw|vji;{p+c^1I(>#yVo5LgO#0M` zVtnbK3F^phHY-9g`+Ae~t&c_Hsb7mjwtJ*{YPv?nykDf43_#op>(@q$!ZwjrHAQ<-es4?%Q>OaA*(ms4#edr`!Yi*&c`7yu;q$ zb9N|{Q5*1P8mE@vc)l}?AhLUoI(e~*ypnp5KRe5cF8^Ur$s<;}!ivB`=5X#jG~F!fu6&v4d* zB^^qkft)gP&srIeLuHwb$3Mj#Z8)}dB)NU&3#*?0cm>~P8w6JCGIUaQ>oeUmSw{&v z#@g(+NO+0sl;H1Kr}Wh#u)JMvP0qEgB50?RSJSBK>4z03U4QiEYINFj-i9%FE-QbU zH}omFk2P8la7J!}LRU2+=Z6^^s|am25E&xMJf0K|eGUJB#{%DZ<8Ju@3wYLq-znE# z%T`IISL$%S;T=>zhFuw_3cEC9yV=5OV=G5`bl*|8(3oV0mbT=-+mQXX-QO$5zp%(4 zyNyvxsPG5g(KiSSgOQo#*r!H-t z)UNFEE67#hX_l=tf@xpL37M}PeA6s?~3 zd!1wt9~~NVdf<;l#_z`PcD?RSNf59IzCG`m^GEw)Nu_kdS`t)}^7D}I^@=oe4sdeA zss42PxS!pS--E=tGF;jbP9rxs6{+3wy)SyKQ9{_b=xXrqd5Y=qRWPa}INzt2#a}%s z3`g!WUp_lMvFr7tqYpFQeqQC|Q@s76ecjt)yX$L1@{pmJY02TmRA_^f3mT)a905o_ z@^UGlo>t$lbv*ZqF0CffErPm_q+Ha~RgsR25D(@H=!>$Y6Hf`-y4 zNs4bxWq3g_ARWU*ZgAPgM!99=12KAwRnLLSQPok~w+`>h&q>VZgNoG_S{9Oa>0s;Y z4VNq2QbWS}*Cx#n570O3ICA(B$8@)g_D+fNl+oNm;-g)I%(|0T>5HCa$g-_Uj?pU( z>4DR@cCBXt%V$Ni0)2&5+5V8Y}|eOi`9or#OZ12O3p{_4ktH zO~dFJ6la{vZB0L8j$|$78yGP*E!NZ24IuP!};jt-!5K%_)_7gKo_ru(6S)WpMm~oF6wA*V&de;_Q%^F(LN)7C~Aue zTK#!^CO`$oT&>Z3tA*#*7uZ2eg*k@tgn zKw~VBoRw@^grRH~vbJ0;<3Z&Iw#f)w%BJGx8bnzSSS8A2EPVo`uMHgt$nYow$#obx z%tR--2xGUOT;AA!tX4enRc$oraP0cJ5d&n2+(MbwA@QJp1f~eKAz@q}3eQBu17RbD zBbBW+)|?Qg5nQZuyDsSxqLd0qloiBnaEmfd=DMvAeZADm0UO0RibY{+hYPjl6Mwej zv|!9QYe0ESTkAO1c)Vu~YHEtJJzEQsaoBys20?oFvfu+VBeA~bfdC7H*4iQHY;Gq! zvrO*VwQcyMMAmGQWqi~8`FN>0PpZ(%Ql|`4i>$usy&obiIy1QXO&XG9+;q2TkJP0W z95^ob8Uq9FvEx^xtgISu2bc@PV%Np|_9MmkFg=Szi{mN^_xBt9!l+>v5-&MACiD^E zd|Zvkgw%kyvCif{KCZtk&e;&a_nzS`H>*)$kx)H&5u7@kUxG9%WMMI=6ALRY|zFIQpGRe2w;dLnC` zh<+N8aA+MTCQI&T=||Hq>YbVy770B?dUL9QP&7f<>L6lG5c1QHSWYm@#8R|;a76@Q zbX9IMUove|Gv8f6C_|4X0}#$j!3h~K+f&XXUb{$CI*a|vsq_p7=V#UH-?t_$uXLus zB0E0H4E!Xcz+A1)Il-l&!Z0j9?prN)jMkM~D2pm{MMrnl%&sS8X((%n6KYnLACvS! zA9ibhX<m{!iG4Wo#Rca({8?C1)t3AN|Q@M{@%Q6Ju2;2Mb%XKQb~W zm2SmecJ%hgKd^wHyaN|e-$gKBGm3~3Tgx>vfu7D~?%UB@ZFNJ^c!wXJ-s2#-ns3$4 z_Fd9EmpbL~YB1NFYM+lB&+0vW+*CxZ-`OcZRjBuxpt_l60AU~Zc0cy+ME^VUeNfZ!+Os6zu}1Oc6tm`>1dJCF zU&!$SJp>2fC=}AW6pI$6P&k7(bPT+LUdA9lX@}>sk!Hrxz*9ro>!GToeLmNCr(-LK z!LSH0A4Vj&HRhw{HcRdz(MkYMp1|v(BWZ64QZIV-a@sj*iFwr8g^D$#K%VAhPKtzC zUGem@{V|sh2mM9phy8NsfD8ZcE`;-E!jKU!7oEmQ1*aYD{PSc{N|2p)a{4;R5)4JbBjfPsrVRN^PpnmR@VLK zqjajgKKE6iPAUPa2jWZ0BuWvMr~$AJ+&T)bmd3;Yr`VZ;3!DyEO%msT(iY<#pv?tRA0}Xo|{^$Fr=|?8&6CyGhWj*E;h~o>~Xq$Vq z7&T`e`1$GX=Zu*zH|XErPwnK!&2Mxt8Rk6QXBwt78cup|;O#BT*TT#5qwNL}H4teg zl;L2omfLjB?wy!Ms06R|ewdD%yYCa_lI-ZFrCSQ_2?-b%bvG;h8J=EIpX&T=%&drO zI_9-(T8a!*%MOjZQhsm5W*b2dv)67S{O-}-Cr8`>uOr^pYTlez@C_e<;mXp#R8DX;Z&9E*tYAxrvT&x=MD+HX0@R9~w(=Hsdp z>T*syH4d2bC*Hr0eqnez#xC!CUvQKgPoit3fbpFiHkPwz_;cIm^5zbx z8U|?C57NKHJ3C?yU1e zw~YLi3EtC5w~E5zK5;}qXd--TcMtO~auuKkd5h9C~9DJvL&Py)CaRpz$^ zgLHVq*ApBP4{is(7pijJ_#B(?zH{-`x%^R7a_LdAxn_MYlPX`Z%+$H7x1x+I!GM17 zrlGe&5IqyP6*K=+vyEct$s2&{zXIj~#<@-(EBTLh%D$+R))gWZa)|uM{wtq=qsen3(DwzWx3M4r&We9 zaV}=SLZh4pA!NsM&~Fezt=G#(qZwMC;Vb? z@on?{a42=cM4xZabDCl+$d7=9{CNYz=!7uI>=(16E3#G&&l*J0ERYV=IO@CN3frU9 zcgR$eFZ5a13FOu!9qheI``t z;Y6k?b=Qb)UtIHqimVHDR{R+IH2`TLC1bPjvEsRz{J7du`>1+i-P@=%5EoABQ^{vTzKhbRvd&R;0DB!51^!}RkZ!o%$87s3|l-`>X` zxzt0Hzb;UJVE_OF6ac`l$o=o->O+9P2K(OuTB&~r_*dwEX#Q8n{JXg#?QiD)a?{Fk U2vCv%0O-&c5|pQ8Pkw&;KX)E@asU7T literal 0 HcmV?d00001 diff --git a/test/output/.gitignore b/test/output/.gitignore new file mode 100644 index 0000000..16f2dc5 --- /dev/null +++ b/test/output/.gitignore @@ -0,0 +1 @@ +*.csv \ No newline at end of file diff --git a/test/test_load.py b/test/test_load.py new file mode 100644 index 0000000..b4d7a27 --- /dev/null +++ b/test/test_load.py @@ -0,0 +1,17 @@ +import hellocomputer +from hellocomputer.analytics import DDB +from pathlib import Path + +TEST_DATA_FOLDER = Path(hellocomputer.__file__).parents[2] / "test" / "data" +TEST_OUTPUT_FOLDER = Path(hellocomputer.__file__).parents[2] / "test" / "output" + + +def test_load_data(): + db = ( + DDB() + .load_metadata(TEST_DATA_FOLDER / "TestExcelHelloComputer.xlsx") + .dump_local(TEST_OUTPUT_FOLDER) + ) + + assert db.sheets == ("answers",) + assert (TEST_OUTPUT_FOLDER / "answers.csv").exists()