From 1a2bb27141e2eab28ecec6b468ec882a98b12ff3 Mon Sep 17 00:00:00 2001 From: Guillem Borrell Date: Sun, 27 Nov 2022 23:35:09 +0000 Subject: [PATCH] Added parquet functionality. Version bump --- Cargo.toml | 2 +- README.md | 31 ++++++++++++++++++++++++++++++- src/io.rs | 27 +++++++++++++++++++++++++++ src/main.rs | 12 ++++++++++++ 4 files changed, 70 insertions(+), 2 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 0488768..f62134e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "dr" description = "Command-line data file processing in Rust" -version = "0.2.1" +version = "0.3.0" edition = "2021" include = [ "**/*.rs", diff --git a/README.md b/README.md index df0936e..e70f8d9 100644 --- a/README.md +++ b/README.md @@ -53,6 +53,7 @@ Commands: sql Runs a sql statement on the file print Pretty prints the table rpq Read parquet file + wpq Write to a parquet file help Print this message or the help of the given subcommand(s) Options: @@ -82,13 +83,41 @@ shape: (4, 14) Note that when `dr` loads csv data also tries to guess the data type of each field. +### Parquet + +`dr` is also useful to translate your csv files to parquet with a single command: + +```bash +$ cat wine.csv | dr wpq wine.pq +``` + +Or explore parquet files + +```bash +$ dr rpq wine.pq | head -n 5 | dr print +shape: (4, 14) +┌──────┬─────────┬────────────┬──────┬─────┬───────────┬──────┬──────┬─────────┐ +│ Wine ┆ Alcohol ┆ Malic.acid ┆ Ash ┆ ... ┆ Color.int ┆ Hue ┆ OD ┆ Proline │ +│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │ +│ i64 ┆ f64 ┆ f64 ┆ f64 ┆ ┆ f64 ┆ f64 ┆ f64 ┆ i64 │ +╞══════╪═════════╪════════════╪══════╪═════╪═══════════╪══════╪══════╪═════════╡ +│ 1 ┆ 14.23 ┆ 1.71 ┆ 2.43 ┆ ... ┆ 5.64 ┆ 1.04 ┆ 3.92 ┆ 1065 │ +├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤ +│ 1 ┆ 13.2 ┆ 1.78 ┆ 2.14 ┆ ... 
┆ 4.38 ┆ 1.05 ┆ 3.4 ┆ 1050 │ +├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤ +│ 1 ┆ 13.16 ┆ 2.36 ┆ 2.67 ┆ ... ┆ 5.68 ┆ 1.03 ┆ 3.17 ┆ 1185 │ +├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤ +│ 1 ┆ 14.37 ┆ 1.95 ┆ 2.5 ┆ ... ┆ 7.8 ┆ 0.86 ┆ 3.45 ┆ 1480 │ +└──────┴─────────┴────────────┴──────┴─────┴───────────┴──────┴──────┴─────────┘ +``` + ## Performance `dr` is implemented in Rust with the goal of achieving the highest possible performance. Take for instance a simple read, groupby, and aggregate operation with ~30MB of data: ```bash -$ time cat data/walmart_train.csv | ./target/release/dr sql "select Dept, avg("Weekly_Sales") from this group by Dept" | ./target/release/dr print +$ time cat data/walmart_train.csv | dr sql "select Dept, avg("Weekly_Sales") from this group by Dept" | dr print shape: (81, 2) ┌──────┬──────────────┐ │ Dept ┆ Weekly_Sales │ diff --git a/src/io.rs b/src/io.rs index 359a786..df4972a 100644 --- a/src/io.rs +++ b/src/io.rs @@ -39,3 +39,30 @@ pub fn read_parquet(path: String) -> DataFrame { }; df } + +/// Write a Polars DataFrame to Parquet +pub fn write_parquet( + mut df: DataFrame, + path: String, + compression: String, + statistics: bool, + chunksize: Option<usize>, +) { + // Selected compression not implemented yet + let mut _file = match fs::File::create(path) { + Ok(mut file) => { + let mut w = ParquetWriter::new(&mut file); + if statistics { + w = w.with_statistics(statistics); + } + if chunksize.unwrap_or(0) > 0 { + w = w.with_row_group_size(chunksize); + } + let _r = match w.finish(&mut df) { + Ok(_r) => (), + Err(e) => eprintln!("{e}"), + }; + } + Err(e) => eprintln!("{e}"), + }; +} diff --git a/src/main.rs b/src/main.rs index 753d10a..4b43185 100644 --- a/src/main.rs +++ b/src/main.rs @@ -16,6 +16,11 @@ fn main() { .about("Read parquet file") .arg(arg!([path] "Path to the parquet file")), ) + .subcommand( + Command::new("wpq") + .about("Write to a parquet file") +
.arg(arg!([path] "Path to the new parquet file")), + ) .get_matches(); if let Some(matches) = matches.subcommand_matches("sql") { @@ -40,6 +45,13 @@ fn main() { } else { eprintln!("File not found") } + } else if let Some(matches) = matches.subcommand_matches("wpq") { + if let Some(path) = matches.get_one::<String>("path") { + let df = io::load_csv_from_stdin(); + io::write_parquet(df, path.to_string(), "lz4raw".to_string(), true, Some(0)); + } else { + eprintln!("Could not write to parquet"); + } } else { println!("No command provided. Please execute dr --help") }