diff --git a/.gitignore b/.gitignore index 193d30e..dabe183 100644 --- a/.gitignore +++ b/.gitignore @@ -19,3 +19,8 @@ Cargo.lock # Added by cargo /target + +.vscode +.ipynb_checkpoints + +/data diff --git a/.woodpecker.yml b/.woodpecker.yml new file mode 100644 index 0000000..cca7a5a --- /dev/null +++ b/.woodpecker.yml @@ -0,0 +1,5 @@ +pipeline: + build: + image: rust:1-buster + commands: + - cargo install --path . \ No newline at end of file diff --git a/Cargo.toml b/Cargo.toml index f622994..f89d5e4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "dr" description = "Command-line data file processing in Rust" -version = "0.1.0" +version = "0.2.0" edition = "2021" include = [ "**/*.rs", diff --git a/README.md b/README.md index 2071188..526cc13 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # dr.rs -A set of data files (mostly csv and parquet) processing utilities inspired by [csvkit](https://github.com/wireservice/csvkit) with blazing speed, powered by Rust. +A toolkit to process data files (csv and parquet) using the command line, inspired by [csvkit](https://github.com/wireservice/csvkit), with blazing speed, and powered by Rust. You may wonder why I'm implementing this, since there's already [xsv](https://github.com/BurntSushi/xsv). 
There are two reasons for that: @@ -37,6 +37,10 @@ shape: (3, 2) └──────┴───────────┘ ``` +## Performance + + + ## Built standing on the shoulders of giants None of this would be possible without [Polars](https://github.com/pola-rs/polars) \ No newline at end of file diff --git a/python/group.py b/python/group.py new file mode 100755 index 0000000..64aeaa5 --- /dev/null +++ b/python/group.py @@ -0,0 +1,7 @@ +#!/usr/bin/env python3 + +import sys +import pandas as pd + +df = pd.read_csv(sys.stdin) +print(df.groupby("Dept", as_index=False).Weekly_Sales.mean()) \ No newline at end of file diff --git a/queries/weekly_sales_by_dept.sql b/queries/weekly_sales_by_dept.sql new file mode 100644 index 0000000..0814f8c --- /dev/null +++ b/queries/weekly_sales_by_dept.sql @@ -0,0 +1,8 @@ +select + Dept, + avg(Weekly_Sales) +from + this +group by + Dept + \ No newline at end of file diff --git a/src/io.rs b/src/io.rs index 1c8fb5f..359a786 100644 --- a/src/io.rs +++ b/src/io.rs @@ -1,5 +1,6 @@ use polars::frame::DataFrame; use polars::prelude::*; +use std::fs; use std::io; use std::io::Read; @@ -26,15 +27,10 @@ pub fn dump_csv_to_stdout(df: &mut DataFrame) { }; } -/// Read parquet format from stdin and return a Polars DataFrame -pub fn load_parquet_from_stdin() -> DataFrame { - let mut buffer: String = String::new(); - let _res: () = match io::stdin().read_to_string(&mut buffer) { - Ok(_ok) => (), - Err(_e) => (), - }; - let cursor = io::Cursor::new(buffer.as_bytes()); - let df = match ParquetReader::new(cursor).finish() { +/// Read parquet and return a Polars DataFrame +pub fn read_parquet(path: String) -> DataFrame { + let file = fs::File::open(path).expect("Could not open file"); + let df = match ParquetReader::new(file).finish() { Ok(df) => df, Err(e) => { eprintln!("{e}"); diff --git a/src/main.rs b/src/main.rs index de6ee3b..753d10a 100644 --- a/src/main.rs +++ b/src/main.rs @@ -11,7 +11,11 @@ fn main() { .arg(arg!(-d --delimiter "Column delimiter").required(false)), ) 
.subcommand(Command::new("print").about("Pretty prints the table")) - .subcommand(Command::new("rpq").about("Read parquet file")) + .subcommand( + Command::new("rpq") + .about("Read parquet file") + .arg(arg!([path] "Path to the parquet file")), + ) .get_matches(); if let Some(matches) = matches.subcommand_matches("sql") { @@ -26,15 +30,17 @@ fn main() { let mut df = io::load_csv_from_stdin(); io::dump_csv_to_stdout(&mut df); } - } - - if let Some(_matches) = matches.subcommand_matches("print") { + } else if let Some(_matches) = matches.subcommand_matches("print") { let df = io::load_csv_from_stdin(); println!("{}", df) - } - - if let Some(_matches) = matches.subcommand_matches("rpq") { - let mut df = io::load_parquet_from_stdin(); - io::dump_csv_to_stdout(&mut df); + } else if let Some(matches) = matches.subcommand_matches("rpq") { + if let Some(path) = matches.get_one::<String>("path") { + let mut df = io::read_parquet(path.to_string()); + io::dump_csv_to_stdout(&mut df); + } else { + eprintln!("File not found") + } + } else { + println!("No command provided. Please execute dr --help") } }