Implemented functionalities for 0.4
Some checks failed
ci/woodpecker/push/woodpecker Pipeline was successful
ci/woodpecker/tag/woodpecker Pipeline failed

This commit is contained in:
Guillem Borrell 2022-12-06 14:16:39 +00:00
parent 978afec7f4
commit 79a8a82bea
6 changed files with 284 additions and 196 deletions

View file

@ -1,7 +1,7 @@
[package] [package]
name = "dr" name = "dr"
description = "Command-line data file processing in Rust" description = "Command-line data file processing in Rust"
version = "0.3.1" version = "0.4"
edition = "2021" edition = "2021"
include = [ include = [
"**/*.rs", "**/*.rs",
@ -14,7 +14,4 @@ repository = "https://git.guillemborrell.es/guillem/dr"
[dependencies] [dependencies]
clap = {version = "4.0", features = ["cargo"]} clap = {version = "4.0", features = ["cargo"]}
polars = "0.25" polars = {"version" = "0.25", features = ["sql", "lazy", "parquet", "decompress", "ipc", "ipc_streaming", "docs-selection"]}
polars-sql = "0.2.1"
polars-lazy = "0.25"
polars-io = {"version" = "0.25", features = ["parquet"]}

201
README.md
View file

@ -9,51 +9,21 @@ You may wonder why I'm implementing this, since there's already [xsv](https://gi
1. This what I'm implementing to learn Rust 1. This what I'm implementing to learn Rust
2. The Rust data ecosystem has evolved immensely since xsv was sarted. Now we can add things like SQL commands to filter csv files, or translate results to parquet files. 2. The Rust data ecosystem has evolved immensely since xsv was sarted. Now we can add things like SQL commands to filter csv files, or translate results to parquet files.
## Example
```bash ## Help
$ head wine.csv
Wine,Alcohol,Malic.acid,Ash,Acl,Mg,Phenols,Flavanoids,Nonflavanoid.phenols,Proanth,Color.int,Hue,OD,Proline
1,14.23,1.71,2.43,15.6,127,2.8,3.06,.28,2.29,5.64,1.04,3.92,1065
1,13.2,1.78,2.14,11.2,100,2.65,2.76,.26,1.28,4.38,1.05,3.4,1050
1,13.16,2.36,2.67,18.6,101,2.8,3.24,.3,2.81,5.68,1.03,3.17,1185
1,14.37,1.95,2.5,16.8,113,3.85,3.49,.24,2.18,7.8,.86,3.45,1480
1,13.24,2.59,2.87,21,118,2.8,2.69,.39,1.82,4.32,1.04,2.93,735
1,14.2,1.76,2.45,15.2,112,3.27,3.39,.34,1.97,6.75,1.05,2.85,1450
1,14.39,1.87,2.45,14.6,96,2.5,2.52,.3,1.98,5.25,1.02,3.58,1290
1,14.06,2.15,2.61,17.6,121,2.6,2.51,.31,1.25,5.05,1.06,3.58,1295
1,14.83,1.64,2.17,14,97,2.8,2.98,.29,1.98,5.2,1.08,2.85,1045
$ cat wine.csv | dr sql "select Wine, avg(Alcohol) from this group by Wine" | dr print
shape: (3, 2)
┌──────┬───────────┐
│ Wine ┆ Alcohol │
│ --- ┆ --- │
│ i64 ┆ f64 │
╞══════╪═══════════╡
│ 3 ┆ 13.15375 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 1 ┆ 13.744746 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 2 ┆ 12.278732 │
└──────┴───────────┘
``` ```
## Howto
The `dr` command offers a set of subcommands, each one of them with a different functionality. You can get the available subcommands with:
```bash
$ dr --help $ dr --help
Command-line data file processing in Rust Command-line data file processing in Rust
Usage: dr [COMMAND] Usage: dr [COMMAND]
Commands: Commands:
csv Read csv, output arrow stream
sql Runs a sql statement on the file sql Runs a sql statement on the file
print Pretty prints the table print Pretty prints the table
rpq Read parquet file rpq Read parquet file
wpq Write to a parquet file wpq Write to a paquet file
help Print this message or the help of the given subcommand(s) help Print this message or the help of the given subcommand(s)
Options: Options:
@ -61,11 +31,21 @@ Options:
-V, --version Print version information -V, --version Print version information
``` ```
Subcommands can be pipelined unless reading from a file, writing to a file, or pretty prints data. What goes through the pipeline is a plain-text comma separated values with a header. While this may not be the best choice in terms of performance, allows `dr` subcommands to be combined with the usual unix-style command-line tools like `cat`, `head`, `grep`, `awk` and `sed`: ## Howto
```bash `dr` is convenience command to explore, transform, and analyze csv and parquet files to save you from writing throwaway python scripts or create a custom container image for verys simple tasks. It's designed to make the life of a data engineer a little easier.
$ cat wine.csv | head -n 5 | dr print
shape: (4, 14) Assume you have a very large csv file, and you just want to translate it to parquet with some type inference and sane defaults. With `dr` this is as easy as:
```
$ dr csv wine.csv -P wine.pq
```
Parquet files are binary, and you may want to check that you've not written nonsense by printing the header on your terminal.
```
$ dr rpq wine.pq -a
shape: (5, 14)
┌──────┬─────────┬────────────┬──────┬─────┬───────────┬──────┬──────┬─────────┐ ┌──────┬─────────┬────────────┬──────┬─────┬───────────┬──────┬──────┬─────────┐
│ Wine ┆ Alcohol ┆ Malic.acid ┆ Ash ┆ ... ┆ Color.int ┆ Hue ┆ OD ┆ Proline │ │ Wine ┆ Alcohol ┆ Malic.acid ┆ Ash ┆ ... ┆ Color.int ┆ Hue ┆ OD ┆ Proline │
│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │ │ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │
@ -78,112 +58,91 @@ shape: (4, 14)
│ 1 ┆ 13.16 ┆ 2.36 ┆ 2.67 ┆ ... ┆ 5.68 ┆ 1.03 ┆ 3.17 ┆ 1185 │ │ 1 ┆ 13.16 ┆ 2.36 ┆ 2.67 ┆ ... ┆ 5.68 ┆ 1.03 ┆ 3.17 ┆ 1185 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤ ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ 1 ┆ 14.37 ┆ 1.95 ┆ 2.5 ┆ ... ┆ 7.8 ┆ 0.86 ┆ 3.45 ┆ 1480 │ │ 1 ┆ 14.37 ┆ 1.95 ┆ 2.5 ┆ ... ┆ 7.8 ┆ 0.86 ┆ 3.45 ┆ 1480 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ 1 ┆ 13.24 ┆ 2.59 ┆ 2.87 ┆ ... ┆ 4.32 ┆ 1.04 ┆ 2.93 ┆ 735 │
└──────┴─────────┴────────────┴──────┴─────┴───────────┴──────┴──────┴─────────┘ └──────┴─────────┴────────────┴──────┴─────┴───────────┴──────┴──────┴─────────┘
``` ```
Note that when `dr` loads csv data also tries to guess the data type of each field. Maybe the most interesing feature of `dr` is the ability to process csv and parquet files using SQL, while solutions like `xsv` and `csvkit` rely on a rich set of subcommands and options. If you already know SQL, there's no need to read any more documentation to select, filter, or group data. The only thing you need to remember is that the table will be called `this`. The following command outputs a csv of the wine with the highest concentration of alcohol in the popular wine dataset:
### Parquet
`dr` is also useful to translate your csv files to parquet with a single command:
```bash
$ cat wine.csv | dr wpq wine.pq
``` ```
dr csv wine.csv -q "select * from this where Alcohol = max(Alcohol)" | dr print
Or explore parquet files shape: (1, 14)
```bash
$ dr rpq wine.pq | head -n 5 | dr print
shape: (4, 14)
┌──────┬─────────┬────────────┬──────┬─────┬───────────┬──────┬──────┬─────────┐ ┌──────┬─────────┬────────────┬──────┬─────┬───────────┬──────┬──────┬─────────┐
│ Wine ┆ Alcohol ┆ Malic.acid ┆ Ash ┆ ... ┆ Color.int ┆ Hue ┆ OD ┆ Proline │ │ Wine ┆ Alcohol ┆ Malic.acid ┆ Ash ┆ ... ┆ Color.int ┆ Hue ┆ OD ┆ Proline │
│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │ │ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ f64 ┆ f64 ┆ f64 ┆ ┆ f64 ┆ f64 ┆ f64 ┆ i64 │ │ i64 ┆ f64 ┆ f64 ┆ f64 ┆ ┆ f64 ┆ f64 ┆ f64 ┆ i64 │
╞══════╪═════════╪════════════╪══════╪═════╪═══════════╪══════╪══════╪═════════╡ ╞══════╪═════════╪════════════╪══════╪═════╪═══════════╪══════╪══════╪═════════╡
│ 1 ┆ 14.23 ┆ 1.71 ┆ 2.43 ┆ ... ┆ 5.64 ┆ 1.04 ┆ 3.92 ┆ 1065 │ │ 1 ┆ 14.83 ┆ 1.64 ┆ 2.17 ┆ ... ┆ 5.2 ┆ 1.08 ┆ 2.85 ┆ 1045 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ 1 ┆ 13.2 ┆ 1.78 ┆ 2.14 ┆ ... ┆ 4.38 ┆ 1.05 ┆ 3.4 ┆ 1050 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ 1 ┆ 13.16 ┆ 2.36 ┆ 2.67 ┆ ... ┆ 5.68 ┆ 1.03 ┆ 3.17 ┆ 1185 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ 1 ┆ 14.37 ┆ 1.95 ┆ 2.5 ┆ ... ┆ 7.8 ┆ 0.86 ┆ 3.45 ┆ 1480 │
└──────┴─────────┴────────────┴──────┴─────┴───────────┴──────┴──────┴─────────┘ └──────┴─────────┴────────────┴──────┴─────┴───────────┴──────┴──────┴─────────┘
``` ```
If you don't use any option that formats the output of the results, `dr` outputs Arrow's IPC format, meaning that multiple `dr` calls can be efficiently chained with very low overhead. The following script loads one month of NY taxi data and executes two sql queries on the data.
```
$ dr rpq data/yellow_tripdata_2014-01.parquet \
-q "select count(1) as cnt, passenger_count from this group by passenger_count" \
| dr sql "select * from this order by cnt desc" \
| dr print
┌─────────┬─────────────────┐
│ cnt ┆ passenger_count │
│ --- ┆ --- │
│ u32 ┆ i64 │
╞═════════╪═════════════════╡
│ 9727321 ┆ 1 │
├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 1891588 ┆ 2 │
├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 789070 ┆ 5 │
├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 566248 ┆ 3 │
├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ ... ┆ ... │
├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 19 ┆ 208 │
├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 16 ┆ 9 │
├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 7 ┆ 7 │
├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 5 ┆ 8 │
└─────────┴─────────────────┘
```
## Reference
Some commands that generate raw output in ipc format.
* Read a csv or parquet file and print the header: `dr {csv, rpq} [file] -a`
* Read a csv or parquet file, execute a SQL statement, and output the results in stdout using Arrow's ipc format `dr {csv, rpq} [file] -q "statement"`
* Read a csv or parquet file and print a summary of each column: `dr {csv, rpq} [file] -s "[query]"`
* Read a csv or parquet file, execute a query, and output the results in stdout using the csv format `dr {csv, rpq} [file] -s "[query]" -t`
* Read a csv and write a parquet file with the same contents: `dr csv [file.csv] -P [file.pq]`
Some commands that convert raw input in ipc format
* Read from stdin in ipc and pretty print the table: `dr print`
* Read from stdin in csv and pretty print the table: `dr print -t`
* Read from stdin in ipc and write the data in parquet: `dr wpq [file.pq]`
## Performance ## Performance
`dr` is implemented in Rust with the goal of achieving the highest possible performance. Take for instance a simple read, groupby, and aggregate operation with ~30MB of data: This command runs two dr processes. The first one makes an aggregation on a compressed parquet file of 144MB of size, and the second one just orders the result:
```bash ```
$ time cat data/walmart_train.csv | dr sql "select Dept, avg("Weekly_Sales") from this group by Dept" | dr print $ dr rpq data/yellow_tripdata_2014-01.parquet \
shape: (81, 2) -q "select count(1) as cnt, passenger_count from this group by passenger_count" \
┌──────┬──────────────┐ | dr sql "select * from this order by cnt desc" \
│ Dept ┆ Weekly_Sales │ > /dev/null
│ --- ┆ --- │
│ i64 ┆ f64 │
╞══════╪══════════════╡
│ 30 ┆ 4118.197208 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 16 ┆ 14245.63827 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 56 ┆ 3833.706211 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 24 ┆ 6353.604562 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ ... ┆ ... │
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 31 ┆ 2339.440287 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 59 ┆ 694.463564 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 27 ┆ 1583.437727 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 77 ┆ 328.9618 │
└──────┴──────────────┘
real 0m0.089s
user 0m0.116s
sys 0m0.036s
``` ```
Let's compare that with the followint Python script that leverages Pandas to read the data, and compute the aggregation: On a very very old machine (Intel(R) Core(TM) i5-6500T CPU @ 2.50GHz), this takes around half a second, which is roughly the time needed to read and decompress the parquet file. Polar's csv and parquet readers have some decent performance, so you can count on `dr` to be one of the fastest in the block.
```python ## Caveats
#!/usr/bin/env python3
import sys 1. `dr` uses Polars to build and transform dataframes in Rust, and the entire table has to be loaded in memory. At the time when `dr` was created, streaming support didn't get along very well with SQL contexts.
import pandas as pd
df = pd.read_csv(sys.stdin)
print(df.groupby("Dept", sort=False, as_index=False).Weekly_Sales.mean())
```
```bash
$ time cat data/walmart_train.csv | ./python/group.py
Dept Weekly_Sales
0 1 19213.485088
1 2 43607.020113
2 3 11793.698516
3 4 25974.630238
4 5 21365.583515
.. ... ...
76 99 415.487065
77 39 11.123750
78 50 2658.897010
79 43 1.193333
80 65 45441.706224
[81 rows x 2 columns]
real 0m0.717s
user 0m0.627s
sys 0m0.282s
```
Note that there's roughly a 6x speedup. This considering that this operation in particular is heavily optimized in Pandas and most of the run time is spent in parsing and reading from stdin.
2. `dr` uses Polars' SQLContext to execute the query which supports a small subset of the SQL language.
## Built standing on the shoulders of giants ## Built standing on the shoulders of giants

View file

@ -1,7 +0,0 @@
#!/usr/bin/env python3
import sys
import pandas as pd
df = pd.read_csv(sys.stdin)
print(df.groupby("Dept", sort=False, as_index=False).Weekly_Sales.mean())

View file

@ -1,22 +1,53 @@
use polars::frame::DataFrame;
use polars::prelude::*; use polars::prelude::*;
use std::fs; use std::fs;
use std::io; use std::io;
use std::io::Read; use std::io::Read;
/// Read CSV format from stdin and return a Polars DataFrame /// Read CSV file
pub fn load_csv_from_stdin() -> DataFrame { pub fn read_csv(path: String) -> LazyFrame {
let mut buffer = String::new(); LazyCsvReader::new(path)
let _res: () = match io::stdin().read_to_string(&mut buffer) { .finish()
.expect("Could not load file")
}
/// Read parquet and return a Polars LazyFrame
pub fn read_parquet(path: String) -> LazyFrame {
LazyFrame::scan_parquet(path, ScanArgsParquet::default()).expect("Could not read parquet file")
}
/// Read IPC setream
pub fn read_ipc() -> LazyFrame {
let mut buffer = Vec::new();
let _res: () = match io::stdin().lock().read_to_end(&mut buffer) {
Ok(_ok) => (), Ok(_ok) => (),
Err(_e) => (), Err(_e) => (),
}; };
let cursor = io::Cursor::new(buffer.as_bytes()); let cursor = io::Cursor::new(buffer);
let df = match CsvReader::new(cursor).finish() { match IpcStreamReader::new(cursor).finish() {
Ok(df) => df, Ok(df) => df.lazy(),
Err(_e) => DataFrame::default(), Err(_e) => LazyFrame::default(),
}
}
/// Read CSV format from stdin and return a Polars DataFrame
pub fn load_csv_from_stdin() -> LazyFrame {
let mut buffer = Vec::new();
let _res: () = match io::stdin().lock().read_to_end(&mut buffer) {
Ok(_ok) => (),
Err(_e) => (),
}; };
df let cursor = io::Cursor::new(buffer);
match CsvReader::new(cursor).finish() {
Ok(df) => df.lazy(),
Err(_e) => LazyFrame::default(),
}
}
/// Write to IPC steram
pub fn write_ipc(df: LazyFrame) {
IpcStreamWriter::new(io::stdout().lock())
.finish(&mut df.collect().expect("Could not collect dataframe"))
.expect("Could not write to stream");
} }
/// Take a Polars Dataframe and write it as CSV to stdout /// Take a Polars Dataframe and write it as CSV to stdout
@ -27,19 +58,6 @@ pub fn dump_csv_to_stdout(df: &mut DataFrame) {
}; };
} }
/// Read parquet and return a Polars DataFrame
pub fn read_parquet(path: String) -> DataFrame {
let file = fs::File::open(path).expect("Could not open file");
let df = match ParquetReader::new(file).finish() {
Ok(df) => df,
Err(e) => {
eprintln!("{e}");
DataFrame::default()
}
};
df
}
/// Write a Polars DataFrame to Parquet /// Write a Polars DataFrame to Parquet
pub fn write_parquet( pub fn write_parquet(
mut df: DataFrame, mut df: DataFrame,

View file

@ -1,54 +1,179 @@
mod io; mod io;
mod sql; mod sql;
use clap::{arg, command, Command}; use clap::{arg, command, ArgAction, Command};
fn main() { fn main() {
let matches = command!() let matches = command!()
.subcommand(
Command::new("csv")
.about("Read csv, output arrow stream")
.arg(arg!([path] "Path to CSV file"))
.arg(arg!(-d --delimiter <String> "Column delimiter. Assume ,").required(false))
.arg(arg!(-q --query <String> "Execute query on the file").required(false))
.arg(
arg!(-s --summary ... "Summarize the data")
.required(false)
.action(ArgAction::SetTrue),
)
.arg(
arg!(-t --text ... "Output text instead of binary")
.required(false)
.action(ArgAction::SetTrue),
)
.arg(arg!(-P --parquet <String> "Write output as a parquet file").required(false))
.arg(
arg!(-a --head ... "Print the header of the table")
.required(false)
.action(ArgAction::SetTrue),
),
)
.subcommand( .subcommand(
Command::new("sql") Command::new("sql")
.about("Runs a sql statement on the file") .about("Runs a sql statement on the file")
.arg(arg!([statement] "SQL statement")) .arg(arg!([statement] "SQL statement"))
.arg(
arg!(-t --text ... "Input text instead of binary")
.required(false)
.action(ArgAction::SetTrue),
)
.arg(arg!(-d --delimiter <String> "Column delimiter").required(false)), .arg(arg!(-d --delimiter <String> "Column delimiter").required(false)),
) )
.subcommand(Command::new("print").about("Pretty prints the table")) .subcommand(
Command::new("print").about("Pretty prints the table").arg(
arg!(-t --text ... "Inputs csv instead of binary")
.required(false)
.action(ArgAction::SetTrue),
),
)
.subcommand( .subcommand(
Command::new("rpq") Command::new("rpq")
.about("Read parquet file") .about("Read parquet file")
.arg(arg!([path] "Path to the parquet file")), .arg(arg!([path] "Path to the parquet file"))
.arg(arg!(-q --query <String> "Execute query on the file").required(false))
.arg(
arg!(-s --summary ... "Summarize the data")
.required(false)
.action(ArgAction::SetTrue),
)
.arg(
arg!(-t --text ... "Output text instead of binary")
.required(false)
.action(ArgAction::SetTrue),
)
.arg(arg!(-P --text <String> "Write the result as a parquet file").required(false))
.arg(
arg!(-a --head ... "Print the header of the table")
.required(false)
.action(ArgAction::SetTrue),
),
) )
.subcommand( .subcommand(
Command::new("wpq") Command::new("wpq")
.about("Write to a paquet file") .about("Write to a paquet file")
.arg(
arg!(-t --text ... "Output text instead of binary")
.required(false)
.action(ArgAction::SetTrue),
)
.arg(arg!([path] "Path to the new parquet file")), .arg(arg!([path] "Path to the new parquet file")),
) )
.get_matches(); .get_matches();
if let Some(matches) = matches.subcommand_matches("sql") { if let Some(_matches) = matches.subcommand_matches("csv") {
//if let Some(delimiter) = matches.get_one::<String>("delimiter") { if let Some(path) = _matches.get_one::<String>("path") {
// println!("DEBUG: Delimiter: {delimiter}") let mut ldf = io::read_csv(path.to_string());
//} else { if let Some(query) = _matches.get_one::<String>("query") {
// println!("DEBUG: No delimiter") ldf = sql::execute(ldf, query);
//} }
if let Some(statement) = matches.get_one::<String>("statement") { if _matches.get_flag("summary") {
sql::execute(statement); let df = ldf.collect().expect("Could not collect");
println!("{}", df.describe(None));
} else if _matches.get_flag("head") {
let df = ldf.fetch(5).expect("Could not fetch");
println!("{}", df)
} else { } else {
let mut df = io::load_csv_from_stdin(); if _matches.get_flag("text") {
io::dump_csv_to_stdout(&mut df); io::dump_csv_to_stdout(&mut ldf.collect().expect("Could not collect"));
} else {
if let Some(path) = _matches.get_one::<String>("parquet") {
io::write_parquet(
ldf.collect().expect("Could not collect"),
path.to_string(),
"lz4raw".to_string(),
true,
Some(0),
);
} else {
io::write_ipc(ldf);
}
}
}
}
} else if let Some(_matches) = matches.subcommand_matches("sql") {
if let Some(statement) = _matches.get_one::<String>("statement") {
let ldf = if _matches.get_flag("text") {
io::load_csv_from_stdin()
} else {
io::read_ipc()
};
let res = sql::execute(ldf, statement);
io::write_ipc(res);
} else {
io::write_ipc(io::read_ipc());
} }
} else if let Some(_matches) = matches.subcommand_matches("print") { } else if let Some(_matches) = matches.subcommand_matches("print") {
let df = io::load_csv_from_stdin(); let df = if _matches.get_flag("text") {
io::load_csv_from_stdin()
} else {
io::read_ipc()
};
println!("{}", df.collect().expect("Could not collect"));
} else if let Some(_matches) = matches.subcommand_matches("rpq") {
if let Some(path) = _matches.get_one::<String>("path") {
let mut ldf = io::read_parquet(path.to_string());
if let Some(query) = _matches.get_one::<String>("query") {
ldf = sql::execute(ldf, query);
}
if _matches.get_flag("summary") {
let df = ldf.collect().expect("Could not collect");
println!("{}", df.describe(None));
} else if _matches.get_flag("head") {
let df = ldf.fetch(5).expect("Could not fetch");
println!("{}", df) println!("{}", df)
} else if let Some(matches) = matches.subcommand_matches("rpq") { } else {
if let Some(path) = matches.get_one::<String>("path") { if _matches.get_flag("text") {
let mut df = io::read_parquet(path.to_string()); io::dump_csv_to_stdout(&mut ldf.collect().expect("Could not collect"));
io::dump_csv_to_stdout(&mut df); } else {
if let Some(path) = _matches.get_one::<String>("parquet") {
io::write_parquet(
ldf.collect().expect("Could not collect"),
path.to_string(),
"lz4raw".to_string(),
true,
Some(0),
);
} else {
io::write_ipc(ldf);
}
}
}
} else { } else {
eprintln!("File not found") eprintln!("File not found")
} }
} else if let Some(matches) = matches.subcommand_matches("wpq") { } else if let Some(_matches) = matches.subcommand_matches("wpq") {
if let Some(path) = matches.get_one::<String>("path") { if let Some(path) = _matches.get_one::<String>("path") {
let df = io::load_csv_from_stdin(); let df = if _matches.get_flag("text") {
io::write_parquet(df, path.to_string(), "lz4raw".to_string(), true, Some(0)); io::load_csv_from_stdin()
} else {
io::read_ipc()
};
io::write_parquet(
df.collect().expect("Could not collect"),
path.to_string(),
"lz4raw".to_string(),
true,
Some(0),
);
} else { } else {
eprintln!("Could now write to parquet"); eprintln!("Could now write to parquet");
} }

View file

@ -1,19 +1,15 @@
use crate::io::dump_csv_to_stdout; use polars::prelude::LazyFrame;
use crate::io::load_csv_from_stdin; use polars::sql::SQLContext;
use polars_lazy::frame::IntoLazy;
use polars_sql::SQLContext;
pub fn execute(statement: &String) { pub fn execute(ldf: LazyFrame, statement: &String) -> LazyFrame {
if let Ok(mut context) = SQLContext::try_new() { let mut context = SQLContext::try_new().expect("Could not create context");
let df = load_csv_from_stdin(); context.register("this", ldf);
context.register("this", df.lazy());
if let Ok(res) = context.execute(statement) { match context.execute(statement) {
if let Ok(mut res) = res.collect() { Ok(res) => res,
dump_csv_to_stdout(&mut res); Err(e) => {
}; eprintln!("Query execution error {e}");
}; LazyFrame::default()
if let Err(e) = context.execute(statement) { }
eprintln!("Query execution error {e}") }
};
};
} }