Added parquet functionality. Version bump
Some checks failed
ci/woodpecker/push/woodpecker Pipeline was successful
ci/woodpecker/tag/woodpecker Pipeline failed

This commit is contained in:
Guillem Borrell 2022-11-27 23:35:09 +00:00
parent c1d46c68a4
commit 1a2bb27141
4 changed files with 70 additions and 2 deletions

View file

@ -1,7 +1,7 @@
[package]
name = "dr"
description = "Command-line data file processing in Rust"
version = "0.2.1"
version = "0.3.0"
edition = "2021"
include = [
"**/*.rs",

View file

@ -53,6 +53,7 @@ Commands:
sql Runs a sql statement on the file
print Pretty prints the table
rpq Read parquet file
wpq Write to a parquet file
help Print this message or the help of the given subcommand(s)
Options:
@ -82,13 +83,41 @@ shape: (4, 14)
Note that when `dr` loads csv data, it also tries to guess the data type of each field.
### Parquet
`dr` is also useful to translate your csv files to parquet with a single command:
```bash
$ cat wine.csv | dr wpq wine.pq
```
Or explore parquet files:
```bash
$ dr rpq wine.pq | head -n 5 | dr print
shape: (4, 14)
┌──────┬─────────┬────────────┬──────┬─────┬───────────┬──────┬──────┬─────────┐
│ Wine ┆ Alcohol ┆ Malic.acid ┆ Ash ┆ ... ┆ Color.int ┆ Hue ┆ OD ┆ Proline │
│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ f64 ┆ f64 ┆ f64 ┆ ┆ f64 ┆ f64 ┆ f64 ┆ i64 │
╞══════╪═════════╪════════════╪══════╪═════╪═══════════╪══════╪══════╪═════════╡
│ 1 ┆ 14.23 ┆ 1.71 ┆ 2.43 ┆ ... ┆ 5.64 ┆ 1.04 ┆ 3.92 ┆ 1065 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ 1 ┆ 13.2 ┆ 1.78 ┆ 2.14 ┆ ... ┆ 4.38 ┆ 1.05 ┆ 3.4 ┆ 1050 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ 1 ┆ 13.16 ┆ 2.36 ┆ 2.67 ┆ ... ┆ 5.68 ┆ 1.03 ┆ 3.17 ┆ 1185 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ 1 ┆ 14.37 ┆ 1.95 ┆ 2.5 ┆ ... ┆ 7.8 ┆ 0.86 ┆ 3.45 ┆ 1480 │
└──────┴─────────┴────────────┴──────┴─────┴───────────┴──────┴──────┴─────────┘
```
## Performance
`dr` is implemented in Rust with the goal of achieving the highest possible performance. Take for instance a simple read, groupby, and aggregate operation with ~30MB of data:
```bash
$ time cat data/walmart_train.csv | ./target/release/dr sql "select Dept, avg("Weekly_Sales") from this group by Dept" | ./target/release/dr print
$ time cat data/walmart_train.csv | dr sql "select Dept, avg("Weekly_Sales") from this group by Dept" | dr print
shape: (81, 2)
┌──────┬──────────────┐
│ Dept ┆ Weekly_Sales │

View file

@ -39,3 +39,30 @@ pub fn read_parquet(path: String) -> DataFrame {
};
df
}
/// Serialize a Polars `DataFrame` into a Parquet file at `path`.
///
/// # Arguments
///
/// * `df` - frame to write; handed to the writer by mutable reference.
/// * `path` - destination file, opened with `fs::File::create`.
/// * `compression` - accepted but currently ignored (see the note in the body).
/// * `statistics` - when `true`, the flag is forwarded to the writer's
///   `with_statistics`; when `false` the writer is left untouched.
/// * `chunksize` - forwarded to the writer's `with_row_group_size` only when it
///   holds a value greater than zero; `None` and `Some(0)` leave the writer
///   untouched.
///
/// Nothing is returned. A failure to create the file or to write the frame is
/// printed to stderr and the function simply returns.
pub fn write_parquet(
    mut df: DataFrame,
    path: String,
    compression: String,
    statistics: bool,
    chunksize: Option<usize>,
) {
    // Selected compression not implemented yet
    let mut file = match fs::File::create(path) {
        Ok(handle) => handle,
        Err(e) => {
            // Without a destination file there is nothing left to do.
            eprintln!("{e}");
            return;
        }
    };

    let mut writer = ParquetWriter::new(&mut file);
    if statistics {
        writer = writer.with_statistics(statistics);
    }
    // Treat both `None` and `Some(0)` as "no explicit row group size".
    if matches!(chunksize, Some(rows) if rows > 0) {
        writer = writer.with_row_group_size(chunksize);
    }

    if let Err(e) = writer.finish(&mut df) {
        eprintln!("{e}");
    }
}

View file

@ -16,6 +16,11 @@ fn main() {
.about("Read parquet file")
.arg(arg!([path] "Path to the parquet file")),
)
.subcommand(
Command::new("wpq")
.about("Write to a paquet file")
.arg(arg!([path] "Path to the new parquet file")),
)
.get_matches();
if let Some(matches) = matches.subcommand_matches("sql") {
@ -40,6 +45,13 @@ fn main() {
} else {
eprintln!("File not found")
}
} else if let Some(matches) = matches.subcommand_matches("wpq") {
if let Some(path) = matches.get_one::<String>("path") {
let df = io::load_csv_from_stdin();
io::write_parquet(df, path.to_string(), "lz4raw".to_string(), true, Some(0));
} else {
eprintln!("Could now write to parquet");
}
} else {
println!("No command provided. Please execute dr --help")
}