Added parquet functionality. Version bump
This commit is contained in:
parent
c1d46c68a4
commit
1a2bb27141
|
@ -1,7 +1,7 @@
|
|||
[package]
|
||||
name = "dr"
|
||||
description = "Command-line data file processing in Rust"
|
||||
version = "0.2.1"
|
||||
version = "0.3.0"
|
||||
edition = "2021"
|
||||
include = [
|
||||
"**/*.rs",
|
||||
|
|
31
README.md
31
README.md
|
@ -53,6 +53,7 @@ Commands:
|
|||
sql Runs a sql statement on the file
|
||||
print Pretty prints the table
|
||||
rpq Read parquet file
|
||||
wpq Write to a parquet file
|
||||
help Print this message or the help of the given subcommand(s)
|
||||
|
||||
Options:
|
||||
|
@ -82,13 +83,41 @@ shape: (4, 14)
|
|||
|
||||
Note that when `dr` loads csv data, it also tries to guess the data type of each field.
|
||||
|
||||
### Parquet
|
||||
|
||||
`dr` can also convert your csv files to parquet with a single command:
|
||||
|
||||
```bash
|
||||
$ cat wine.csv | dr wpq wine.pq
|
||||
```
|
||||
|
||||
Or explore parquet files:
|
||||
|
||||
```bash
|
||||
$ dr rpq wine.pq | head -n 5 | dr print
|
||||
shape: (4, 14)
|
||||
┌──────┬─────────┬────────────┬──────┬─────┬───────────┬──────┬──────┬─────────┐
|
||||
│ Wine ┆ Alcohol ┆ Malic.acid ┆ Ash ┆ ... ┆ Color.int ┆ Hue ┆ OD ┆ Proline │
|
||||
│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │
|
||||
│ i64 ┆ f64 ┆ f64 ┆ f64 ┆ ┆ f64 ┆ f64 ┆ f64 ┆ i64 │
|
||||
╞══════╪═════════╪════════════╪══════╪═════╪═══════════╪══════╪══════╪═════════╡
|
||||
│ 1 ┆ 14.23 ┆ 1.71 ┆ 2.43 ┆ ... ┆ 5.64 ┆ 1.04 ┆ 3.92 ┆ 1065 │
|
||||
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
|
||||
│ 1 ┆ 13.2 ┆ 1.78 ┆ 2.14 ┆ ... ┆ 4.38 ┆ 1.05 ┆ 3.4 ┆ 1050 │
|
||||
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
|
||||
│ 1 ┆ 13.16 ┆ 2.36 ┆ 2.67 ┆ ... ┆ 5.68 ┆ 1.03 ┆ 3.17 ┆ 1185 │
|
||||
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
|
||||
│ 1 ┆ 14.37 ┆ 1.95 ┆ 2.5 ┆ ... ┆ 7.8 ┆ 0.86 ┆ 3.45 ┆ 1480 │
|
||||
└──────┴─────────┴────────────┴──────┴─────┴───────────┴──────┴──────┴─────────┘
|
||||
```
|
||||
|
||||
|
||||
## Performance
|
||||
|
||||
`dr` is implemented in Rust with the goal of achieving the highest possible performance. Take for instance a simple read, groupby, and aggregate operation with ~30MB of data:
|
||||
|
||||
```bash
|
||||
$ time cat data/walmart_train.csv | ./target/release/dr sql "select Dept, avg("Weekly_Sales") from this group by Dept" | ./target/release/dr print
|
||||
$ time cat data/walmart_train.csv | dr sql "select Dept, avg("Weekly_Sales") from this group by Dept" | dr print
|
||||
shape: (81, 2)
|
||||
┌──────┬──────────────┐
|
||||
│ Dept ┆ Weekly_Sales │
|
||||
|
|
27
src/io.rs
27
src/io.rs
|
@ -39,3 +39,30 @@ pub fn read_parquet(path: String) -> DataFrame {
|
|||
};
|
||||
df
|
||||
}
|
||||
|
||||
/// Write a Polars DataFrame to Parquet
|
||||
pub fn write_parquet(
|
||||
mut df: DataFrame,
|
||||
path: String,
|
||||
compression: String,
|
||||
statistics: bool,
|
||||
chunksize: Option<usize>,
|
||||
) {
|
||||
// Selected compression not implemented yet
|
||||
let mut _file = match fs::File::create(path) {
|
||||
Ok(mut file) => {
|
||||
let mut w = ParquetWriter::new(&mut file);
|
||||
if statistics {
|
||||
w = w.with_statistics(statistics);
|
||||
}
|
||||
if chunksize.unwrap_or(0) > 0 {
|
||||
w = w.with_row_group_size(chunksize);
|
||||
}
|
||||
let _r = match w.finish(&mut df) {
|
||||
Ok(_r) => (),
|
||||
Err(e) => eprintln!("{e}"),
|
||||
};
|
||||
}
|
||||
Err(e) => eprintln!("{e}"),
|
||||
};
|
||||
}
|
||||
|
|
12
src/main.rs
12
src/main.rs
|
@ -16,6 +16,11 @@ fn main() {
|
|||
.about("Read parquet file")
|
||||
.arg(arg!([path] "Path to the parquet file")),
|
||||
)
|
||||
.subcommand(
|
||||
Command::new("wpq")
|
||||
.about("Write to a paquet file")
|
||||
.arg(arg!([path] "Path to the new parquet file")),
|
||||
)
|
||||
.get_matches();
|
||||
|
||||
if let Some(matches) = matches.subcommand_matches("sql") {
|
||||
|
@ -40,6 +45,13 @@ fn main() {
|
|||
} else {
|
||||
eprintln!("File not found")
|
||||
}
|
||||
} else if let Some(matches) = matches.subcommand_matches("wpq") {
|
||||
if let Some(path) = matches.get_one::<String>("path") {
|
||||
let df = io::load_csv_from_stdin();
|
||||
io::write_parquet(df, path.to_string(), "lz4raw".to_string(), true, Some(0));
|
||||
} else {
|
||||
eprintln!("Could now write to parquet");
|
||||
}
|
||||
} else {
|
||||
println!("No command provided. Please execute dr --help")
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue