Added parquet functionality. Version bump
This commit is contained in:
parent
c1d46c68a4
commit
1a2bb27141
|
@ -1,7 +1,7 @@
|
||||||
[package]
|
[package]
|
||||||
name = "dr"
|
name = "dr"
|
||||||
description = "Command-line data file processing in Rust"
|
description = "Command-line data file processing in Rust"
|
||||||
version = "0.2.1"
|
version = "0.3.0"
|
||||||
edition = "2021"
|
edition = "2021"
|
||||||
include = [
|
include = [
|
||||||
"**/*.rs",
|
"**/*.rs",
|
||||||
|
|
31
README.md
31
README.md
|
@ -53,6 +53,7 @@ Commands:
|
||||||
sql Runs a sql statement on the file
|
sql Runs a sql statement on the file
|
||||||
print Pretty prints the table
|
print Pretty prints the table
|
||||||
rpq Read parquet file
|
rpq Read parquet file
|
||||||
|
wpq Write to a parquet file
|
||||||
help Print this message or the help of the given subcommand(s)
|
help Print this message or the help of the given subcommand(s)
|
||||||
|
|
||||||
Options:
|
Options:
|
||||||
|
@ -82,13 +83,41 @@ shape: (4, 14)
|
||||||
|
|
||||||
Note that when `dr` loads csv data, it also tries to guess the data type of each field.
|
Note that when `dr` loads csv data, it also tries to guess the data type of each field.
|
||||||
|
|
||||||
|
### Parquet
|
||||||
|
|
||||||
|
`dr` is also useful for converting your csv files to parquet with a single command:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
$ cat wine.csv | dr wpq wine.pq
|
||||||
|
```
|
||||||
|
|
||||||
|
Or explore parquet files
|
||||||
|
|
||||||
|
```bash
|
||||||
|
$ dr rpq wine.pq | head -n 5 | dr print
|
||||||
|
shape: (4, 14)
|
||||||
|
┌──────┬─────────┬────────────┬──────┬─────┬───────────┬──────┬──────┬─────────┐
|
||||||
|
│ Wine ┆ Alcohol ┆ Malic.acid ┆ Ash ┆ ... ┆ Color.int ┆ Hue ┆ OD ┆ Proline │
|
||||||
|
│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │
|
||||||
|
│ i64 ┆ f64 ┆ f64 ┆ f64 ┆ ┆ f64 ┆ f64 ┆ f64 ┆ i64 │
|
||||||
|
╞══════╪═════════╪════════════╪══════╪═════╪═══════════╪══════╪══════╪═════════╡
|
||||||
|
│ 1 ┆ 14.23 ┆ 1.71 ┆ 2.43 ┆ ... ┆ 5.64 ┆ 1.04 ┆ 3.92 ┆ 1065 │
|
||||||
|
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
|
||||||
|
│ 1 ┆ 13.2 ┆ 1.78 ┆ 2.14 ┆ ... ┆ 4.38 ┆ 1.05 ┆ 3.4 ┆ 1050 │
|
||||||
|
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
|
||||||
|
│ 1 ┆ 13.16 ┆ 2.36 ┆ 2.67 ┆ ... ┆ 5.68 ┆ 1.03 ┆ 3.17 ┆ 1185 │
|
||||||
|
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
|
||||||
|
│ 1 ┆ 14.37 ┆ 1.95 ┆ 2.5 ┆ ... ┆ 7.8 ┆ 0.86 ┆ 3.45 ┆ 1480 │
|
||||||
|
└──────┴─────────┴────────────┴──────┴─────┴───────────┴──────┴──────┴─────────┘
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
## Performance
|
## Performance
|
||||||
|
|
||||||
`dr` is implemented in Rust with the goal of achieving the highest possible performance. Take for instance a simple read, groupby, and aggregate operation with ~30MB of data:
|
`dr` is implemented in Rust with the goal of achieving the highest possible performance. Take for instance a simple read, groupby, and aggregate operation with ~30MB of data:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
$ time cat data/walmart_train.csv | ./target/release/dr sql "select Dept, avg("Weekly_Sales") from this group by Dept" | ./target/release/dr print
|
$ time cat data/walmart_train.csv | dr sql "select Dept, avg("Weekly_Sales") from this group by Dept" | dr print
|
||||||
shape: (81, 2)
|
shape: (81, 2)
|
||||||
┌──────┬──────────────┐
|
┌──────┬──────────────┐
|
||||||
│ Dept ┆ Weekly_Sales │
|
│ Dept ┆ Weekly_Sales │
|
||||||
|
|
27
src/io.rs
27
src/io.rs
|
@ -39,3 +39,30 @@ pub fn read_parquet(path: String) -> DataFrame {
|
||||||
};
|
};
|
||||||
df
|
df
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Write a Polars DataFrame to Parquet
|
||||||
|
pub fn write_parquet(
|
||||||
|
mut df: DataFrame,
|
||||||
|
path: String,
|
||||||
|
compression: String,
|
||||||
|
statistics: bool,
|
||||||
|
chunksize: Option<usize>,
|
||||||
|
) {
|
||||||
|
// Selected compression not implemented yet
|
||||||
|
let mut _file = match fs::File::create(path) {
|
||||||
|
Ok(mut file) => {
|
||||||
|
let mut w = ParquetWriter::new(&mut file);
|
||||||
|
if statistics {
|
||||||
|
w = w.with_statistics(statistics);
|
||||||
|
}
|
||||||
|
if chunksize.unwrap_or(0) > 0 {
|
||||||
|
w = w.with_row_group_size(chunksize);
|
||||||
|
}
|
||||||
|
let _r = match w.finish(&mut df) {
|
||||||
|
Ok(_r) => (),
|
||||||
|
Err(e) => eprintln!("{e}"),
|
||||||
|
};
|
||||||
|
}
|
||||||
|
Err(e) => eprintln!("{e}"),
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
12
src/main.rs
12
src/main.rs
|
@ -16,6 +16,11 @@ fn main() {
|
||||||
.about("Read parquet file")
|
.about("Read parquet file")
|
||||||
.arg(arg!([path] "Path to the parquet file")),
|
.arg(arg!([path] "Path to the parquet file")),
|
||||||
)
|
)
|
||||||
|
.subcommand(
|
||||||
|
Command::new("wpq")
|
||||||
|
.about("Write to a paquet file")
|
||||||
|
.arg(arg!([path] "Path to the new parquet file")),
|
||||||
|
)
|
||||||
.get_matches();
|
.get_matches();
|
||||||
|
|
||||||
if let Some(matches) = matches.subcommand_matches("sql") {
|
if let Some(matches) = matches.subcommand_matches("sql") {
|
||||||
|
@ -40,6 +45,13 @@ fn main() {
|
||||||
} else {
|
} else {
|
||||||
eprintln!("File not found")
|
eprintln!("File not found")
|
||||||
}
|
}
|
||||||
|
} else if let Some(matches) = matches.subcommand_matches("wpq") {
|
||||||
|
if let Some(path) = matches.get_one::<String>("path") {
|
||||||
|
let df = io::load_csv_from_stdin();
|
||||||
|
io::write_parquet(df, path.to_string(), "lz4raw".to_string(), true, Some(0));
|
||||||
|
} else {
|
||||||
|
eprintln!("Could now write to parquet");
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
println!("No command provided. Please execute dr --help")
|
println!("No command provided. Please execute dr --help")
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue