This is an automated email from the ASF dual-hosted git repository.
agrove pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new c546eef ARROW-8287: [Rust] Add "pretty" util to help with printing
tabular output of RecordBatches
c546eef is described below
commit c546eef41e6ab20c4ca29a2d836987959843896f
Author: Mark Hildreth <[email protected]>
AuthorDate: Wed Apr 29 16:57:53 2020 -0600
ARROW-8287: [Rust] Add "pretty" util to help with printing tabular output
of RecordBatches
Just a simple move of code from DataFusion to Arrow, and using it. I have a
few comments/questions on this [which I have put on the JIRA
issue](https://issues.apache.org/jira/browse/ARROW-8287?focusedCommentId=17086534&page=com.atlassian.jira.plugin.system.issuetabpanels%3Acomment-tabpanel#comment-17086534).
Closes #6972 from markhildreth/ARROW-8287
Authored-by: Mark Hildreth <[email protected]>
Signed-off-by: Andy Grove <[email protected]>
---
rust/arrow/Cargo.toml | 1 +
rust/arrow/examples/read_csv.rs | 37 +-----
rust/arrow/examples/read_csv_infer_schema.rs | 38 +-----
rust/arrow/src/util/mod.rs | 1 +
rust/arrow/src/util/pretty.rs | 177 +++++++++++++++++++++++++++
rust/parquet/src/encodings/rle.rs | 8 +-
6 files changed, 189 insertions(+), 73 deletions(-)
diff --git a/rust/arrow/Cargo.toml b/rust/arrow/Cargo.toml
index 95a0dae..479cb6c 100644
--- a/rust/arrow/Cargo.toml
+++ b/rust/arrow/Cargo.toml
@@ -50,6 +50,7 @@ chrono = "0.4"
flatbuffers = "0.6"
hex = "0.4"
arrow-flight = { path = "../arrow-flight", optional = true }
+prettytable-rs = "0.8.0"
[features]
simd = ["packed_simd"]
diff --git a/rust/arrow/examples/read_csv.rs b/rust/arrow/examples/read_csv.rs
index 6a37b3d..cde59d7 100644
--- a/rust/arrow/examples/read_csv.rs
+++ b/rust/arrow/examples/read_csv.rs
@@ -20,11 +20,12 @@ extern crate arrow;
use std::fs::File;
use std::sync::Arc;
-use arrow::array::{Float64Array, StringArray};
use arrow::csv;
use arrow::datatypes::{DataType, Field, Schema};
+use arrow::error::Result;
+use arrow::util::pretty::print_batches;
-fn main() {
+fn main() -> Result<()> {
let schema = Schema::new(vec![
Field::new("city", DataType::Utf8, false),
Field::new("lat", DataType::Float64, false),
@@ -35,35 +36,5 @@ fn main() {
let mut csv = csv::Reader::new(file, Arc::new(schema), false, 1024, None);
let batch = csv.next().unwrap().unwrap();
-
- println!(
- "Loaded {} rows containing {} columns",
- batch.num_rows(),
- batch.num_columns()
- );
-
- let city = batch
- .column(0)
- .as_any()
- .downcast_ref::<StringArray>()
- .unwrap();
- let lat = batch
- .column(1)
- .as_any()
- .downcast_ref::<Float64Array>()
- .unwrap();
- let lng = batch
- .column(2)
- .as_any()
- .downcast_ref::<Float64Array>()
- .unwrap();
-
- for i in 0..batch.num_rows() {
- println!(
- "City: {}, Latitude: {}, Longitude: {}",
- city.value(i),
- lat.value(i),
- lng.value(i)
- );
- }
+ print_batches(&vec![batch])
}
diff --git a/rust/arrow/examples/read_csv_infer_schema.rs
b/rust/arrow/examples/read_csv_infer_schema.rs
index 6a25d2d..07c28c7 100644
--- a/rust/arrow/examples/read_csv_infer_schema.rs
+++ b/rust/arrow/examples/read_csv_infer_schema.rs
@@ -17,11 +17,12 @@
extern crate arrow;
-use arrow::array::{Float64Array, StringArray};
use arrow::csv;
+use arrow::error::Result;
+use arrow::util::pretty::print_batches;
use std::fs::File;
-fn main() {
+fn main() -> Result<()> {
let file = File::open("test/data/uk_cities_with_headers.csv").unwrap();
let builder = csv::ReaderBuilder::new()
.has_headers(true)
@@ -29,36 +30,5 @@ fn main() {
let mut csv = builder.build(file).unwrap();
let batch = csv.next().unwrap().unwrap();
- println!(
- "Loaded {} rows containing {} columns",
- batch.num_rows(),
- batch.num_columns()
- );
-
- println!("Inferred schema: {:?}", batch.schema());
-
- let city = batch
- .column(0)
- .as_any()
- .downcast_ref::<StringArray>()
- .unwrap();
- let lat = batch
- .column(1)
- .as_any()
- .downcast_ref::<Float64Array>()
- .unwrap();
- let lng = batch
- .column(2)
- .as_any()
- .downcast_ref::<Float64Array>()
- .unwrap();
-
- for i in 0..batch.num_rows() {
- println!(
- "City: {}, Latitude: {}, Longitude: {}",
- city.value(i),
- lat.value(i),
- lng.value(i)
- );
- }
+ print_batches(&vec![batch])
}
diff --git a/rust/arrow/src/util/mod.rs b/rust/arrow/src/util/mod.rs
index 982d42a..a66b3c3 100644
--- a/rust/arrow/src/util/mod.rs
+++ b/rust/arrow/src/util/mod.rs
@@ -17,5 +17,6 @@
pub mod bit_util;
pub(crate) mod integration_util;
+pub mod pretty;
pub mod string_writer;
pub mod test_util;
diff --git a/rust/arrow/src/util/pretty.rs b/rust/arrow/src/util/pretty.rs
new file mode 100644
index 0000000..7416aaa
--- /dev/null
+++ b/rust/arrow/src/util/pretty.rs
@@ -0,0 +1,177 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Utilities for printing record batches
+
+use crate::array;
+use crate::datatypes::{DataType, TimeUnit};
+use crate::record_batch::RecordBatch;
+
+use prettytable::format;
+use prettytable::{Cell, Row, Table};
+
+use crate::error::{ArrowError, Result};
+
+/// Create a visual representation of record batches.
+///
+/// Builds a table from `results` via `create_table` and renders it to a
+/// `String`. Any error produced while building the table is returned
+/// unchanged.
+pub fn pretty_format_batches(results: &Vec<RecordBatch>) -> Result<String> {
+    create_table(results).map(|table| table.to_string())
+}
+
+/// Print a visual representation of record batches to stdout.
+///
+/// Builds a table from `results` via `create_table` and writes it with
+/// `Table::printstd`. Any error produced while building the table is
+/// returned before anything is printed.
+pub fn print_batches(results: &Vec<RecordBatch>) -> Result<()> {
+    let table = create_table(results)?;
+    table.printstd();
+    Ok(())
+}
+
+/// Convert a series of record batches into a table.
+///
+/// The column titles are taken from the schema of the first batch. Every
+/// row of every batch is then appended, with each value rendered through
+/// `array_value_to_string`.
+///
+/// An empty `results` slice yields a table that has the format applied but
+/// no title row and no data rows.
+///
+/// # Errors
+///
+/// Returns the first error reported by `array_value_to_string` (for example
+/// when a column has a data type it does not handle).
+fn create_table(results: &[RecordBatch]) -> Result<Table> {
+    let mut table = Table::new();
+    table.set_format(*format::consts::FORMAT_NO_LINESEP_WITH_TITLE);
+
+    if results.is_empty() {
+        return Ok(table);
+    }
+
+    let schema = results[0].schema();
+    let header: Vec<Cell> = schema
+        .fields()
+        .iter()
+        .map(|field| Cell::new(field.name()))
+        .collect();
+    table.set_titles(Row::new(header));
+
+    for batch in results {
+        for row in 0..batch.num_rows() {
+            // Collecting into `Result` stops at the first cell that fails to
+            // format, exactly like an early `?` inside a manual loop.
+            let cells = (0..batch.num_columns())
+                .map(|col| {
+                    array_value_to_string(batch.column(col).clone(), row)
+                        .map(|text| Cell::new(&text))
+                })
+                .collect::<Result<Vec<Cell>>>()?;
+            table.add_row(Row::new(cells));
+        }
+    }
+
+    Ok(table)
+}
+
+// Expands to `Ok(String)` holding the value at `$row` of `$column`, after
+// downcasting `$column` to the concrete `$array_type`. The downcast is
+// unwrapped, so the expansion panics when `$column` is not actually an
+// `$array_type`.
+macro_rules! make_string {
+    ($array_type:ty, $column: ident, $row: ident) => {{
+        let typed_array = $column.as_any().downcast_ref::<$array_type>().unwrap();
+        Ok(typed_array.value($row).to_string())
+    }};
+}
+
+/// Get the value at the given row in an array as a string.
+///
+/// A null slot is rendered as an empty string. Otherwise the column is
+/// downcast to the concrete array type implied by its `DataType` and the
+/// value at `row` is formatted with `to_string()`.
+///
+/// # Errors
+///
+/// Returns `ArrowError::InvalidArgumentError` when the column's data type
+/// has no arm below. This includes `Float16` and unit combinations such as
+/// `Time32(Microsecond)` that have no matching array type here.
+fn array_value_to_string(column: array::ArrayRef, row: usize) -> Result<String> {
+    // `value()` does not consult the validity bitmap, so check for null
+    // first instead of printing whatever sits in the underlying buffer.
+    if column.is_null(row) {
+        return Ok(String::new());
+    }
+
+    match column.data_type() {
+        DataType::Utf8 => make_string!(array::StringArray, column, row),
+        DataType::Boolean => make_string!(array::BooleanArray, column, row),
+        DataType::Int8 => make_string!(array::Int8Array, column, row),
+        DataType::Int16 => make_string!(array::Int16Array, column, row),
+        DataType::Int32 => make_string!(array::Int32Array, column, row),
+        DataType::Int64 => make_string!(array::Int64Array, column, row),
+        DataType::UInt8 => make_string!(array::UInt8Array, column, row),
+        DataType::UInt16 => make_string!(array::UInt16Array, column, row),
+        DataType::UInt32 => make_string!(array::UInt32Array, column, row),
+        DataType::UInt64 => make_string!(array::UInt64Array, column, row),
+        // `Float16` is deliberately not handled: downcasting such a column
+        // to `Float32Array` would fail and panic on `unwrap`, so it falls
+        // through to the "unsupported" error instead.
+        DataType::Float32 => make_string!(array::Float32Array, column, row),
+        DataType::Float64 => make_string!(array::Float64Array, column, row),
+        DataType::Timestamp(TimeUnit::Second, _) => {
+            make_string!(array::TimestampSecondArray, column, row)
+        }
+        DataType::Timestamp(TimeUnit::Millisecond, _) => {
+            make_string!(array::TimestampMillisecondArray, column, row)
+        }
+        DataType::Timestamp(TimeUnit::Microsecond, _) => {
+            make_string!(array::TimestampMicrosecondArray, column, row)
+        }
+        DataType::Timestamp(TimeUnit::Nanosecond, _) => {
+            make_string!(array::TimestampNanosecondArray, column, row)
+        }
+        DataType::Date32(_) => make_string!(array::Date32Array, column, row),
+        DataType::Date64(_) => make_string!(array::Date64Array, column, row),
+        DataType::Time32(TimeUnit::Second) => {
+            make_string!(array::Time32SecondArray, column, row)
+        }
+        DataType::Time32(TimeUnit::Millisecond) => {
+            make_string!(array::Time32MillisecondArray, column, row)
+        }
+        // Microsecond and nanosecond times are 64-bit; matching them under
+        // `Time32` would pair the wrong data type with the `Time64*` arrays.
+        DataType::Time64(TimeUnit::Microsecond) => {
+            make_string!(array::Time64MicrosecondArray, column, row)
+        }
+        DataType::Time64(TimeUnit::Nanosecond) => {
+            make_string!(array::Time64NanosecondArray, column, row)
+        }
+        _ => Err(ArrowError::InvalidArgumentError(format!(
+            "Unsupported {:?} type for repl.",
+            column.data_type()
+        ))),
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::datatypes::{Field, Schema};
+    use std::sync::Arc;
+
+    /// Formats a single two-column batch (Utf8 + Int32) and compares the
+    /// rendered table line by line against the expected layout.
+    #[test]
+    fn test_pretty_format_batches() -> Result<()> {
+        // define a schema.
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("a", DataType::Utf8, false),
+            Field::new("b", DataType::Int32, false),
+        ]));
+
+        // define data.
+        let batch = RecordBatch::try_new(
+            schema,
+            vec![
+                Arc::new(array::StringArray::from(vec!["a", "b", "c", "d"])),
+                Arc::new(array::Int32Array::from(vec![1, 10, 10, 100])),
+            ],
+        )?;
+
+        let table = pretty_format_batches(&vec![batch])?;
+
+        // Every cell is padded to its column's width, so all rows are as
+        // wide as the border lines.
+        let expected = vec![
+            "+---+-----+",
+            "| a | b   |",
+            "+---+-----+",
+            "| a | 1   |",
+            "| b | 10  |",
+            "| c | 10  |",
+            "| d | 100 |",
+            "+---+-----+",
+        ];
+
+        let actual: Vec<&str> = table.lines().collect();
+
+        assert_eq!(expected, actual);
+
+        Ok(())
+    }
+}
diff --git a/rust/parquet/src/encodings/rle.rs
b/rust/parquet/src/encodings/rle.rs
index 26df49f..a965f54 100644
--- a/rust/parquet/src/encodings/rle.rs
+++ b/rust/parquet/src/encodings/rle.rs
@@ -522,11 +522,7 @@ impl RleDecoder {
mod tests {
use super::*;
- use rand::{
- self,
- distributions::{Distribution, Standard},
- thread_rng, Rng, SeedableRng,
- };
+ use rand::{self, distributions::Standard, thread_rng, Rng, SeedableRng};
use crate::util::memory::ByteBufferPtr;
@@ -830,7 +826,7 @@ mod tests {
values.clear();
let mut rng = thread_rng();
let seed_vec: Vec<u8> =
- Standard.sample_iter(&mut rng).take(seed_len).collect();
+ rng.sample_iter::<u8, _>(&Standard).take(seed_len).collect();
let mut seed = [0u8; 32];
seed.copy_from_slice(&seed_vec[0..seed_len]);
let mut gen = rand::rngs::StdRng::from_seed(seed);