[GitHub] [arrow] alamb commented on a change in pull request #9306: ARROW-10297: [Rust] Parameter for parquet-read to output data in json format

GitBox Tue, 26 Jan 2021 04:16:55 -0800


alamb commented on a change in pull request #9306:
URL: https://github.com/apache/arrow/pull/9306#discussion_r564465581




##########
File path: rust/parquet/src/bin/parquet-read.rs
##########
@@ -34,39 +34,98 @@
 //! ```
 //!
 //! # Usage
-//!
 //! ```
-//! parquet-read <file-path> [num-records]
+//!  parquet-read <file-path> [num-records]
 //! ```
-//! where `file-path` is the path to a Parquet file and `num-records` is the 
optional
-//! numeric option that allows to specify number of records to read from a 
file.
-//! When not provided, all records are read.
+//!
+//! ## Flags
+//!     -h, --help       Prints help information
+//!     -j, --json       Print parquet file in JSON lines Format
+//!     -V, --version    Prints version information
+//!
+//! ## Args
+//!     <file-path>      Path to a parquet file
+//!     <num-records>    Number of records to read. When not provided, all 
records are read.
 //!
 //! Note that `parquet-read` reads full file schema, no projection or 
filtering is
 //! applied.
 
 extern crate parquet;
 
-use std::{env, fs::File, path::Path, process};
+use std::{env, fs::File, path::Path};
+
+use clap::{crate_authors, crate_version, App, Arg};
 
 use parquet::file::reader::{FileReader, SerializedFileReader};
+use parquet::record::Row;
 
 fn main() {
-    let args: Vec<String> = env::args().collect();
-    if args.len() != 2 && args.len() != 3 {
-        println!("Usage: parquet-read <file-path> [num-records]");
-        process::exit(1);
-    }
+    #[cfg(not(feature = "json_output"))]
+    let app = App::new("parquet-read")
+        .version(crate_version!())
+        .author(crate_authors!())
+        .about("Read data from parquet file")
+        .arg(
+            Arg::with_name("file_path")
+                .value_name("file-path")
+                .required(true)
+                .index(1)
+                .help("Path to a parquet file"),
+        )
+        .arg(
+            Arg::with_name("num_records")
+                .value_name("num-records")
+                .index(2)
+                .help(
+                    "Number of records to read. When not provided, all records 
are read.",
+                ),
+        );
+
+    #[cfg(feature = "json_output")]
+    let app = App::new("parquet-read")
+        .version(crate_version!())
+        .author(crate_authors!())
+        .about("Read data from parquet file")
+        .arg(
+            Arg::with_name("file_path")
+                .value_name("file-path")
+                .required(true)
+                .index(1)
+                .help("Path to a parquet file"),
+        )
+        .arg(
+            Arg::with_name("num_records")
+                .value_name("num-records")
+                .index(2)
+                .help(
+                    "Number of records to read. When not provided, all records 
are read.",
+                ),
+        )
+        .arg(

Review comment:
       I wonder if it would make sense to try to reduce the redundancy. Maybe 
something like 
   ```
   let app = App::new("parquet-read")...;
   
   #[cfg(feature = "json_output")]
   let app = app.arg(Arg::with_name("json")...)
   
   ```
   
   

##########
File path: rust/parquet/Cargo.toml
##########
@@ -43,6 +43,8 @@ chrono = "0.4"
 num-bigint = "0.3"
 arrow = { path = "../arrow", version = "4.0.0-SNAPSHOT", optional = true }
 base64 = { version = "0.12", optional = true }
+clap = "2.33.3"

Review comment:
       would it be possible to make the `clap` dependency optional as well? Or 
maybe we can move the command line tools into their own feature?  The hope for 
parquet is that it becomes a widely used library at the lower levels of a 
software stack so I think keeping dependencies minimal is a good goal
   
   @nevi-me  what do you think?

##########
File path: rust/parquet/src/record/api.rs
##########
@@ -635,6 +648,55 @@ impl Field {
             _ => nyi!(descr, value),
         }
     }
+
+    #[cfg(feature = "json_output")]
+    pub fn to_json_value(&self) -> Value {
+        match &self {
+            Field::Null => Value::Null,
+            Field::Bool(b) => Value::Bool(*b),
+            Field::Byte(n) => Value::Number(serde_json::Number::from(*n)),
+            Field::Short(n) => Value::Number(serde_json::Number::from(*n)),
+            Field::Int(n) => Value::Number(serde_json::Number::from(*n)),
+            Field::Long(n) => Value::Number(serde_json::Number::from(*n)),
+            Field::UByte(n) => Value::Number(serde_json::Number::from(*n)),
+            Field::UShort(n) => Value::Number(serde_json::Number::from(*n)),
+            Field::UInt(n) => Value::Number(serde_json::Number::from(*n)),
+            Field::ULong(n) => Value::Number(serde_json::Number::from(*n)),
+            Field::Float(n) => serde_json::Number::from_f64(*n as f64)
+                .map(Value::Number)
+                .unwrap_or(Value::Null),
+            Field::Double(n) => serde_json::Number::from_f64(*n)
+                .map(Value::Number)
+                .unwrap_or(Value::Null),
+            Field::Decimal(n) => Value::String(convert_decimal_to_string(&n)),
+            Field::Str(s) => Value::String(s.to_owned()),
+            Field::Bytes(b) => Value::String(base64::encode(b.data())),
+            Field::Date(d) => Value::String(convert_date_to_string(*d)),
+            Field::TimestampMillis(ts) => {
+                Value::String(convert_timestamp_millis_to_string(*ts))
+            }
+            Field::TimestampMicros(ts) => {
+                Value::String(convert_timestamp_micros_to_string(*ts))
+            }
+            Field::Group(row) => row.to_json_value(),
+            Field::ListInternal(fields) => {
+                Value::Array(fields.elements.iter().map(|f| 
f.to_json_value()).collect())
+            }
+            Field::MapInternal(map) => Value::Object(
+                map.entries
+                    .iter()
+                    .map(|(key_field, value_field)| {
+                        let key_val = key_field.to_json_value();
+                        let key_str = key_val
+                            .as_str()
+                            .map(|s| s.to_owned())
+                            .unwrap_or_else(|| key_val.to_string());
+                        (key_str, value_field.to_json_value())
+                    })
+                    .collect(),
+            ),
+        }
+    }
 }

Review comment:
       I suggest adding some tests to show this code working as well as avoid 
regressions in the future (aka avoid someone breaking this code accidentally in 
the future but not realizing it)




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]

[GitHub] [arrow] alamb commented on a change in pull request #9306: ARROW-10297: [Rust] Parameter for parquet-read to output data in json format

Reply via email to