This is an automated email from the ASF dual-hosted git repository.

tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new 65f7be8560 Return row count when inferring schema from JSON (#5008)
65f7be8560 is described below

commit 65f7be856099d389b0d0eafa9be47fad25215ee6
Author: Alex Sayers <[email protected]>
AuthorDate: Wed Nov 1 05:12:08 2023 +0900

    Return row count when inferring schema from JSON (#5008)
    
    * Return row count when inferring schema from JSON
    
    * Add some unit tests for arrow-json's row-count
---
 arrow-json/src/reader/mod.rs    |  6 +++---
 arrow-json/src/reader/schema.rs | 33 ++++++++++++++++++++++++++-------
 arrow-json/src/writer.rs        |  4 ++--
 3 files changed, 31 insertions(+), 12 deletions(-)

diff --git a/arrow-json/src/reader/mod.rs b/arrow-json/src/reader/mod.rs
index 1225e51b3a..28282c4d15 100644
--- a/arrow-json/src/reader/mod.rs
+++ b/arrow-json/src/reader/mod.rs
@@ -1562,7 +1562,7 @@ mod tests {
         let file = File::open(path).unwrap();
         let mut reader = BufReader::new(file);
         let schema = schema.unwrap_or_else(|| {
-            let schema = infer_json_schema(&mut reader, None).unwrap();
+            let (schema, _) = infer_json_schema(&mut reader, None).unwrap();
             reader.rewind().unwrap();
             schema
         });
@@ -1939,7 +1939,7 @@ mod tests {
     fn test_with_multiple_batches() {
         let file = File::open("test/data/basic_nulls.json").unwrap();
         let mut reader = BufReader::new(file);
-        let schema = infer_json_schema(&mut reader, None).unwrap();
+        let (schema, _) = infer_json_schema(&mut reader, None).unwrap();
         reader.rewind().unwrap();
 
         let builder = ReaderBuilder::new(Arc::new(schema)).with_batch_size(5);
@@ -2079,7 +2079,7 @@ mod tests {
     fn test_json_iterator() {
         let file = File::open("test/data/basic.json").unwrap();
         let mut reader = BufReader::new(file);
-        let schema = infer_json_schema(&mut reader, None).unwrap();
+        let (schema, _) = infer_json_schema(&mut reader, None).unwrap();
         reader.rewind().unwrap();
 
         let builder = ReaderBuilder::new(Arc::new(schema)).with_batch_size(5);
diff --git a/arrow-json/src/reader/schema.rs b/arrow-json/src/reader/schema.rs
index 58aa08014d..97f1a0f295 100644
--- a/arrow-json/src/reader/schema.rs
+++ b/arrow-json/src/reader/schema.rs
@@ -209,6 +209,8 @@ impl<R: BufRead> Iterator for ValueIter<R> {
 ///
 /// If `max_read_records` is not set, the whole file is read to infer its field types.
 ///
+/// Returns inferred schema and number of records read.
+///
 /// Contrary to [`infer_json_schema`], this function will seek back to the start of the `reader`.
 /// That way, the `reader` can be used immediately afterwards to create a [`Reader`].
 ///
@@ -229,7 +231,7 @@ impl<R: BufRead> Iterator for ValueIter<R> {
 pub fn infer_json_schema_from_seekable<R: BufRead + Seek>(
     mut reader: R,
     max_read_records: Option<usize>,
-) -> Result<Schema, ArrowError> {
+) -> Result<(Schema, usize), ArrowError> {
     let schema = infer_json_schema(&mut reader, max_read_records);
     // return the reader seek back to the start
     reader.rewind()?;
@@ -242,6 +244,8 @@ pub fn infer_json_schema_from_seekable<R: BufRead + Seek>(
 ///
 /// If `max_read_records` is not set, the whole file is read to infer its field types.
 ///
+/// Returns inferred schema and number of records read.
+///
 /// This function will not seek back to the start of the `reader`. The user has to manage the
 /// original file's cursor. This function is useful when the `reader`'s cursor is not available
 /// (does not implement [`Seek`]), such is the case for compressed streams decoders.
@@ -266,8 +270,10 @@ pub fn infer_json_schema_from_seekable<R: BufRead + Seek>(
 pub fn infer_json_schema<R: BufRead>(
     reader: R,
     max_read_records: Option<usize>,
-) -> Result<Schema, ArrowError> {
-    infer_json_schema_from_iterator(ValueIter::new(reader, max_read_records))
+) -> Result<(Schema, usize), ArrowError> {
+    let mut values = ValueIter::new(reader, max_read_records);
+    let schema = infer_json_schema_from_iterator(&mut values)?;
+    Ok((schema, values.record_count))
 }
 
 fn set_object_scalar_field_type(
@@ -522,15 +528,28 @@ mod tests {
         ]);
 
         let mut reader = BufReader::new(File::open("test/data/mixed_arrays.json").unwrap());
-        let inferred_schema = infer_json_schema_from_seekable(&mut reader, None).unwrap();
+        let (inferred_schema, n_rows) = infer_json_schema_from_seekable(&mut reader, None).unwrap();
 
         assert_eq!(inferred_schema, schema);
+        assert_eq!(n_rows, 4);
 
         let file = File::open("test/data/mixed_arrays.json.gz").unwrap();
         let mut reader = BufReader::new(GzDecoder::new(&file));
-        let inferred_schema = infer_json_schema(&mut reader, None).unwrap();
+        let (inferred_schema, n_rows) = infer_json_schema(&mut reader, None).unwrap();
 
         assert_eq!(inferred_schema, schema);
+        assert_eq!(n_rows, 4);
+    }
+
+    #[test]
+    fn test_row_limit() {
+        let mut reader = BufReader::new(File::open("test/data/basic.json").unwrap());
+
+        let (_, n_rows) = infer_json_schema_from_seekable(&mut reader, None).unwrap();
+        assert_eq!(n_rows, 12);
+
+        let (_, n_rows) = infer_json_schema_from_seekable(&mut reader, Some(5)).unwrap();
+        assert_eq!(n_rows, 5);
     }
 
     #[test]
@@ -640,7 +659,7 @@ mod tests {
             bigger_than_i64_max, smaller_than_i64_min
         );
         let mut buf_reader = BufReader::new(json.as_bytes());
-        let inferred_schema = infer_json_schema(&mut buf_reader, Some(1)).unwrap();
+        let (inferred_schema, _) = infer_json_schema(&mut buf_reader, Some(1)).unwrap();
         let fields = inferred_schema.fields();
 
         let (_, big_field) = fields.find("bigger_than_i64_max").unwrap();
@@ -686,7 +705,7 @@ mod tests {
             {"in":null, "ni":2,    "ns":"3",  "sn":null, "n":null, "an":null, "na": [],   "nas":["8"]}
             {"in":1,    "ni":null, "ns":null, "sn":"4",  "n":null, "an":[],   "na": null, "nas":[]}
         "#;
-        let inferred_schema =
+        let (inferred_schema, _) =
             infer_json_schema_from_seekable(Cursor::new(data), None).expect("infer");
         let schema = Schema::new(vec![
             Field::new("an", list_type_of(DataType::Null), true),
diff --git a/arrow-json/src/writer.rs b/arrow-json/src/writer.rs
index 97a8b38d41..5ecfc93236 100644
--- a/arrow-json/src/writer.rs
+++ b/arrow-json/src/writer.rs
@@ -1206,7 +1206,7 @@ mod tests {
     fn test_write_for_file(test_file: &str) {
         let file = File::open(test_file).unwrap();
         let mut reader = BufReader::new(file);
-        let schema = infer_json_schema(&mut reader, None).unwrap();
+        let (schema, _) = infer_json_schema(&mut reader, None).unwrap();
         reader.rewind().unwrap();
 
         let builder = ReaderBuilder::new(Arc::new(schema)).with_batch_size(1024);
@@ -1391,7 +1391,7 @@ mod tests {
         let test_file = "test/data/basic.json";
         let file = File::open(test_file).unwrap();
         let mut reader = BufReader::new(file);
-        let schema = infer_json_schema(&mut reader, None).unwrap();
+        let (schema, _) = infer_json_schema(&mut reader, None).unwrap();
         reader.rewind().unwrap();
 
         let builder = ReaderBuilder::new(Arc::new(schema)).with_batch_size(1024);

Reply via email to