This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 65f7be8560 Return row count when inferring schema from JSON (#5008)
65f7be8560 is described below
commit 65f7be856099d389b0d0eafa9be47fad25215ee6
Author: Alex Sayers <[email protected]>
AuthorDate: Wed Nov 1 05:12:08 2023 +0900
Return row count when inferring schema from JSON (#5008)
* Return row count when inferring schema from JSON
* Add some unit tests for arrow-json's row-count
---
arrow-json/src/reader/mod.rs | 6 +++---
arrow-json/src/reader/schema.rs | 33 ++++++++++++++++++++++++++-------
arrow-json/src/writer.rs | 4 ++--
3 files changed, 31 insertions(+), 12 deletions(-)
diff --git a/arrow-json/src/reader/mod.rs b/arrow-json/src/reader/mod.rs
index 1225e51b3a..28282c4d15 100644
--- a/arrow-json/src/reader/mod.rs
+++ b/arrow-json/src/reader/mod.rs
@@ -1562,7 +1562,7 @@ mod tests {
let file = File::open(path).unwrap();
let mut reader = BufReader::new(file);
let schema = schema.unwrap_or_else(|| {
- let schema = infer_json_schema(&mut reader, None).unwrap();
+ let (schema, _) = infer_json_schema(&mut reader, None).unwrap();
reader.rewind().unwrap();
schema
});
@@ -1939,7 +1939,7 @@ mod tests {
fn test_with_multiple_batches() {
let file = File::open("test/data/basic_nulls.json").unwrap();
let mut reader = BufReader::new(file);
- let schema = infer_json_schema(&mut reader, None).unwrap();
+ let (schema, _) = infer_json_schema(&mut reader, None).unwrap();
reader.rewind().unwrap();
let builder = ReaderBuilder::new(Arc::new(schema)).with_batch_size(5);
@@ -2079,7 +2079,7 @@ mod tests {
fn test_json_iterator() {
let file = File::open("test/data/basic.json").unwrap();
let mut reader = BufReader::new(file);
- let schema = infer_json_schema(&mut reader, None).unwrap();
+ let (schema, _) = infer_json_schema(&mut reader, None).unwrap();
reader.rewind().unwrap();
let builder = ReaderBuilder::new(Arc::new(schema)).with_batch_size(5);
diff --git a/arrow-json/src/reader/schema.rs b/arrow-json/src/reader/schema.rs
index 58aa08014d..97f1a0f295 100644
--- a/arrow-json/src/reader/schema.rs
+++ b/arrow-json/src/reader/schema.rs
@@ -209,6 +209,8 @@ impl<R: BufRead> Iterator for ValueIter<R> {
///
/// If `max_read_records` is not set, the whole file is read to infer its field types.
///
+/// Returns inferred schema and number of records read.
+///
/// Contrary to [`infer_json_schema`], this function will seek back to the start of the `reader`.
/// That way, the `reader` can be used immediately afterwards to create a [`Reader`].
///
@@ -229,7 +231,7 @@ impl<R: BufRead> Iterator for ValueIter<R> {
pub fn infer_json_schema_from_seekable<R: BufRead + Seek>(
mut reader: R,
max_read_records: Option<usize>,
-) -> Result<Schema, ArrowError> {
+) -> Result<(Schema, usize), ArrowError> {
let schema = infer_json_schema(&mut reader, max_read_records);
// return the reader seek back to the start
reader.rewind()?;
@@ -242,6 +244,8 @@ pub fn infer_json_schema_from_seekable<R: BufRead + Seek>(
///
/// If `max_read_records` is not set, the whole file is read to infer its field types.
///
+/// Returns inferred schema and number of records read.
+///
/// This function will not seek back to the start of the `reader`. The user has to manage the
/// original file's cursor. This function is useful when the `reader`'s cursor is not available
/// (does not implement [`Seek`]), such is the case for compressed streams decoders.
@@ -266,8 +270,10 @@ pub fn infer_json_schema_from_seekable<R: BufRead + Seek>(
pub fn infer_json_schema<R: BufRead>(
reader: R,
max_read_records: Option<usize>,
-) -> Result<Schema, ArrowError> {
- infer_json_schema_from_iterator(ValueIter::new(reader, max_read_records))
+) -> Result<(Schema, usize), ArrowError> {
+ let mut values = ValueIter::new(reader, max_read_records);
+ let schema = infer_json_schema_from_iterator(&mut values)?;
+ Ok((schema, values.record_count))
}
fn set_object_scalar_field_type(
@@ -522,15 +528,28 @@ mod tests {
]);
let mut reader =
BufReader::new(File::open("test/data/mixed_arrays.json").unwrap());
- let inferred_schema = infer_json_schema_from_seekable(&mut reader, None).unwrap();
+ let (inferred_schema, n_rows) = infer_json_schema_from_seekable(&mut reader, None).unwrap();
assert_eq!(inferred_schema, schema);
+ assert_eq!(n_rows, 4);
let file = File::open("test/data/mixed_arrays.json.gz").unwrap();
let mut reader = BufReader::new(GzDecoder::new(&file));
- let inferred_schema = infer_json_schema(&mut reader, None).unwrap();
+ let (inferred_schema, n_rows) = infer_json_schema(&mut reader, None).unwrap();
assert_eq!(inferred_schema, schema);
+ assert_eq!(n_rows, 4);
+ }
+
+ #[test]
+ fn test_row_limit() {
+ let mut reader = BufReader::new(File::open("test/data/basic.json").unwrap());
+
+ let (_, n_rows) = infer_json_schema_from_seekable(&mut reader, None).unwrap();
+ assert_eq!(n_rows, 12);
+
+ let (_, n_rows) = infer_json_schema_from_seekable(&mut reader, Some(5)).unwrap();
+ assert_eq!(n_rows, 5);
}
#[test]
@@ -640,7 +659,7 @@ mod tests {
bigger_than_i64_max, smaller_than_i64_min
);
let mut buf_reader = BufReader::new(json.as_bytes());
- let inferred_schema = infer_json_schema(&mut buf_reader, Some(1)).unwrap();
+ let (inferred_schema, _) = infer_json_schema(&mut buf_reader, Some(1)).unwrap();
let fields = inferred_schema.fields();
let (_, big_field) = fields.find("bigger_than_i64_max").unwrap();
@@ -686,7 +705,7 @@ mod tests {
{"in":null, "ni":2, "ns":"3", "sn":null, "n":null, "an":null, "na": [], "nas":["8"]}
{"in":1, "ni":null, "ns":null, "sn":"4", "n":null, "an":[], "na": null, "nas":[]}
"#;
- let inferred_schema =
+ let (inferred_schema, _) =
infer_json_schema_from_seekable(Cursor::new(data), None).expect("infer");
let schema = Schema::new(vec![
Field::new("an", list_type_of(DataType::Null), true),
diff --git a/arrow-json/src/writer.rs b/arrow-json/src/writer.rs
index 97a8b38d41..5ecfc93236 100644
--- a/arrow-json/src/writer.rs
+++ b/arrow-json/src/writer.rs
@@ -1206,7 +1206,7 @@ mod tests {
fn test_write_for_file(test_file: &str) {
let file = File::open(test_file).unwrap();
let mut reader = BufReader::new(file);
- let schema = infer_json_schema(&mut reader, None).unwrap();
+ let (schema, _) = infer_json_schema(&mut reader, None).unwrap();
reader.rewind().unwrap();
let builder =
ReaderBuilder::new(Arc::new(schema)).with_batch_size(1024);
@@ -1391,7 +1391,7 @@ mod tests {
let test_file = "test/data/basic.json";
let file = File::open(test_file).unwrap();
let mut reader = BufReader::new(file);
- let schema = infer_json_schema(&mut reader, None).unwrap();
+ let (schema, _) = infer_json_schema(&mut reader, None).unwrap();
reader.rewind().unwrap();
let builder =
ReaderBuilder::new(Arc::new(schema)).with_batch_size(1024);