(arrow) branch main updated: GH-28994: [C++][JSON] Change the max rows to Unlimited(int_32) (#38582)

bkietz Mon, 27 Nov 2023 07:52:39 -0800

This is an automated email from the ASF dual-hosted git repository.

bkietz pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git



The following commit(s) were added to refs/heads/main by this push:
     new ca46557649 GH-28994: [C++][JSON] Change the max rows to 
Unlimited(int_32)  (#38582)
ca46557649 is described below

commit ca4655764900f3e216d4d0a9586a03b78dee7f01
Author: zhipeng <[email protected]>
AuthorDate: Mon Nov 27 23:52:27 2023 +0800

    GH-28994: [C++][JSON] Change the max rows to Unlimited(int_32)  (#38582)
    
    
    ### Rationale for this change
    Remove the fixed limit on the number of rows parsed from a JSON block. Raise a 
`Row count overflowed int32_t` error when the row count reaches `std::numeric_limits<int32_t>::max()`.
    
    See issue: #28994
    
    ### What changes are included in this PR?
    Delete the constant `kMaxParserNumRows`, make minor C++ code modifications, 
and add a Python test.
    
    ### Are these changes tested? Yes
    Added a new test that parses a large JSON input (100100 rows).
    
    ### Are there any user-facing changes?  No
    
    * Closes: #28994
    
    Lead-authored-by: zhipeng <[email protected]>
    Co-authored-by: zhipeng <[email protected]>
    Signed-off-by: Benjamin Kietzman <[email protected]>
---
 cpp/src/arrow/dataset/file_json_test.cc | 2 +-
 cpp/src/arrow/json/parser.cc            | 6 +++---
 cpp/src/arrow/json/parser.h             | 2 --
 cpp/src/arrow/json/parser_benchmark.cc  | 2 --
 python/pyarrow/tests/test_json.py       | 8 ++++++++
 5 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/cpp/src/arrow/dataset/file_json_test.cc 
b/cpp/src/arrow/dataset/file_json_test.cc
index 2b4fcdd82f..3b0647d28f 100644
--- a/cpp/src/arrow/dataset/file_json_test.cc
+++ b/cpp/src/arrow/dataset/file_json_test.cc
@@ -245,7 +245,7 @@ class JsonScanMixin {
 
 // Use a reduced number of rows in valgrind to avoid timeouts.
 #ifndef ARROW_VALGRIND
-constexpr static int64_t kTestMaxNumRows = json::kMaxParserNumRows;
+constexpr static int64_t kTestMaxNumRows = (1UL << 17);
 #else
 constexpr static int64_t kTestMaxNumRows = 1024;
 #endif
diff --git a/cpp/src/arrow/json/parser.cc b/cpp/src/arrow/json/parser.cc
index 185dcde355..761ff02dd4 100644
--- a/cpp/src/arrow/json/parser.cc
+++ b/cpp/src/arrow/json/parser.cc
@@ -769,8 +769,8 @@ class HandlerBase : public BlockParser,
                                  rj::kParseNumbersAsStringsFlag;
 
     rj::Reader reader;
-
-    for (; num_rows_ < kMaxParserNumRows; ++num_rows_) {
+    // ensure that the loop can exit when the block too large.
+    for (; num_rows_ < std::numeric_limits<int32_t>::max(); ++num_rows_) {
       auto ok = reader.Parse<parse_flags>(json, handler);
       switch (ok.Code()) {
         case rj::kParseErrorNone:
@@ -790,7 +790,7 @@ class HandlerBase : public BlockParser,
           return ParseError(rj::GetParseError_En(ok.Code()), " in row ", 
num_rows_);
       }
     }
-    return Status::Invalid("Exceeded maximum rows");
+    return Status::Invalid("Row count overflowed int32_t");
   }
 
   template <typename Handler>
diff --git a/cpp/src/arrow/json/parser.h b/cpp/src/arrow/json/parser.h
index e21d09c416..aca416dbb7 100644
--- a/cpp/src/arrow/json/parser.h
+++ b/cpp/src/arrow/json/parser.h
@@ -56,8 +56,6 @@ struct Kind {
   static Status ForType(const DataType& type, Kind::type* kind);
 };
 
-constexpr int32_t kMaxParserNumRows = 100000;
-
 /// \class BlockParser
 /// \brief A reusable block-based parser for JSON data
 ///
diff --git a/cpp/src/arrow/json/parser_benchmark.cc 
b/cpp/src/arrow/json/parser_benchmark.cc
index 2a1629ef8e..a5a6eb68e6 100644
--- a/cpp/src/arrow/json/parser_benchmark.cc
+++ b/cpp/src/arrow/json/parser_benchmark.cc
@@ -200,8 +200,6 @@ static void ParseJSONFields(benchmark::State& state) {  // 
NOLINT non-const refe
   int32_t num_rows = static_cast<int32_t>(2e4 / (1.0 - sparsity) / num_fields);
   // ... however, we want enough rows to make setup/finish overhead negligible
   num_rows = std::max<int32_t>(num_rows, 200);
-  // ... and also we want to avoid an "Exceeded maximum rows" error.
-  num_rows = std::min<int32_t>(num_rows, kMaxParserNumRows);
   // In the end, we will empirically generate between 400 kB and 4 MB of JSON 
data.
 
   auto fields = GenerateTestFields(num_fields, 10);
diff --git a/python/pyarrow/tests/test_json.py 
b/python/pyarrow/tests/test_json.py
index be83f891a2..b8c1e874fc 100644
--- a/python/pyarrow/tests/test_json.py
+++ b/python/pyarrow/tests/test_json.py
@@ -304,6 +304,14 @@ class BaseTestJSONRead:
         assert table.equals(expected)
         assert table.to_pydict() == expected.to_pydict()
 
+    def test_load_large_json(self):
+        data, expected = make_random_json(num_cols=2, num_rows=100100)
+        # set block size is 10MB
+        read_options = ReadOptions(block_size=1024*1024*10)
+        table = self.read_bytes(data, read_options=read_options)
+        assert table.num_rows == 100100
+        assert expected.num_rows == 100100
+
     def test_stress_block_sizes(self):
         # Test a number of small block sizes to stress block stitching
         data_base, expected = make_random_json(num_cols=2, num_rows=100)

(arrow) branch main updated: GH-28994: [C++][JSON] Change the max rows to Unlimited(int_32) (#38582)

Reply via email to