This is an automated email from the ASF dual-hosted git repository.
bkietz pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new ca46557649 GH-28994: [C++][JSON] Change the max rows to
Unlimited(int_32) (#38582)
ca46557649 is described below
commit ca4655764900f3e216d4d0a9586a03b78dee7f01
Author: zhipeng <[email protected]>
AuthorDate: Mon Nov 27 23:52:27 2023 +0800
GH-28994: [C++][JSON] Change the max rows to Unlimited(int_32) (#38582)
### Rationale for this change
Remove the fixed limit on the number of rows parsed per JSON block. Raise a
`Row count overflowed int32_t` error only when the row count reaches
`std::numeric_limits<int32_t>::max()`.
See issue: #28994
### What changes are included in this PR?
Delete the constant `kMaxParserNumRows`, make minor C++ code modifications,
and add a Python test.
### Are these changes tested? Yes
Added a new test that parses a large JSON input (100100 rows).
### Are there any user-facing changes? No
* Closes: #28994
Lead-authored-by: zhipeng <[email protected]>
Co-authored-by: zhipeng <[email protected]>
Signed-off-by: Benjamin Kietzman <[email protected]>
---
cpp/src/arrow/dataset/file_json_test.cc | 2 +-
cpp/src/arrow/json/parser.cc | 6 +++---
cpp/src/arrow/json/parser.h | 2 --
cpp/src/arrow/json/parser_benchmark.cc | 2 --
python/pyarrow/tests/test_json.py | 8 ++++++++
5 files changed, 12 insertions(+), 8 deletions(-)
diff --git a/cpp/src/arrow/dataset/file_json_test.cc
b/cpp/src/arrow/dataset/file_json_test.cc
index 2b4fcdd82f..3b0647d28f 100644
--- a/cpp/src/arrow/dataset/file_json_test.cc
+++ b/cpp/src/arrow/dataset/file_json_test.cc
@@ -245,7 +245,7 @@ class JsonScanMixin {
// Use a reduced number of rows in valgrind to avoid timeouts.
#ifndef ARROW_VALGRIND
-constexpr static int64_t kTestMaxNumRows = json::kMaxParserNumRows;
+constexpr static int64_t kTestMaxNumRows = (1UL << 17);
#else
constexpr static int64_t kTestMaxNumRows = 1024;
#endif
diff --git a/cpp/src/arrow/json/parser.cc b/cpp/src/arrow/json/parser.cc
index 185dcde355..761ff02dd4 100644
--- a/cpp/src/arrow/json/parser.cc
+++ b/cpp/src/arrow/json/parser.cc
@@ -769,8 +769,8 @@ class HandlerBase : public BlockParser,
rj::kParseNumbersAsStringsFlag;
rj::Reader reader;
-
- for (; num_rows_ < kMaxParserNumRows; ++num_rows_) {
+ // ensure that the loop can exit when the block too large.
+ for (; num_rows_ < std::numeric_limits<int32_t>::max(); ++num_rows_) {
auto ok = reader.Parse<parse_flags>(json, handler);
switch (ok.Code()) {
case rj::kParseErrorNone:
@@ -790,7 +790,7 @@ class HandlerBase : public BlockParser,
return ParseError(rj::GetParseError_En(ok.Code()), " in row ",
num_rows_);
}
}
- return Status::Invalid("Exceeded maximum rows");
+ return Status::Invalid("Row count overflowed int32_t");
}
template <typename Handler>
diff --git a/cpp/src/arrow/json/parser.h b/cpp/src/arrow/json/parser.h
index e21d09c416..aca416dbb7 100644
--- a/cpp/src/arrow/json/parser.h
+++ b/cpp/src/arrow/json/parser.h
@@ -56,8 +56,6 @@ struct Kind {
static Status ForType(const DataType& type, Kind::type* kind);
};
-constexpr int32_t kMaxParserNumRows = 100000;
-
/// \class BlockParser
/// \brief A reusable block-based parser for JSON data
///
diff --git a/cpp/src/arrow/json/parser_benchmark.cc
b/cpp/src/arrow/json/parser_benchmark.cc
index 2a1629ef8e..a5a6eb68e6 100644
--- a/cpp/src/arrow/json/parser_benchmark.cc
+++ b/cpp/src/arrow/json/parser_benchmark.cc
@@ -200,8 +200,6 @@ static void ParseJSONFields(benchmark::State& state) { //
NOLINT non-const refe
int32_t num_rows = static_cast<int32_t>(2e4 / (1.0 - sparsity) / num_fields);
// ... however, we want enough rows to make setup/finish overhead negligible
num_rows = std::max<int32_t>(num_rows, 200);
- // ... and also we want to avoid an "Exceeded maximum rows" error.
- num_rows = std::min<int32_t>(num_rows, kMaxParserNumRows);
// In the end, we will empirically generate between 400 kB and 4 MB of JSON
data.
auto fields = GenerateTestFields(num_fields, 10);
diff --git a/python/pyarrow/tests/test_json.py
b/python/pyarrow/tests/test_json.py
index be83f891a2..b8c1e874fc 100644
--- a/python/pyarrow/tests/test_json.py
+++ b/python/pyarrow/tests/test_json.py
@@ -304,6 +304,14 @@ class BaseTestJSONRead:
assert table.equals(expected)
assert table.to_pydict() == expected.to_pydict()
+ def test_load_large_json(self):
+ data, expected = make_random_json(num_cols=2, num_rows=100100)
+ # set block size is 10MB
+ read_options = ReadOptions(block_size=1024*1024*10)
+ table = self.read_bytes(data, read_options=read_options)
+ assert table.num_rows == 100100
+ assert expected.num_rows == 100100
+
def test_stress_block_sizes(self):
# Test a number of small block sizes to stress block stitching
data_base, expected = make_random_json(num_cols=2, num_rows=100)