This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 791e5bd6e4 ARROW-17066: [C++][Python][Substrait]
"ignore_unknown_fields" should be specified when converting JSON to binary
(#13605)
791e5bd6e4 is described below
commit 791e5bd6e413c193a214237d042f4f721ccc0976
Author: Vibhatha Lakmal Abeykoon <[email protected]>
AuthorDate: Fri Jul 22 14:43:39 2022 +0530
ARROW-17066: [C++][Python][Substrait] "ignore_unknown_fields" should be
specified when converting JSON to binary (#13605)
Substrait is continuously changing, and it introduces many unknown fields in
the consumed plan. Since it is not practical to support all these fields
simultaneously, we have to include a way to ignore such fields. To support
that, `ignore_unknown_fields` is enabled in the `JsonParseOptions` passed to
the JSON-to-binary conversion.
Authored-by: Vibhatha Abeykoon <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
---
cpp/src/arrow/engine/substrait/serde.cc | 6 ++--
python/pyarrow/tests/test_substrait.py | 63 ++++++++++++++++++++++++++++++---
2 files changed, 62 insertions(+), 7 deletions(-)
diff --git a/cpp/src/arrow/engine/substrait/serde.cc
b/cpp/src/arrow/engine/substrait/serde.cc
index af189da1bb..00901b5e95 100644
--- a/cpp/src/arrow/engine/substrait/serde.cc
+++ b/cpp/src/arrow/engine/substrait/serde.cc
@@ -320,9 +320,11 @@ Result<std::shared_ptr<Buffer>>
SubstraitFromJSON(util::string_view type_name,
std::string out;
google::protobuf::io::StringOutputStream out_stream{&out};
-
+ google::protobuf::util::JsonParseOptions json_opts;
+ json_opts.ignore_unknown_fields = true;
auto status = google::protobuf::util::JsonToBinaryStream(
- GetGeneratedTypeResolver(), type_url, &json_stream, &out_stream);
+ GetGeneratedTypeResolver(), type_url, &json_stream, &out_stream,
+ std::move(json_opts));
if (!status.ok()) {
return Status::Invalid("JsonToBinaryStream returned ", status);
diff --git a/python/pyarrow/tests/test_substrait.py
b/python/pyarrow/tests/test_substrait.py
index 98c206fd7e..f05d68a95a 100644
--- a/python/pyarrow/tests/test_substrait.py
+++ b/python/pyarrow/tests/test_substrait.py
@@ -33,6 +33,13 @@ except ImportError:
pytestmark = [pytest.mark.dataset, pytest.mark.substrait]
+def _write_dummy_data_to_disk(tmpdir, file_name, table):
+ path = os.path.join(str(tmpdir), file_name)
+ with pa.ipc.RecordBatchFileWriter(path, schema=table.schema) as writer:
+ writer.write_table(table)
+ return path
+
+
@pytest.mark.skipif(sys.platform == 'win32',
reason="ARROW-16392: file based URI is" +
" not fully supported for Windows")
@@ -65,12 +72,10 @@ def test_run_serialized_query(tmpdir):
]
}
"""
- # TODO: replace with ipc when the support is finalized in C++
- path = os.path.join(str(tmpdir), 'substrait_data.arrow')
- table = pa.table([[1, 2, 3, 4, 5]], names=['foo'])
- with pa.ipc.RecordBatchFileWriter(path, schema=table.schema) as writer:
- writer.write_table(table)
+ file_name = "read_data.arrow"
+ table = pa.table([[1, 2, 3, 4, 5]], names=['foo'])
+ path = _write_dummy_data_to_disk(tmpdir, file_name, table)
query = tobytes(substrait_query.replace("FILENAME_PLACEHOLDER", path))
buf = pa._substrait._parse_json_plan(query)
@@ -92,3 +97,51 @@ def test_invalid_plan():
exec_message = "Empty substrait plan is passed."
with pytest.raises(ArrowInvalid, match=exec_message):
substrait.run_query(buf)
+
+
+@pytest.mark.skipif(sys.platform == 'win32',
+ reason="ARROW-16392: file based URI is" +
+ " not fully supported for Windows")
+def test_binary_conversion_with_json_options(tmpdir):
+ substrait_query = """
+ {
+ "relations": [
+ {"rel": {
+ "read": {
+ "base_schema": {
+ "struct": {
+ "types": [
+ {"i64": {}}
+ ]
+ },
+ "names": [
+ "bar"
+ ]
+ },
+ "local_files": {
+ "items": [
+ {
+ "uri_file": "file://FILENAME_PLACEHOLDER",
+ "arrow": {},
+ "metadata" : {
+ "created_by" : {},
+ }
+ }
+ ]
+ }
+ }
+ }}
+ ]
+ }
+ """
+
+ file_name = "binary_json_data.arrow"
+ table = pa.table([[1, 2, 3, 4, 5]], names=['bar'])
+ path = _write_dummy_data_to_disk(tmpdir, file_name, table)
+ query = tobytes(substrait_query.replace("FILENAME_PLACEHOLDER", path))
+ buf = pa._substrait._parse_json_plan(tobytes(query))
+
+ reader = substrait.run_query(buf)
+ res_tb = reader.read_all()
+
+ assert table.select(["bar"]) == res_tb.select(["bar"])