[arrow] branch master updated: ARROW-17066: [C++][Python][Substrait] "ignore_unknown_fields" should be specified when converting JSON to binary (#13605)

apitrou Fri, 22 Jul 2022 02:13:52 -0700

This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git



The following commit(s) were added to refs/heads/master by this push:
     new 791e5bd6e4 ARROW-17066: [C++][Python][Substrait] 
"ignore_unknown_fields" should be specified when converting JSON to binary 
(#13605)
791e5bd6e4 is described below

commit 791e5bd6e413c193a214237d042f4f721ccc0976
Author: Vibhatha Lakmal Abeykoon <[email protected]>
AuthorDate: Fri Jul 22 14:43:39 2022 +0530

    ARROW-17066: [C++][Python][Substrait] "ignore_unknown_fields" should be 
specified when converting JSON to binary (#13605)
    
    Substrait is continuously changing, and it may introduce unknown fields in 
the consumed plan. Since it is not practical to support all these fields 
simultaneously, we have to include a way to ignore such fields. To support 
that, the `ignore_unknown_fields` option is enabled in the `JsonParseOptions`.
    
    
    Authored-by: Vibhatha Abeykoon <[email protected]>
    Signed-off-by: Antoine Pitrou <[email protected]>
---
 cpp/src/arrow/engine/substrait/serde.cc |  6 ++--
 python/pyarrow/tests/test_substrait.py  | 63 ++++++++++++++++++++++++++++++---
 2 files changed, 62 insertions(+), 7 deletions(-)

diff --git a/cpp/src/arrow/engine/substrait/serde.cc 
b/cpp/src/arrow/engine/substrait/serde.cc
index af189da1bb..00901b5e95 100644
--- a/cpp/src/arrow/engine/substrait/serde.cc
+++ b/cpp/src/arrow/engine/substrait/serde.cc
@@ -320,9 +320,11 @@ Result<std::shared_ptr<Buffer>> 
SubstraitFromJSON(util::string_view type_name,
 
   std::string out;
   google::protobuf::io::StringOutputStream out_stream{&out};
-
+  google::protobuf::util::JsonParseOptions json_opts;
+  json_opts.ignore_unknown_fields = true;
   auto status = google::protobuf::util::JsonToBinaryStream(
-      GetGeneratedTypeResolver(), type_url, &json_stream, &out_stream);
+      GetGeneratedTypeResolver(), type_url, &json_stream, &out_stream,
+      std::move(json_opts));
 
   if (!status.ok()) {
     return Status::Invalid("JsonToBinaryStream returned ", status);
diff --git a/python/pyarrow/tests/test_substrait.py 
b/python/pyarrow/tests/test_substrait.py
index 98c206fd7e..f05d68a95a 100644
--- a/python/pyarrow/tests/test_substrait.py
+++ b/python/pyarrow/tests/test_substrait.py
@@ -33,6 +33,13 @@ except ImportError:
 pytestmark = [pytest.mark.dataset, pytest.mark.substrait]
 
 
+def _write_dummy_data_to_disk(tmpdir, file_name, table):
+    path = os.path.join(str(tmpdir), file_name)
+    with pa.ipc.RecordBatchFileWriter(path, schema=table.schema) as writer:
+        writer.write_table(table)
+    return path
+
+
 @pytest.mark.skipif(sys.platform == 'win32',
                     reason="ARROW-16392: file based URI is" +
                     " not fully supported for Windows")
@@ -65,12 +72,10 @@ def test_run_serialized_query(tmpdir):
         ]
     }
     """
-    # TODO: replace with ipc when the support is finalized in C++
-    path = os.path.join(str(tmpdir), 'substrait_data.arrow')
-    table = pa.table([[1, 2, 3, 4, 5]], names=['foo'])
-    with pa.ipc.RecordBatchFileWriter(path, schema=table.schema) as writer:
-        writer.write_table(table)
 
+    file_name = "read_data.arrow"
+    table = pa.table([[1, 2, 3, 4, 5]], names=['foo'])
+    path = _write_dummy_data_to_disk(tmpdir, file_name, table)
     query = tobytes(substrait_query.replace("FILENAME_PLACEHOLDER", path))
 
     buf = pa._substrait._parse_json_plan(query)
@@ -92,3 +97,51 @@ def test_invalid_plan():
     exec_message = "Empty substrait plan is passed."
     with pytest.raises(ArrowInvalid, match=exec_message):
         substrait.run_query(buf)
+
+
+@pytest.mark.skipif(sys.platform == 'win32',
+                    reason="ARROW-16392: file based URI is" +
+                    " not fully supported for Windows")
+def test_binary_conversion_with_json_options(tmpdir):
+    substrait_query = """
+    {
+        "relations": [
+        {"rel": {
+            "read": {
+            "base_schema": {
+                "struct": {
+                "types": [
+                            {"i64": {}}
+                        ]
+                },
+                "names": [
+                        "bar"
+                        ]
+            },
+            "local_files": {
+                "items": [
+                {
+                    "uri_file": "file://FILENAME_PLACEHOLDER",
+                    "arrow": {},
+                    "metadata" : {
+                      "created_by" : {},
+                    }
+                }
+                ]
+            }
+            }
+        }}
+        ]
+    }
+    """
+
+    file_name = "binary_json_data.arrow"
+    table = pa.table([[1, 2, 3, 4, 5]], names=['bar'])
+    path = _write_dummy_data_to_disk(tmpdir, file_name, table)
+    query = tobytes(substrait_query.replace("FILENAME_PLACEHOLDER", path))
+    buf = pa._substrait._parse_json_plan(tobytes(query))
+
+    reader = substrait.run_query(buf)
+    res_tb = reader.read_all()
+
+    assert table.select(["bar"]) == res_tb.select(["bar"])

[arrow] branch master updated: ARROW-17066: [C++][Python][Substrait] "ignore_unknown_fields" should be specified when converting JSON to binary (#13605)

Reply via email to