[GitHub] [arrow] vibhatha commented on a diff in pull request #14024: ARROW-17521: [Python] Add python bindings for NamedTableProvider for Substrait consumer

GitBox Tue, 13 Sep 2022 17:21:24 -0700


vibhatha commented on code in PR #14024:
URL: https://github.com/apache/arrow/pull/14024#discussion_r970186454



##########
python/pyarrow/_substrait.pyx:
##########
@@ -17,22 +17,98 @@
 
 # cython: language_level = 3
 from cython.operator cimport dereference as deref
+from libcpp.vector cimport vector as std_vector
 
 from pyarrow import Buffer
-from pyarrow.lib import frombytes
+from pyarrow.lib import frombytes, tobytes
 from pyarrow.lib cimport *
 from pyarrow.includes.libarrow cimport *
 from pyarrow.includes.libarrow_substrait cimport *
 
 
-def run_query(plan):
+cdef CDeclaration _create_named_table_provider(dict named_args, const 
std_vector[c_string]& names):
+    cdef:
+        c_string c_name
+        shared_ptr[CTable] c_in_table
+        shared_ptr[CTableSourceNodeOptions] c_tablesourceopts
+        shared_ptr[CExecNodeOptions] c_input_node_opts
+        vector[CDeclaration.Input] no_c_inputs
+
+    py_names = []
+    for i in range(names.size()):
+        c_name = names[i]
+        py_names.append(frombytes(c_name))
+
+    py_table = named_args["provider"](py_names)
+    c_in_table = pyarrow_unwrap_table(py_table)
+    c_tablesourceopts = make_shared[CTableSourceNodeOptions](c_in_table)
+    c_input_node_opts = static_pointer_cast[CExecNodeOptions, 
CTableSourceNodeOptions](
+        c_tablesourceopts)
+    return CDeclaration(tobytes("table_source"),
+                        no_c_inputs, c_input_node_opts)
+
+
+def run_query(plan, table_provider=None):
     """
     Execute a Substrait plan and read the results as a RecordBatchReader.
 
     Parameters
     ----------
     plan : Buffer
         The serialized Substrait plan to execute.
+    table_provider : object (optional)
+        A function to resolve any NamedTable relation to a table.
+        The function will receive a single argument which will be a list
+        of strings representing the table name and should return a 
pyarrow.Table.
+
+    Returns
+    -------
+    RecordBatchReader
+        A reader containing the result of the executed query
+
+    Examples
+    --------
+    >>> import pyarrow as pa
+    >>> from pyarrow.lib import tobytes
+    >>> import pyarrow.substrait as substrait
+    >>> test_table_1 = pa.Table.from_pydict({"x": [1, 2, 3]})
+    >>> test_table_2 = pa.Table.from_pydict({"x": [4, 5, 6]})
+    >>> def table_provider(names):
+    ...     if not names:
+    ...        raise Exception("No names provided")
+    ...     elif names[0] == "t1":
+    ...        return test_table_1
+    ...     elif names[1] == "t2":
+    ...        return test_table_2
+    ...     else:
+    ...        raise Exception("Unrecognized table name")
+    ... 
+    >>> substrait_query = '''
+    ...         {
+    ...             "relations": [
+    ...             {"rel": {
+    ...                 "read": {
+    ...                 "base_schema": {
+    ...                     "struct": {
+    ...                     "types": [
+    ...                                 {"i64": {}}
+    ...                             ]
+    ...                     },
+    ...                     "names": [
+    ...                             "x"
+    ...                             ]
+    ...                 },
+    ...                 "namedTable": {
+    ...                         "names": ["t1"]
+    ...                 }
+    ...                 }
+    ...             }}
+    ...             ]
+    ...         }
+    ... '''
+    >>> buf = pa._substrait._parse_json_plan(tobytes(substrait_query))
+    >>> pa.substrait.run_query_with_provider(buf, table_provider)
+    <pyarrow.lib.RecordBatchReader object at 0x100dcaf10>

Review Comment:
   I see... let me come up with a better solution.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

[GitHub] [arrow] vibhatha commented on a diff in pull request #14024: ARROW-17521: [Python] Add python bindings for NamedTableProvider for Substrait consumer

Reply via email to