This is an automated email from the ASF dual-hosted git repository.

agrove pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion-python.git


The following commit(s) were added to refs/heads/main by this push:
     new d3d4268  Add support for cudf as a physical execution engine (#205)
d3d4268 is described below

commit d3d42685255246f93933f3abe82301f3db18a52d
Author: Jeremy Dyer <[email protected]>
AuthorDate: Tue Feb 21 20:50:07 2023 -0500

    Add support for cudf as a physical execution engine (#205)
---
 Cargo.lock                                         | 20 +++----
 conda/environments/datafusion-dev.yaml             |  5 +-
 datafusion/cudf.py                                 | 62 ++++++++++++++++++++++
 .../datafusion-dev.yaml => examples/sql-on-cudf.py | 35 ++++--------
 4 files changed, 86 insertions(+), 36 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 5059afa..04a2ea8 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1134,9 +1134,9 @@ dependencies = [
 
 [[package]]
 name = "http"
-version = "0.2.8"
+version = "0.2.9"
 source = "registry+https://github.com/rust-lang/crates.io-index";
-checksum = "75f43d41e26995c17e71ee126451dd3941010b0514a81a9d11f3b341debc2399"
+checksum = "bd6effc99afb63425aff9b05836f029929e345a6148a14b7ecd5ab67af944482"
 dependencies = [
  "bytes",
  "fnv",
@@ -1385,9 +1385,9 @@ checksum = 
"201de327520df007757c1f0adce6e827fe8562fbc28bfd9c15571c66ca1f5f79"
 
 [[package]]
 name = "libflate"
-version = "1.2.0"
+version = "1.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index";
-checksum = "05605ab2bce11bcfc0e9c635ff29ef8b2ea83f29be257ee7d730cac3ee373093"
+checksum = "97822bf791bd4d5b403713886a5fbe8bf49520fe78e323b0dc480ca1a03e50b0"
 dependencies = [
  "adler32",
  "crc32fast",
@@ -1396,9 +1396,9 @@ dependencies = [
 
 [[package]]
 name = "libflate_lz77"
-version = "1.1.0"
+version = "1.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index";
-checksum = "39a734c0493409afcd49deee13c006a04e3586b9761a03543c6272c9c51f2f5a"
+checksum = "a52d3a8bfc85f250440e4424db7d857e241a3aebbbe301f3eb606ab15c39acbf"
 dependencies = [
  "rle-decode-fast",
 ]
@@ -2359,9 +2359,9 @@ dependencies = [
 
 [[package]]
 name = "slab"
-version = "0.4.7"
+version = "0.4.8"
 source = "registry+https://github.com/rust-lang/crates.io-index";
-checksum = "4614a76b2a8be0058caa9dbbaf66d988527d86d003c11a94fbd335d7661edcef"
+checksum = "6528351c9bc8ab22353f9d776db39a20288e8d6c37ef8cfe3317cf875eecfc2d"
 dependencies = [
  "autocfg",
 ]
@@ -2635,9 +2635,9 @@ dependencies = [
 
 [[package]]
 name = "tokio-stream"
-version = "0.1.11"
+version = "0.1.12"
 source = "registry+https://github.com/rust-lang/crates.io-index";
-checksum = "d660770404473ccd7bc9f8b28494a811bc18542b915c0855c51e8f419d5223ce"
+checksum = "8fb52b74f05dbf495a8fba459fdc331812b96aa086d9eb78101fa0d4569c3313"
 dependencies = [
  "futures-core",
  "pin-project-lite",
diff --git a/conda/environments/datafusion-dev.yaml 
b/conda/environments/datafusion-dev.yaml
index 0e17e16..d9405e4 100644
--- a/conda/environments/datafusion-dev.yaml
+++ b/conda/environments/datafusion-dev.yaml
@@ -28,7 +28,7 @@ dependencies:
 - pytest
 - toml
 - importlib_metadata
-- python>=3.7,<3.11
+- python>=3.10
 # Packages useful for building distributions and releasing
 - mamba
 - conda-build
@@ -38,4 +38,7 @@ dependencies:
 - pydata-sphinx-theme==0.8.0
 - myst-parser
 - jinja2
+# GPU packages
+- cudf
+- cudatoolkit=11.8
 name: datafusion-dev
diff --git a/datafusion/cudf.py b/datafusion/cudf.py
new file mode 100644
index 0000000..c38819c
--- /dev/null
+++ b/datafusion/cudf.py
@@ -0,0 +1,62 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import cudf
+import datafusion
+from datafusion.expr import Projection, TableScan, Column
+
+
+class SessionContext:
+    def __init__(self):
+        self.datafusion_ctx = datafusion.SessionContext()
+        self.parquet_tables = {}
+
+    def register_parquet(self, name, path):
+        self.parquet_tables[name] = path
+        self.datafusion_ctx.register_parquet(name, path)
+
+    def to_cudf_expr(self, expr):
+
+        # get Python wrapper for logical expression
+        expr = expr.to_variant()
+
+        if isinstance(expr, Column):
+            return expr.name()
+        else:
+            raise Exception("unsupported expression: {}".format(expr))
+
+    def to_cudf_df(self, plan):
+        # recurse down first to translate inputs into pandas data frames
+        inputs = [self.to_cudf_df(x) for x in plan.inputs()]
+
+        # get Python wrapper for logical operator node
+        node = plan.to_variant()
+
+        if isinstance(node, Projection):
+            args = [self.to_cudf_expr(expr) for expr in node.projections()]
+            return inputs[0][args]
+        elif isinstance(node, TableScan):
+            return cudf.read_parquet(self.parquet_tables[node.table_name()])
+        else:
+            raise Exception(
+                "unsupported logical operator: {}".format(type(node))
+            )
+
+    def sql(self, sql):
+        datafusion_df = self.datafusion_ctx.sql(sql)
+        plan = datafusion_df.logical_plan()
+        return self.to_cudf_df(plan)
diff --git a/conda/environments/datafusion-dev.yaml b/examples/sql-on-cudf.py
similarity index 63%
copy from conda/environments/datafusion-dev.yaml
copy to examples/sql-on-cudf.py
index 0e17e16..407cb1f 100644
--- a/conda/environments/datafusion-dev.yaml
+++ b/examples/sql-on-cudf.py
@@ -6,7 +6,7 @@
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
-#   http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
@@ -15,27 +15,12 @@
 # specific language governing permissions and limitations
 # under the License.
 
-channels:
-- conda-forge
-dependencies:
-- black
-- flake8
-- isort
-- maturin
-- mypy
-- numpy
-- pyarrow
-- pytest
-- toml
-- importlib_metadata
-- python>=3.7,<3.11
-# Packages useful for building distributions and releasing
-- mamba
-- conda-build
-- anaconda-client
-# Packages for documentation building
-- sphinx
-- pydata-sphinx-theme==0.8.0
-- myst-parser
-- jinja2
-name: datafusion-dev
+from datafusion.cudf import SessionContext
+
+
+ctx = SessionContext()
+ctx.register_parquet(
+    "taxi", "/home/jeremy/Downloads/yellow_tripdata_2021-01.parquet"
+)
+df = ctx.sql("select passenger_count from taxi")
+print(df)

Reply via email to