This is an automated email from the ASF dual-hosted git repository.

jmalkin pushed a commit to branch python
in repository https://gitbox.apache.org/repos/asf/datasketches-spark.git


The following commit(s) were added to refs/heads/python by this push:
     new a9144fe  finish setting up config to support pyspark package, including methods to simplify use
a9144fe is described below

commit a9144fe09f4d262d6da114ec8ce28c5ea3c8bcd7
Author: Jon Malkin <[email protected]>
AuthorDate: Wed Feb 12 20:28:13 2025 -0800

    finish setting up config to support pyspark package, including methods to simplify use
---
 project/BuildUtils.scala                |  3 +-
 python/MANIFEST.in                      |  5 +--
 python/pyproject.toml                   | 10 +----
 python/setup.py                         | 78 ++++++++++++---------------------
 python/src/datasketches_spark/common.py | 47 +++++++++++++-------
 version.cfg => python/tests/__init__.py |  2 -
 python/tests/kll_test.py                | 43 ++++++++++++++++++
 python/{MANIFEST.in => tox.ini}         | 11 +++--
 version.cfg                             |  5 ++-
 9 files changed, 118 insertions(+), 86 deletions(-)

diff --git a/project/BuildUtils.scala b/project/BuildUtils.scala
index e0dd64a..a9688a9 100644
--- a/project/BuildUtils.scala
+++ b/project/BuildUtils.scala
@@ -33,8 +33,7 @@ val jvmVersionMap = Map(
 )
 
 // TODO: any way to avoid hardcoding this?
-//val pythonVersionFileName = "python/src/datasketches_spark/version.py"
-val pythonVersionFileName = "python/version.txt"
+val pythonVersionFileName = "python/src/datasketches_spark/version.txt"
 
 // reads the version file, reformats as needed for python, and stores
 // in the python subdirectory as the __version__ function for the package
diff --git a/python/MANIFEST.in b/python/MANIFEST.in
index 1ef57bb..a5e9235 100644
--- a/python/MANIFEST.in
+++ b/python/MANIFEST.in
@@ -15,8 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 
-include version.txt
-
 graft src
 graft tests
-graft src/datasketches_spark/deps/*
\ No newline at end of file
+
+include src/datasketches_spark/deps/*
diff --git a/python/pyproject.toml b/python/pyproject.toml
index 648773d..5f41e9e 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -21,7 +21,6 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "datasketches_spark"
-#version = { file = "version.txt" }
 dynamic = ["version"]
 description = "The Apache DataSketches Library for Python"
 authors = [
@@ -38,15 +37,10 @@ dependencies = [
 package-dir = { "" = "src" }
 
 [tool.setuptools.dynamic]
-version = { file = "version.txt" }
+version = { file = "src/datasketches_spark/version.txt" }
 
 [tool.setuptools.package-data]
-datasketches_spark = ["deps/*"]
-
-[tools.setuptools.find]
-where = ["src"]
-include = ["datasketches_spark"]
-exclude = ["datasketches_spark.deps", "datasketches_spark.tests"]
+datasketches_spark = ["version.txt", "deps/*"]
 
 [tool.cibuildwheel]
 build-verbosity = 0  # options: 1, 2, or 3
diff --git a/python/setup.py b/python/setup.py
index 5a1c670..57a73f0 100644
--- a/python/setup.py
+++ b/python/setup.py
@@ -40,58 +40,38 @@ SCALA_VERSION environment variable.
 Then return to this directory and resume building your sdist or wheel.
 """
 
-# TODO: for tox, check if files exist in DEPTS_PATH
-# can copy if /target is newer or dept does not have files
+def check_or_copy_files(filename_pattern: str, src: str, dst: str) -> None:
+    """
+    Checks whether file(s) matching filename_pattern exist in dst, copying from src when src has a newer version
+    """
 
-# Find the datasketches-spark jar path -- other dependencies handled separately
-DS_SPARK_JAR_PATH = glob.glob(os.path.join(DS_SPARK_HOME, "target/scala-*/"))
-if len(DS_SPARK_JAR_PATH) == 0:
-    print(missing_jars_message, file=sys.stderr)
-    sys.exit(-1)
+    # list files matching the pattern in src, check if each exists in dst
+    # copy if src has a newer version
+    src_list = glob.glob(os.path.join(src, filename_pattern))
+    if len(src_list) > 0:
+        for src_file in src_list:
+            dst_file = os.path.join(dst, os.path.basename(src_file))
+            if os.path.exists(dst_file):
+                if os.path.getmtime(src_file) > os.path.getmtime(dst_file):
+                    copyfile(src_file, dst_file)
+            else:
+                copyfile(src_file, dst_file)
 
-# Find the datasketches-java and datasketches-memory dependency jar path
-DS_JAVA_LIB_PATH = glob.glob(os.path.join(DS_SPARK_HOME, "target/lib/"))
-if len(DS_JAVA_LIB_PATH) == 1:
-    DS_JAVA_LIB_PATH = DS_JAVA_LIB_PATH[0]
-else: # error if something other than 1 directory found
-    print(missing_jars_message, file=sys.stderr)
-    sys.exit(-1)
+    # any needed copying is done, so verify the file now exists in dst
+    dst_file = glob.glob(os.path.join(dst, filename_pattern))
+    if len(dst_file) == 0:
+        print(missing_jars_message, file=sys.stderr)
+        sys.exit(-1)
 
-# Copy the jars to the temporary directory
-# Future possible enhancement: symlink instead of copy
-try:
-    os.makedirs(DEPS_PATH)
-except OSError:
-    # we don't care if it already exists
-    pass
 
-# Copy the relevant jar files to temp path
-for path in DS_SPARK_JAR_PATH:
-    for jar_file in glob.glob(os.path.join(path, f"datasketches-spark_*.jar")):
-        copyfile(jar_file, os.path.join(DEPS_PATH, os.path.basename(jar_file)))
+# Find the datasketches-spark jar path -- other dependencies handled separately
+sbt_scala_dir = os.path.join(DS_SPARK_HOME, "target", "scala-*")
+check_or_copy_files("datasketches-spark_*.jar", sbt_scala_dir, DEPS_PATH)
 
-# copy any ds-java and ds-memory jars, and dependencies.txt, too
-for jar_file in glob.glob(os.path.join(DS_JAVA_LIB_PATH, f"datasketches-java-*.jar")):
-    copyfile(jar_file, os.path.join(DEPS_PATH, os.path.basename(jar_file)))
-for jar_file in glob.glob(os.path.join(DS_JAVA_LIB_PATH, f"datasketches-memory-*.jar")):
-    copyfile(jar_file, os.path.join(DEPS_PATH, os.path.basename(jar_file)))
-for jar_file in glob.glob(os.path.join(DS_JAVA_LIB_PATH, f"dependencies.txt")):
-    copyfile(jar_file, os.path.join(DEPS_PATH, os.path.basename(jar_file)))
+# Find the datasketches-java and datasketches-memory dependency jar path
+sbt_lib_dir = os.path.join(DS_SPARK_HOME, "target", "lib")
+check_or_copy_files("datasketches-java-*.jar", sbt_lib_dir, DEPS_PATH)
+check_or_copy_files("datasketches-memory-*.jar", sbt_lib_dir, DEPS_PATH)
+check_or_copy_files("dependencies.txt", sbt_lib_dir, DEPS_PATH)
 
-setup(
-    #version = VERSION
-    # name='datasketches_spark',
-    # author='Apache Software Foundation',
-    # author_email='[email protected]',
-    # description='The Apache DataSketches Library for Python',
-    # license='Apache License 2.0',
-    # url='http://datasketches.apache.org',
-    # long_description=open('README.md').read(),
-    # long_description_content_type='text/markdown',
-    # include_package_data=True,
-    # package_dir={'':'src'},
-    # packages=find_packages(where='src'),
-    # install_requires=['pyspark'],
-    # python_requires='>=3.8',
-    # zip_safe=False
-)
+setup()
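
With the package metadata now declared in pyproject.toml, the bare setup() call above is all that remains; staging the jars into DEPS_PATH is the only imperative step left in this script. The implied flow (the exact sbt tasks are an assumption, inferred from the target/scala-* and target/lib paths searched above) is to build the jars with sbt first, then build the sdist or wheel from python/, at which point this script copies any newer jars into src/datasketches_spark/deps/ or exits with missing_jars_message if none are found.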
diff --git a/python/src/datasketches_spark/common.py b/python/src/datasketches_spark/common.py
index b812b40..d325186 100644
--- a/python/src/datasketches_spark/common.py
+++ b/python/src/datasketches_spark/common.py
@@ -20,31 +20,44 @@ from pyspark.sql.column import Column, _to_java_column, _to_seq, _create_column_
 from py4j.java_gateway import JavaClass
 from typing import Any, TypeVar, Union, Callable
 from functools import lru_cache
+from ._version import __version__
 
 import os
-import pkg_resources
+import importlib.resources
 
 ColumnOrName = Union[Column, str]
 ColumnOrName_ = TypeVar("ColumnOrName_", bound=ColumnOrName)
 
-def get_jar_paths(*jar_names: str) -> list[str]:
+def get_dependency_path(filename: str) -> str:
     """
-    Returns a list of absolute paths to the provided jars,\n
-    assuming they are included in the package.
-    :param jar_names: Names of jars to retrieve
-    :return: List of absolute paths to jars
+    Returns the absolute path to the specified file,\n
+    assuming it is included in the package's /deps subdir.
+    :param filename: Name of file to retrieve
+    :return: Absolute path to filename
+    :exception FileNotFoundError: If a file is not found in the package
     """
-    jar_paths = []
-    for jar_name in jar_names:
-        try:
-            jar_path = pkg_resources.resource_filename(__name__, f"deps/{jar_name}")
-            if os.path.exists(jar_path):
-                jar_paths.append(jar_path)
-            else:
-                raise FileNotFoundError(f"Jar {jar_name} not found in package")
-        except ValueError:
-            raise FileNotFoundError(f"Jar {jar_name} not found in package")
-    return jar_paths
+    try:
+        with importlib.resources.path("datasketches_spark.deps", filename) as file_path:
+            return str(file_path)
+    except FileNotFoundError:
+        raise FileNotFoundError(f"File {filename} not found in datasketches_spark.deps")
+
+def get_dependency_classpath() -> str:
+    """
+    Returns a colon-joined classpath string built from the jar files included in the package.\n
+    Assumes that the jar files are located in the package's /deps subdir.
+    """
+
+    # we need whatever is listed in dependencies.txt as well as
+    # datasketches-spark_<scala_version>-<ds-spark_version>.jar
+    jar_files = []
+    with importlib.resources.open_text("datasketches_spark.deps", "dependencies.txt") as dependencies:
+        for dep in dependencies:
+            jar_files.append(dep.strip())
+    ds_spark_version = __version__
+    jar_files.append(f"datasketches-spark_{os.environ.get('SCALA_VERSION', '2.12')}-{ds_spark_version}.jar")
+
+    return ":".join([get_dependency_path(jar) for jar in jar_files])
 
 
 # Since we have functions from different packages, rather than the
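
A minimal usage sketch for the two helpers above, mirroring the test added below (the SparkSession config keys are standard Spark options, not part of this commit):

    from pyspark.sql import SparkSession
    from datasketches_spark import get_dependency_classpath

    # get_dependency_classpath() yields a colon-joined string of absolute
    # jar paths, ready to pass to Spark's extraClassPath settings
    spark = (
        SparkSession.builder
            .appName("example")
            .config("spark.driver.extraClassPath", get_dependency_classpath())
            .config("spark.executor.extraClassPath", get_dependency_classpath())
            .getOrCreate()
    )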
diff --git a/version.cfg b/python/tests/__init__.py
similarity index 98%
copy from version.cfg
copy to python/tests/__init__.py
index 2a56927..13a8339 100644
--- a/version.cfg
+++ b/python/tests/__init__.py
@@ -14,5 +14,3 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
-
-0.1.0-SNAPSHOT
diff --git a/python/tests/kll_test.py b/python/tests/kll_test.py
new file mode 100644
index 0000000..c142049
--- /dev/null
+++ b/python/tests/kll_test.py
@@ -0,0 +1,43 @@
+
+import unittest
+from pyspark.sql.types import StructType, StructField, DoubleType
+from pyspark.sql.session import SparkSession
+
+from datasketches_spark import get_dependency_classpath
+from datasketches_spark.kll import *
+
+class PySparkBase(unittest.TestCase):
+  @classmethod
+  def setUpClass(cls):
+    cls.spark = (
+          SparkSession.builder
+            .appName("test")
+            .master("local[1]")
+            .config("spark.driver.userClassPathFirst", "true")
+            .config("spark.executor.userClassPathFirst", "true")
+            .config("spark.driver.extraClassPath", get_dependency_classpath())
+            .config("spark.executor.extraClassPath", 
get_dependency_classpath())
+            .getOrCreate()
+        )
+
+  @classmethod
+  def tearDownClass(cls):
+    cls.spark.stop()
+
+class TestKll(PySparkBase):
+  def test_kll(self):
+    spark = self.spark
+
+    # Create a DataFrame
+    n = 100000
+    data = [(float(i),) for i in range(1, n + 1)]
+    schema = StructType([StructField("value", DoubleType(), True)])
+    df = spark.createDataFrame(data, schema)
+    df_agg = df.agg(kll_sketch_double_agg_build("value", 160).alias("sketch"))
+    df_agg.show()
+
+    df_agg.select(
+      kll_sketch_double_get_min("sketch").alias("min"),
+      kll_sketch_double_get_max("sketch").alias("max"),
+      kll_sketch_double_get_pmf("sketch", [25000, 30000, 75000]).alias("pmf")
+    ).show()
diff --git a/python/MANIFEST.in b/python/tox.ini
similarity index 88%
copy from python/MANIFEST.in
copy to python/tox.ini
index 1ef57bb..febc6cd 100644
--- a/python/MANIFEST.in
+++ b/python/tox.ini
@@ -15,8 +15,11 @@
 # specific language governing permissions and limitations
 # under the License.
 
-include version.txt
+[tox]
+envlist = py3
+isolated_build = true
 
-graft src
-graft tests
-graft src/datasketches_spark/deps/*
\ No newline at end of file
+[testenv]
+deps = pytest
+changedir = tests
+commands = pytest
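
Per the config above, running tox from the python/ directory performs an isolated build of the package, installs pytest into the test environment, changes into tests/, and runs pytest there, so the new kll_test.py is collected automatically.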
diff --git a/version.cfg b/version.cfg
index 2a56927..a480287 100644
--- a/version.cfg
+++ b/version.cfg
@@ -15,4 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 
-0.1.0-SNAPSHOT
+# Using python-compatible naming to simplify python packaging
+# when using for non-release branches. This intentionally
+# differs from the usual -SNAPSHOT suffix on our jars
+0.1.0.dev0
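
The suffix change matters because setuptools and pip require PEP 440 version strings, which 0.1.0.dev0 satisfies and 0.1.0-SNAPSHOT does not. A quick check, assuming the third-party packaging library is available:

    from packaging.version import Version, InvalidVersion

    Version("0.1.0.dev0")          # parses as a PEP 440 development release
    try:
        Version("0.1.0-SNAPSHOT")  # Maven-style suffix
    except InvalidVersion:
        print("not a valid PEP 440 version")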


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]
