This is an automated email from the ASF dual-hosted git repository.
xushiyan pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git
The following commit(s) were added to refs/heads/master by this push:
new e7e7f3c1641e fix: update utils.py and notebook (#14080)
e7e7f3c1641e is described below
commit e7e7f3c1641e948b47ac737ea0a1e7f57d51c4dd
Author: deepakpanda93 <[email protected]>
AuthorDate: Wed Oct 15 21:25:51 2025 +0530
fix: update utils.py and notebook (#14080)
---
hudi-notebooks/build.sh | 10 +++++-----
hudi-notebooks/notebooks/02-query-types.ipynb | 6 +++---
hudi-notebooks/notebooks/utils.py | 23 +++++++++++++++--------
3 files changed, 23 insertions(+), 16 deletions(-)
diff --git a/hudi-notebooks/build.sh b/hudi-notebooks/build.sh
index adfd493cd220..83847f62c29e 100644
--- a/hudi-notebooks/build.sh
+++ b/hudi-notebooks/build.sh
@@ -17,11 +17,11 @@
set -eux
-export HUDI_VERSION=${HUDI_VERSION:-1.0.2}
+export HUDI_VERSION=1.0.2
export HUDI_VERSION_TAG=${HUDI_VERSION}
-export SPARK_VERSION=${SPARK_VERSION:-3.5.7}
-export HIVE_VERSION=${HIVE_VERSION:-3.1.3}
-export HIVE_VERSION_TAG=${HIVE_VERSION_TAG:-3.1.3}
+export SPARK_VERSION=3.5.7
+export HIVE_VERSION=3.1.3
+export HIVE_VERSION_TAG=${HIVE_VERSION}
SCRIPT_DIR=$(cd $(dirname $0); pwd)
@@ -40,4 +40,4 @@ docker build \
--build-arg HIVE_VERSION="$HIVE_VERSION" \
-t apachehudi/hive:latest \
-t apachehudi/hive:"$HIVE_VERSION_TAG" \
- -f "$SCRIPT_DIR"/Dockerfile.hive .
\ No newline at end of file
+ -f "$SCRIPT_DIR"/Dockerfile.hive .
diff --git a/hudi-notebooks/notebooks/02-query-types.ipynb b/hudi-notebooks/notebooks/02-query-types.ipynb
index 18eaa164b267..f588760479a3 100644
--- a/hudi-notebooks/notebooks/02-query-types.ipynb
+++ b/hudi-notebooks/notebooks/02-query-types.ipynb
@@ -148,7 +148,7 @@
"outputs": [],
"source": [
"table_name_cow = \"trips_table_cow\"\n",
- "base_path = f\"s3a://warehouse/hudi-query-types/{table_name_cow}\"\n",
+ "base_path = f\"s3a://warehouse/hudi-query-types\"\n",
"\n",
"cow_hudi_conf = {\n",
"    \"hoodie.table.name\": table_name_cow, # The name of our Hudi table.\n",
@@ -395,7 +395,7 @@
"source": [
"spark.read.format(\"hudi\") \\\n",
" .option(\"as.of.instant\", beginTime) \\\n",
- " .load(base_path).createOrReplaceTempView(\"trips_time_travel\")"
+    "    .load(f\"{base_path}/{table_name_cow}\").createOrReplaceTempView(\"trips_time_travel\")"
]
},
{
@@ -565,7 +565,7 @@
"outputs": [],
"source": [
"table_name_mor = \"trips_table_mor\"\n",
- "base_path = f\"s3a://warehouse/hudi-query-types/{table_name_mor}\"\n",
+ "base_path = f\"s3a://warehouse/hudi-query-types\"\n",
"\n",
"mor_hudi_conf = {\n",
" \"hoodie.table.name\": table_name_mor,\n",
diff --git a/hudi-notebooks/notebooks/utils.py b/hudi-notebooks/notebooks/utils.py
index bfad98259be0..1a888ffe3ab0 100644
--- a/hudi-notebooks/notebooks/utils.py
+++ b/hudi-notebooks/notebooks/utils.py
@@ -32,6 +32,7 @@ def get_spark_session(app_name="Hudi-Notebooks"):
spark_session = SparkSession.builder \
.appName(app_name) \
+ .config("spark.hadoop.fs.defaultFS", "s3a://warehouse") \
.enableHiveSupport() \
.getOrCreate()
@@ -40,6 +41,9 @@ def get_spark_session(app_name="Hudi-Notebooks"):
return spark_session
+# Initialize Spark globally so other functions can use it
+spark = get_spark_session()
+
# S3 Utility Function
def ls(base_path):
"""
@@ -50,13 +54,16 @@ def ls(base_path):
if not base_path.startswith("s3a://"):
raise ValueError("Path must start with 's3a://'")
try:
- parsed = urlparse(base_path)
- bucket_name = parsed.netloc
- prefix = parsed.path.lstrip("/")
- s3_client = boto3.client('s3')
- response = s3_client.list_objects_v2(Bucket=bucket_name, Prefix=prefix)
- for obj in response['Contents']:
- print(f"s3a://{bucket_name}/{obj['Key']}")
+ hadoop_conf = spark._jsc.hadoopConfiguration()
+ fs = spark._jvm.org.apache.hadoop.fs.FileSystem.get(hadoop_conf)
+ p = spark._jvm.org.apache.hadoop.fs.Path(base_path)
+ if not fs.exists(p):
+ print(f"Path does not exist: {base_path}")
+ return []
+ status = fs.listStatus(p)
+ files = [str(file.getPath()) for file in status]
+ for f in files:
+ print(f)
except Exception as e:
print(f"Exception occurred while listing files from path {base_path}", e)
@@ -120,4 +127,4 @@ def display(df, num_rows=100):
"""
# Display the final HTML
- display_html(HTML(custom_css + html_table))
\ No newline at end of file
+ display_html(HTML(custom_css + html_table))