This is an automated email from the ASF dual-hosted git repository.
xushiyan pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git
The following commit(s) were added to refs/heads/master by this push:
new e7e7f3c1641e fix: update utils.py and notebook (#14080)
e7e7f3c1641e is described below
commit e7e7f3c1641e948b47ac737ea0a1e7f57d51c4dd
Author: deepakpanda93 <[email protected]>
AuthorDate: Wed Oct 15 21:25:51 2025 +0530
fix: update utils.py and notebook (#14080)
---
hudi-notebooks/build.sh | 10 +++++-----
hudi-notebooks/notebooks/02-query-types.ipynb | 6 +++---
hudi-notebooks/notebooks/utils.py | 23 +++++++++++++++--------
3 files changed, 23 insertions(+), 16 deletions(-)
diff --git a/hudi-notebooks/build.sh b/hudi-notebooks/build.sh
index adfd493cd220..83847f62c29e 100644
--- a/hudi-notebooks/build.sh
+++ b/hudi-notebooks/build.sh
@@ -17,11 +17,11 @@
set -eux
-export HUDI_VERSION=${HUDI_VERSION:-1.0.2}
+export HUDI_VERSION=1.0.2
export HUDI_VERSION_TAG=${HUDI_VERSION}
-export SPARK_VERSION=${SPARK_VERSION:-3.5.7}
-export HIVE_VERSION=${HIVE_VERSION:-3.1.3}
-export HIVE_VERSION_TAG=${HIVE_VERSION_TAG:-3.1.3}
+export SPARK_VERSION=3.5.7
+export HIVE_VERSION=3.1.3
+export HIVE_VERSION_TAG=${HIVE_VERSION}
SCRIPT_DIR=$(cd $(dirname $0); pwd)
@@ -40,4 +40,4 @@ docker build \
--build-arg HIVE_VERSION="$HIVE_VERSION" \
-t apachehudi/hive:latest \
-t apachehudi/hive:"$HIVE_VERSION_TAG" \
- -f "$SCRIPT_DIR"/Dockerfile.hive .
\ No newline at end of file
+ -f "$SCRIPT_DIR"/Dockerfile.hive .
diff --git a/hudi-notebooks/notebooks/02-query-types.ipynb b/hudi-notebooks/notebooks/02-query-types.ipynb
index 18eaa164b267..f588760479a3 100644
--- a/hudi-notebooks/notebooks/02-query-types.ipynb
+++ b/hudi-notebooks/notebooks/02-query-types.ipynb
@@ -148,7 +148,7 @@
"outputs": [],
"source": [
"table_name_cow = \"trips_table_cow\"\n",
- "base_path = f\"s3a://warehouse/hudi-query-types/{table_name_cow}\"\n",
+ "base_path = f\"s3a://warehouse/hudi-query-types\"\n",
"\n",
"cow_hudi_conf = {\n",
"    \"hoodie.table.name\": table_name_cow, # The name of our Hudi table.\n",
@@ -395,7 +395,7 @@
"source": [
"spark.read.format(\"hudi\") \\\n",
" .option(\"as.of.instant\", beginTime) \\\n",
- " .load(base_path).createOrReplaceTempView(\"trips_time_travel\")"
+    "    .load(f\"{base_path}/{table_name_cow}\").createOrReplaceTempView(\"trips_time_travel\")"
]
},
{
@@ -565,7 +565,7 @@
"outputs": [],
"source": [
"table_name_mor = \"trips_table_mor\"\n",
- "base_path = f\"s3a://warehouse/hudi-query-types/{table_name_mor}\"\n",
+ "base_path = f\"s3a://warehouse/hudi-query-types\"\n",
"\n",
"mor_hudi_conf = {\n",
" \"hoodie.table.name\": table_name_mor,\n",
diff --git a/hudi-notebooks/notebooks/utils.py b/hudi-notebooks/notebooks/utils.py
index bfad98259be0..1a888ffe3ab0 100644
--- a/hudi-notebooks/notebooks/utils.py
+++ b/hudi-notebooks/notebooks/utils.py
@@ -32,6 +32,7 @@ def get_spark_session(app_name="Hudi-Notebooks"):
spark_session = SparkSession.builder \
.appName(app_name) \
+ .config("spark.hadoop.fs.defaultFS", "s3a://warehouse") \
.enableHiveSupport() \
.getOrCreate()
@@ -40,6 +41,9 @@ def get_spark_session(app_name="Hudi-Notebooks"):
return spark_session
+# Initialize Spark globally so other functions can use it
+spark = get_spark_session()
+
# S3 Utility Function
def ls(base_path):
"""
@@ -50,13 +54,16 @@ def ls(base_path):
if not base_path.startswith("s3a://"):
raise ValueError("Path must start with 's3a://'")
try:
- parsed = urlparse(base_path)
- bucket_name = parsed.netloc
- prefix = parsed.path.lstrip("/")
- s3_client = boto3.client('s3')
- response = s3_client.list_objects_v2(Bucket=bucket_name, Prefix=prefix)
- for obj in response['Contents']:
- print(f"s3a://{bucket_name}/{obj['Key']}")
+ hadoop_conf = spark._jsc.hadoopConfiguration()
+ fs = spark._jvm.org.apache.hadoop.fs.FileSystem.get(hadoop_conf)
+ p = spark._jvm.org.apache.hadoop.fs.Path(base_path)
+ if not fs.exists(p):
+ print(f"Path does not exist: {base_path}")
+ return []
+ status = fs.listStatus(p)
+ files = [str(file.getPath()) for file in status]
+ for f in files:
+ print(f)
except Exception as e:
print(f"Exception occurred while listing files from path {base_path}", e)
@@ -120,4 +127,4 @@ def display(df, num_rows=100):
"""
# Display the final HTML
- display_html(HTML(custom_css + html_table))
\ No newline at end of file
+ display_html(HTML(custom_css + html_table))