gh-yzou commented on code in PR #1862: URL: https://github.com/apache/polaris/pull/1862#discussion_r2178168915
##########
plugins/spark/v3.5/integration/build.gradle.kts:
##########
@@ -60,12 +60,51 @@ dependencies {
     exclude("org.apache.logging.log4j", "log4j-core")
     exclude("org.slf4j", "jul-to-slf4j")
   }
+
+  // Add spark-hive for Hudi integration - provides HiveExternalCatalog that Hudi needs
+  testImplementation("org.apache.spark:spark-hive_${scalaVersion}:${spark35Version}") {
+    // exclude log4j dependencies to match spark-sql exclusions

Review Comment:
   Can we remove the spark_sql dependency above?

##########
plugins/spark/v3.5/integration/src/intTest/resources/logback.xml:
##########
@@ -32,6 +32,9 @@ out the configuration if you would like ot see all spark debug log during the ru
     </encoder>
   </appender>
 
+  <!-- Hudi-specific loggers for test -->
+  <logger name="org.apache.hudi" level="INFO"/>

Review Comment:
   Does Hudi output a lot of logs? Too much logging was hurting test efficiency, so we only turned on error-level logging here. How long does the integration test take now?

##########
plugins/spark/v3.5/spark/build.gradle.kts:
##########
@@ -46,6 +46,47 @@ dependencies {
   // TODO: extract a polaris-rest module as a thin layer for
   // client to depends on.
   implementation(project(":polaris-core")) { isTransitive = false }
+  implementation(project(":polaris-api-iceberg-service")) {

Review Comment:
   Why does Hudi need those dependencies?

##########
plugins/spark/v3.5/spark/src/main/java/org/apache/polaris/spark/utils/PolarisCatalogUtils.java:
##########
@@ -87,6 +119,32 @@ public static Table loadSparkTable(GenericTable genericTable) {
         provider, new CaseInsensitiveStringMap(tableProperties), scala.Option.empty());
   }
 
+  public static Table loadHudiSparkTable(GenericTable genericTable, Identifier identifier) {
+    SparkSession sparkSession = SparkSession.active();
+    Map<String, String> tableProperties = Maps.newHashMap();
+    tableProperties.putAll(genericTable.getProperties());
+    tableProperties.put(
+        TABLE_PATH_KEY, genericTable.getProperties().get(TableCatalog.PROP_LOCATION));
+    String namespacePath = String.join(".", identifier.namespace());
+    TableIdentifier tableIdentifier =
+        new TableIdentifier(identifier.name(), Option.apply(namespacePath));
+    CatalogTable catalogTable = null;
+    try {
+      catalogTable = sparkSession.sessionState().catalog().getTableMetadata(tableIdentifier);
+    } catch (NoSuchDatabaseException e) {
+      throw new RuntimeException(
+          "No database found for the given tableIdentifier:" + tableIdentifier, e);
+    } catch (NoSuchTableException e) {
+      LOG.debug("No table currently exists, as an initial create table");
+    }
+    return new HoodieInternalV2Table(

Review Comment:
   Does the Hudi catalog provide any load-table functionality that also reads the Hudi logs? If yes, we could load the Hudi catalog here and call those functions; it would be better if we could avoid any table-format-specific dependency in the client.
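For illustration, a minimal sketch of the delegation that comment suggests, assuming the client can obtain Hudi's TableCatalog implementation at runtime (the way hudiCatalog is supplied is hypothetical, not this PR's actual code; a reflection-based sketch for obtaining it follows further below):

    import org.apache.spark.sql.catalyst.analysis.NoSuchTableException;
    import org.apache.spark.sql.connector.catalog.Identifier;
    import org.apache.spark.sql.connector.catalog.Table;
    import org.apache.spark.sql.connector.catalog.TableCatalog;

    /**
     * Sketch only: instead of constructing HoodieInternalV2Table in PolarisCatalogUtils,
     * hand the identifier to Hudi's own catalog and let it read the .hoodie timeline under
     * the table location.
     */
    final class HudiLoadTableSketch {
      static Table loadHudiTable(TableCatalog hudiCatalog, Identifier identifier)
          throws NoSuchTableException {
        // Delegation keeps Hudi-specific logic (timeline reads, table construction)
        // behind Hudi's own catalog, so the Polaris client stays format-agnostic.
        return hudiCatalog.loadTable(identifier);
      }
    }
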
+ exclude("com.azure", "*") + exclude("software.amazon.awssdk", "*") + exclude("com.google.cloud", "*") + exclude("io.airlift", "*") + exclude("io.smallrye", "*") + exclude("io.smallrye.common", "*") + exclude("io.swagger", "*") + exclude("org.apache.commons", "*") + } + implementation(project(":polaris-api-catalog-service")) { + exclude("org.apache.iceberg", "*") + exclude("com.azure", "*") + exclude("software.amazon.awssdk", "*") + exclude("com.google.cloud", "*") + exclude("io.airlift", "*") + exclude("io.smallrye", "*") + exclude("io.smallrye.common", "*") + exclude("io.swagger", "*") + exclude("org.apache.commons", "*") + } + implementation(project(":polaris-core")) { + exclude("org.apache.iceberg", "*") + exclude("com.azure", "*") + exclude("software.amazon.awssdk", "*") + exclude("com.google.cloud", "*") + exclude("io.airlift", "*") + exclude("io.smallrye", "*") + exclude("io.smallrye.common", "*") + exclude("io.swagger", "*") + exclude("org.apache.commons", "*") + } + + implementation("org.apache.iceberg:iceberg-core:${icebergVersion}") + compileOnly("org.apache.hudi:hudi-spark3.5-bundle_${scalaVersion}:0.15.0") Review Comment: i don't think we need to depends on hudi bundle here, similar as how we handles delta ########## plugins/spark/v3.5/spark/build.gradle.kts: ########## @@ -46,6 +46,47 @@ dependencies { // TODO: extract a polaris-rest module as a thin layer for // client to depends on. implementation(project(":polaris-core")) { isTransitive = false } + implementation(project(":polaris-api-iceberg-service")) { + // exclude the iceberg dependencies, use the ones pulled + // by iceberg-core + exclude("org.apache.iceberg", "*") + // exclude all cloud and quarkus specific dependencies to avoid + // running into problems with signature files. + exclude("com.azure", "*") + exclude("software.amazon.awssdk", "*") + exclude("com.google.cloud", "*") + exclude("io.airlift", "*") + exclude("io.smallrye", "*") + exclude("io.smallrye.common", "*") + exclude("io.swagger", "*") + exclude("org.apache.commons", "*") + } + implementation(project(":polaris-api-catalog-service")) { + exclude("org.apache.iceberg", "*") + exclude("com.azure", "*") + exclude("software.amazon.awssdk", "*") + exclude("com.google.cloud", "*") + exclude("io.airlift", "*") + exclude("io.smallrye", "*") + exclude("io.smallrye.common", "*") + exclude("io.swagger", "*") + exclude("org.apache.commons", "*") + } + implementation(project(":polaris-core")) { + exclude("org.apache.iceberg", "*") + exclude("com.azure", "*") + exclude("software.amazon.awssdk", "*") + exclude("com.google.cloud", "*") + exclude("io.airlift", "*") + exclude("io.smallrye", "*") + exclude("io.smallrye.common", "*") + exclude("io.swagger", "*") + exclude("org.apache.commons", "*") + } + + implementation("org.apache.iceberg:iceberg-core:${icebergVersion}") Review Comment: we do not intend to depends on the iceberg core, it causes spark compatible issues ########## plugins/spark/v3.5/spark/src/main/java/org/apache/polaris/spark/SparkCatalog.java: ########## @@ -270,18 +286,24 @@ public Map<String, String> loadNamespaceMetadata(String[] namespace) public void createNamespace(String[] namespace, Map<String, String> metadata) throws NamespaceAlreadyExistsException { this.icebergsSparkCatalog.createNamespace(namespace, metadata); + HudiCatalogUtils.createNamespace(namespace, metadata); Review Comment: What is the difference between delta and hudi? 
##########
plugins/spark/v3.5/spark/src/main/java/org/apache/polaris/spark/utils/HudiCatalogUtils.java:
##########
@@ -0,0 +1,220 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.polaris.spark.utils;
+
+import java.util.Map;
+import org.apache.spark.sql.SparkSession;
+import org.apache.spark.sql.connector.catalog.NamespaceChange;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Utility class for Hudi-specific catalog operations, particularly namespace synchronization
+ * between Polaris catalog and Spark session catalog for Hudi compatibility.
+ *
+ * <p>Hudi table loading requires namespace validation through the session catalog, but only the
+ * Polaris catalog contains the actual namespace metadata. This class provides methods to
+ * synchronize namespace operations to maintain consistency between catalogs.
+ */
+public class HudiCatalogUtils {
+  private static final Logger LOG = LoggerFactory.getLogger(HudiCatalogUtils.class);
+
+  /**
+   * Synchronizes namespace creation to session catalog when Hudi extension is enabled. This ensures

Review Comment:
   @rahil-c this part of the change is not what I expected; I might need some more time to understand the motivation for it. Can you update the project README with instructions on how to use Hudi? I want to check out the code and try it out. Thanks!
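Until the README is updated, a sketch of the wiring one would presumably use to try Hudi against the Polaris Spark catalog. The catalog name ("polaris") and the REST option keys (uri, warehouse) are assumptions based on common REST-catalog setups, not taken from this PR; the Hudi extension class is the one documented by Hudi, and the create statement includes a LOCATION because the PR requires one:

    import org.apache.spark.sql.SparkSession;

    /** Sketch of a README-style quickstart for Hudi tables through the Polaris Spark catalog. */
    public final class HudiQuickstartSketch {
      public static void main(String[] args) {
        SparkSession spark =
            SparkSession.builder()
                .appName("polaris-hudi-quickstart")
                // Hudi's SQL extension, shipped in the hudi-spark bundle.
                .config("spark.sql.extensions",
                    "org.apache.spark.sql.hudi.HoodieSparkSessionExtension")
                // Register the Polaris Spark catalog under an arbitrary name.
                .config("spark.sql.catalog.polaris", "org.apache.polaris.spark.SparkCatalog")
                // Connection options below are assumed; adjust to what the plugin documents.
                .config("spark.sql.catalog.polaris.uri", "http://localhost:8181/api/catalog")
                .config("spark.sql.catalog.polaris.warehouse", "quickstart_catalog")
                .getOrCreate();

        spark.sql("CREATE NAMESPACE IF NOT EXISTS polaris.hudi_ns");
        spark.sql(
            "CREATE TABLE polaris.hudi_ns.hudi_tbl (id INT, name STRING) USING hudi "
                + "LOCATION 's3://bucket/path/hudi_tbl'");
        spark.sql("INSERT INTO polaris.hudi_ns.hudi_tbl VALUES (1, 'a')");
        spark.sql("SELECT * FROM polaris.hudi_ns.hudi_tbl").show();
      }
    }
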
##########
plugins/spark/v3.5/integration/build.gradle.kts:
##########
@@ -60,12 +60,51 @@ dependencies {
     exclude("org.apache.logging.log4j", "log4j-core")
     exclude("org.slf4j", "jul-to-slf4j")
   }
+
+  // Add spark-hive for Hudi integration - provides HiveExternalCatalog that Hudi needs
+  testImplementation("org.apache.spark:spark-hive_${scalaVersion}:${spark35Version}") {
+    // exclude log4j dependencies to match spark-sql exclusions
+    exclude("org.apache.logging.log4j", "log4j-slf4j2-impl")
+    exclude("org.apache.logging.log4j", "log4j-1.2-api")
+    exclude("org.apache.logging.log4j", "log4j-core")
+    exclude("org.slf4j", "jul-to-slf4j")
+    // exclude old slf4j 1.x to log4j 2.x bridge that conflicts with slf4j 2.x bridge
+    exclude("org.apache.logging.log4j", "log4j-slf4j-impl")
+  }
 
   // enforce the usage of log4j 2.24.3. This is for the log4j-api compatibility
   // of spark-sql dependency
   testRuntimeOnly("org.apache.logging.log4j:log4j-core:2.24.3")
   testRuntimeOnly("org.apache.logging.log4j:log4j-slf4j2-impl:2.24.3")
 
   testImplementation("io.delta:delta-spark_${scalaVersion}:3.3.1")
+  testImplementation("org.apache.hudi:hudi-spark3.5-bundle_${scalaVersion}:0.15.0") {
+    // exclude log4j dependencies to match spark-sql exclusions and prevent version conflicts

Review Comment:
   Does the bundle already contain all the Spark dependencies needed? If that is the case, we shouldn't need the spark_hive dependency anymore, right?

##########
plugins/spark/v3.5/spark/src/main/java/org/apache/polaris/spark/SparkCatalog.java:
##########
@@ -156,15 +160,23 @@ public Table createTable(
       throw new UnsupportedOperationException(
           "Create table without location key is not supported by Polaris. Please provide location or path on table creation.");
     }
-
     if (PolarisCatalogUtils.useDelta(provider)) {
       // For delta table, we load the delta catalog to help dealing with the
       // delta log creation.
       TableCatalog deltaCatalog = deltaHelper.loadDeltaCatalog(this.polarisSparkCatalog);
       return deltaCatalog.createTable(ident, schema, transforms, properties);
-    } else {
-      return this.polarisSparkCatalog.createTable(ident, schema, transforms, properties);
     }
+    if (PolarisCatalogUtils.useHudi(provider)) {
+      // First make a call via polaris's spark catalog
+      // to ensure an entity is created within the catalog and is authorized
+      polarisSparkCatalog.createTable(ident, schema, transforms, properties);
+
+      // Then for actually creating the hudi table, we load HoodieCatalog
+      // to create the .hoodie folder in cloud storage
+      TableCatalog hudiCatalog = hudiHelper.loadHudiCatalog(this.polarisSparkCatalog);

Review Comment:
   It might be better to first make sure hudiCatalog.createTable succeeds, and then create the table with the remote catalog service (a sketch of that ordering is included at the end of this message).

-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@polaris.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org
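A sketch of the ordering suggested in the last review comment above: let Hudi write the .hoodie metadata first and register the table with the remote Polaris service only once that succeeds. Field and helper names mirror the diff, but the method shape and error handling are illustrative:

    import java.util.Map;
    import org.apache.spark.sql.connector.catalog.Identifier;
    import org.apache.spark.sql.connector.catalog.Table;
    import org.apache.spark.sql.connector.catalog.TableCatalog;
    import org.apache.spark.sql.connector.expressions.Transform;
    import org.apache.spark.sql.types.StructType;

    /** Sketch: storage-first create for Hudi tables, then registration with Polaris. */
    final class HudiCreateTableOrderingSketch {
      private final TableCatalog hudiCatalog;         // Hudi's catalog implementation
      private final TableCatalog polarisSparkCatalog; // remote Polaris catalog

      HudiCreateTableOrderingSketch(TableCatalog hudiCatalog, TableCatalog polarisSparkCatalog) {
        this.hudiCatalog = hudiCatalog;
        this.polarisSparkCatalog = polarisSparkCatalog;
      }

      Table createHudiTable(
          Identifier ident,
          StructType schema,
          Transform[] transforms,
          Map<String, String> properties)
          throws Exception {
        // 1. Let Hudi lay down the .hoodie folder and table metadata in storage.
        Table hudiTable = hudiCatalog.createTable(ident, schema, transforms, properties);
        // 2. Only then record the generic table entity with the Polaris service.
        try {
          polarisSparkCatalog.createTable(ident, schema, transforms, properties);
        } catch (Exception e) {
          // Registration failed: best effort to avoid orphaned storage metadata.
          hudiCatalog.dropTable(ident);
          throw e;
        }
        return hudiTable;
      }
    }

One trade-off to weigh: with this ordering the Polaris authorization check only happens after data has already been written to storage, which appears to be why the PR registers the entity with Polaris first.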