danny0405 commented on code in PR #6013:
URL: https://github.com/apache/hudi/pull/6013#discussion_r916512804
##########
hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/catalog/HoodieCatalogUtil.java:
##########

@@ -0,0 +1,162 @@
+/**
+ * Utilities for Hoodie Catalog.
+ */
+public class HoodieCatalogUtil {
+  private static final Logger LOG = LoggerFactory.getLogger(HoodieCatalogUtil.class);
+
+  /**
+   * Returns a new hiveConfig.
+   *

Review Comment:
   `hiveConfig` -> `{@code HiveConf}`.
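A sketch of the suggested Javadoc fix; the parameter and return lines are taken from the later quote of this same method in the thread:

```java
/**
 * Returns a new {@code HiveConf}.
 *
 * @param hiveConfDir Hive conf directory path.
 * @param hadoopConfDir Hadoop conf directory path.
 * @return A HiveConf instance.
 */
public static HiveConf createHiveConf(@Nullable String hiveConfDir, @Nullable String hadoopConfDir) {
  // ...
}
```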
##########
hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/catalog/HoodieHiveCatalog.java:
##########

@@ -0,0 +1,894 @@
+/**
+ * A catalog implementation for Hoodie based on MetaStore.
+ */
+public class HoodieHiveCatalog extends AbstractCatalog {
+  private static final Logger LOG = LoggerFactory.getLogger(HoodieHiveCatalog.class);
+
+  private final HiveConf hiveConf;
+  private IMetaStoreClient client;
+
+  public HoodieHiveCatalog(String catalogName, String defaultDatabase, String hiveConf, String hadoopConf) {
+    this(catalogName, defaultDatabase, HoodieCatalogUtil.createHiveConf(hiveConf, hadoopConf), false);
+  }
+
+  public HoodieHiveCatalog(String catalogName, String defaultDatabase, HiveConf hiveConf, boolean allowEmbedded) {
+    super(catalogName, defaultDatabase == null ? DEFAULT_DB : defaultDatabase);
+    this.hiveConf = hiveConf;
+    if (!allowEmbedded) {
+      checkArgument(
+          !HoodieCatalogUtil.isEmbeddedMetastore(this.hiveConf),
+          "Embedded metastore is not allowed. Make sure you have set a valid value for "
+              + HiveConf.ConfVars.METASTOREURIS.toString());
+    }
+    LOG.info("Created HiveCatalog '{}'", catalogName);
+  }

Review Comment:
   HiveCatalog -> hoodie catalog in hms mode
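The requested change is a one-line rewording of the log statement; the exact phrasing below is a guess, grounded only in the comment's "hoodie catalog in hms mode":

```java
// Hypothetical wording; any message that names the hoodie catalog (hms mode)
// rather than Flink's HiveCatalog would satisfy the comment.
LOG.info("Created the hoodie catalog '{}' in hms mode", catalogName);
```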
##########
hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/catalog/HoodieHiveCatalog.java:
##########

@@ -0,0 +1,894 @@
+  @VisibleForTesting
+  public Table getHiveTable(ObjectPath tablePath) throws TableNotExistException {
+    try {
+      Table hiveTable = client.getTable(tablePath.getDatabaseName(), tablePath.getObjectName());
+      return checkHoodieTable(hiveTable);
+    } catch (NoSuchObjectException e) {
+      throw new TableNotExistException(getName(), tablePath);
+    } catch (TException e) {
+      throw new HoodieCatalogException(String.format("Failed to get table %s from Hive metastore", tablePath.getObjectName()));
+    }
+  }
+
+  private Table translateSparkTable2Flink(ObjectPath tablePath, Table hiveTable) {
+    if (!isFlinkTable(hiveTable)) {
+      try {
+        Map<String, String> parameters = hiveTable.getParameters();
+        parameters.putAll(TableOptionProperties.translateSparkTableProperties2Flink(hiveTable));
+        String path = hiveTable.getSd().getLocation();

Review Comment:
   What if the table is not a hudi table at all? Do we still need the translation?
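One way to address the question would be to guard the translation on the Spark provider marker instead of only `!isFlinkTable(...)`. A sketch under that assumption; `isHoodieSparkTable` is a hypothetical helper mirroring the existing `SPARK_SOURCE_PROVIDER` check in `checkHoodieTable`:

```java
private boolean isHoodieSparkTable(Table hiveTable) {
  // Hypothetical helper: same check as in checkHoodieTable.
  return hiveTable.getParameters()
      .getOrDefault(SPARK_SOURCE_PROVIDER, "")
      .equalsIgnoreCase("hudi");
}

private Table translateSparkTable2Flink(ObjectPath tablePath, Table hiveTable) {
  // Only translate tables written by the Spark hudi datasource; plain Hive
  // tables (neither Flink-hudi nor Spark-hudi) are returned untouched.
  if (isFlinkTable(hiveTable) || !isHoodieSparkTable(hiveTable)) {
    return hiveTable;
  }
  // ... existing translation logic ...
  return hiveTable;
}
```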
##########
hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/catalog/HoodieHiveCatalog.java:
##########

@@ -0,0 +1,894 @@
+  @Override
+  public CatalogBaseTable getTable(ObjectPath tablePath) throws TableNotExistException, CatalogException {
+    checkNotNull(tablePath, "tablePath cannot be null");
+    Table hiveTable = getHiveTable(tablePath);
+    hiveTable = translateSparkTable2Flink(tablePath, hiveTable);
+    String path = hiveTable.getSd().getLocation();
+    Map<String, String> parameters = hiveTable.getParameters();
+    Schema latestTableSchema = getLatestTableSchema(path);
+    org.apache.flink.table.api.Schema schema;
+    if (latestTableSchema != null) {
+      org.apache.flink.table.api.Schema.Builder builder = org.apache.flink.table.api.Schema.newBuilder()
+          .fromRowDataType(AvroSchemaConverter.convertToDataType(latestTableSchema));
+      String pkConstraintName = parameters.get(PK_CONSTRAINT_NAME);
+      if (!StringUtils.isNullOrEmpty(pkConstraintName)) {
+        builder.primaryKeyNamed(pkConstraintName, StringUtils.split(parameters.get(PK_COLUMNS), ","));
+      }
+      schema = builder.build();
+    } else {
+      LOG.warn("{} does not have any hoodie schema, and use hive table to covert the catalogBaseTable", tablePath);
+      schema = TableOptionProperties.convertTableSchema(hiveTable);

Review Comment:
   use hive table to covert the catalogBaseTable -> use hive table schema to infer the table schema instead
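Applying the suggested wording is again a single-line change; a sketch:

```java
// Reworded per the review comment; the rest of the fallback branch is unchanged.
LOG.warn("{} does not have any hoodie schema; using the hive table schema to infer the table schema instead", tablePath);
schema = TableOptionProperties.convertTableSchema(hiveTable);
```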
##########
hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/catalog/HoodieHiveCatalog.java:
##########

@@ -0,0 +1,894 @@
+  private Table translateSparkTable2Flink(ObjectPath tablePath, Table hiveTable) {
+    if (!isFlinkTable(hiveTable)) {
+      try {
+        Map<String, String> parameters = hiveTable.getParameters();
+        parameters.putAll(TableOptionProperties.translateSparkTableProperties2Flink(hiveTable));
+        String path = hiveTable.getSd().getLocation();
+        parameters.put(PATH.key(), path);
+        if (!parameters.containsKey(FlinkOptions.HIVE_STYLE_PARTITIONING.key())) {
+          Path hoodieTablePath = new Path(path);
+          boolean hiveStyle = Arrays.stream(FSUtils.getFs(hoodieTablePath, hiveConf).listStatus(hoodieTablePath))
+              .map(fileStatus -> fileStatus.getPath().getName())
+              .filter(f -> !f.equals(".hoodie") && !f.equals("default"))
+              .anyMatch(FilePathUtils::hiveStylePartitionMath);

Review Comment:
   hiveStylePartitionMath -> did you mean isHiveStylePartitioning?
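This comment and the later one on `FilePathUtils` flag the same typo ("Math" for "Match"); the rename would look like the sketch below, with the body unchanged:

```java
// In FilePathUtils: renamed predicate.
public static boolean isHiveStylePartitioning(String path) {
  return HIVE_PARTITION_NAME_PATTERN.matcher(path).matches();
}
```

with the call site above becoming `.anyMatch(FilePathUtils::isHiveStylePartitioning)`.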
+ * @param hadoopConfDir Hadoop conf directory path. + * @return A HiveConf instance. + */ + public static HiveConf createHiveConf(@Nullable String hiveConfDir, @Nullable String hadoopConfDir) { + // create HiveConf from hadoop configuration with hadoop conf directory configured. + Configuration hadoopConf = null; + if (isNullOrWhitespaceOnly(hadoopConfDir)) { + for (String possibleHadoopConfPath : + HadoopUtils.possibleHadoopConfPaths( + new org.apache.flink.configuration.Configuration())) { + hadoopConf = getHadoopConfiguration(possibleHadoopConfPath); + if (hadoopConf != null) { + break; + } + } + } else { + hadoopConf = getHadoopConfiguration(hadoopConfDir); + if (hadoopConf == null) { + String possibleUsedConfFiles = + "core-site.xml | hdfs-site.xml | yarn-site.xml | mapred-site.xml"; + throw new CatalogException( + "Failed to load the hadoop conf from specified path:" + hadoopConfDir, + new FileNotFoundException( + "Please check the path; none of the conf files (" + + possibleUsedConfFiles + + ") exist in the folder.")); + } + } + if (hadoopConf == null) { + hadoopConf = new Configuration(); + } + // ignore all the static conf file URLs that HiveConf may have set + HiveConf.setHiveSiteLocation(null); + HiveConf.setLoadMetastoreConfig(false); + HiveConf.setLoadHiveServer2Config(false); + HiveConf hiveConf = new HiveConf(hadoopConf, HiveConf.class); + + LOG.info("Setting hive conf dir as {}", hiveConfDir); + + if (hiveConfDir != null) { + Path hiveSite = new Path(hiveConfDir, HIVE_SITE_FILE); + if (!hiveSite.toUri().isAbsolute()) { + // treat relative URI as local file to be compatible with previous behavior + hiveSite = new Path(new File(hiveSite.toString()).toURI()); + } + try (InputStream inputStream = hiveSite.getFileSystem(hadoopConf).open(hiveSite)) { + hiveConf.addResource(inputStream, hiveSite.toString()); + // trigger a read from the conf so that the input stream is read + isEmbeddedMetastore(hiveConf); + } catch (IOException e) { + throw new CatalogException( + "Failed to load hive-site.xml from specified path:" + hiveSite, e); + } + } else { + // user doesn't provide hive conf dir, so we try to find hive-site.xml in classpath + URL hiveSite = + Thread.currentThread().getContextClassLoader().getResource(HIVE_SITE_FILE); + if (hiveSite != null) { + LOG.info("Found {} in classpath: {}", HIVE_SITE_FILE, hiveSite); + hiveConf.addResource(hiveSite); + } + } + return hiveConf; + } + + /** + * Checks whether hive.metastore.uris is empty. + */ + public static boolean isEmbeddedMetastore(HiveConf hiveConf) { + return isNullOrWhitespaceOnly(hiveConf.getVar(HiveConf.ConfVars.METASTOREURIS)); + } + + /** + * Returns a new Hadoop Configuration object using the path to the hadoop conf configured. + * + * @param hadoopConfDir Hadoop conf directory path. + * @return A Hadoop configuration instance. + */ + public static Configuration getHadoopConfiguration(String hadoopConfDir) { + if (new File(hadoopConfDir).exists()) { + List<File> possibleConfFiles = new ArrayList<File>(); + File coreSite = new File(hadoopConfDir, "core-site.xml"); + if (coreSite.exists()) { + possibleConfFiles.add(coreSite); + } + File hdfsSite = new File(hadoopConfDir, "hdfs-site.xml"); + if (hdfsSite.exists()) { + possibleConfFiles.add(hdfsSite); + } + File yarnSite = new File(hadoopConfDir, "yarn-site.xml"); + if (yarnSite.exists()) { + possibleConfFiles.add(yarnSite); + } + // Add mapred-site.xml. We need to read configurations like compression codec.
Review Comment: Did you know that we already have a utility class for hive/hadoop configurations in `HadoopConfigurations`? ########## hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/format/FilePathUtils.java: ########## @@ -442,4 +442,8 @@ public static String[] extractHivePartitionFields(org.apache.flink.configuration } return conf.getString(FlinkOptions.HIVE_SYNC_PARTITION_FIELDS).split(","); } + + public static boolean hiveStylePartitionMath(String path) { + return HIVE_PARTITION_NAME_PATTERN.matcher(path).matches(); Review Comment: hiveStylePartitionMath -> isHiveStylePartitioning ########## hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/table/catalog/TestHoodieHiveCatalog.java: ########## @@ -0,0 +1,147 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.catalog; + +import org.apache.hudi.configuration.FlinkOptions; +import org.apache.hudi.exception.HoodieCatalogException; + +import org.apache.flink.table.api.DataTypes; +import org.apache.flink.table.api.TableSchema; +import org.apache.flink.table.catalog.CatalogBaseTable; +import org.apache.flink.table.catalog.CatalogTable; +import org.apache.flink.table.catalog.CatalogTableImpl; +import org.apache.flink.table.catalog.ObjectPath; +import org.apache.flink.table.catalog.exceptions.DatabaseNotExistException; +import org.apache.flink.table.catalog.exceptions.TableAlreadyExistException; +import org.apache.flink.table.catalog.exceptions.TableNotExistException; +import org.apache.flink.table.factories.FactoryUtil; +import org.apache.hadoop.hive.metastore.api.Table; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; + +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; + +import static org.apache.flink.table.factories.FactoryUtil.CONNECTOR; +import static org.junit.jupiter.api.Assertions.assertEquals; + +/** + * Test cases for {@link HoodieHiveCatalog}.
+ */ +public class TestHoodieHiveCatalog { + TableSchema schema = + TableSchema.builder() + .field("name", DataTypes.STRING()) + .field("age", DataTypes.INT()) + .build(); + private static HoodieHiveCatalog hoodieCatalog; + private final ObjectPath tablePath = new ObjectPath("default", "test"); + + @BeforeAll + public static void createCatalog() { + hoodieCatalog = TestHoodieCatalogUtils.createHiveCatalog(); + hoodieCatalog.open(); + } + + @AfterEach + public void dropTable() throws TableNotExistException { + hoodieCatalog.dropTable(tablePath, true); + } + + @AfterAll + public static void closeCatalog() { + if (hoodieCatalog != null) { + hoodieCatalog.close(); + } + } + + @Test + public void testCreateAndGetHoodieMORTable() throws Exception { + Map<String, String> originOptions = new HashMap<>(); + originOptions.put(FactoryUtil.CONNECTOR.key(), "hudi"); + originOptions.put(FlinkOptions.TABLE_TYPE.key(), FlinkOptions.TABLE_TYPE_MERGE_ON_READ); + CatalogTable table = + new CatalogTableImpl(schema, originOptions, "hudi table"); + hoodieCatalog.createTable(tablePath, table, false); + CatalogBaseTable table1 = hoodieCatalog.getTable(tablePath); + assertEquals(table1.getOptions().get(CONNECTOR.key()), "hudi"); + assertEquals(table1.getOptions().get(FlinkOptions.TABLE_TYPE.key()), FlinkOptions.TABLE_TYPE_MERGE_ON_READ); + } + + @Test + public void testCreateAndGetHoodieTable() throws Exception { + Map<String, String> originOptions = + Collections.singletonMap(FactoryUtil.CONNECTOR.key(), "hudi"); + CatalogTable table = + new CatalogTableImpl(schema, originOptions, "hudi table"); + hoodieCatalog.createTable(tablePath, table, false); + CatalogBaseTable table1 = hoodieCatalog.getTable(tablePath); + assertEquals(table1.getOptions().get(CONNECTOR.key()), "hudi"); + } + + @Test + public void testCreateNonHoodieTable() throws TableAlreadyExistException, DatabaseNotExistException { + CatalogTable table = + new CatalogTableImpl(schema, Collections.emptyMap(), "hudi table"); + try { + hoodieCatalog.createTable(tablePath, table, false); Review Comment: We also need to add some test cases in `ITTestHoodieDataSource` to use the catalog to read and write. ########## hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/catalog/HoodieHiveCatalog.java: ########## @@ -0,0 +1,894 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.table.catalog; + +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.HoodieFileFormat; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.TableSchemaResolver; +import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.configuration.FlinkOptions; +import org.apache.hudi.exception.HoodieCatalogException; +import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils; +import org.apache.hudi.sync.common.util.ConfigUtils; +import org.apache.hudi.table.format.FilePathUtils; +import org.apache.hudi.util.AvroSchemaConverter; +import org.apache.hudi.util.StreamerUtil; + +import org.apache.avro.Schema; +import org.apache.flink.annotation.VisibleForTesting; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.sql.parser.hive.ddl.SqlAlterHiveDatabase; +import org.apache.flink.sql.parser.hive.ddl.SqlAlterHiveDatabaseOwner; +import org.apache.flink.sql.parser.hive.ddl.SqlCreateHiveDatabase; +import org.apache.flink.table.catalog.AbstractCatalog; +import org.apache.flink.table.catalog.CatalogBaseTable; +import org.apache.flink.table.catalog.CatalogDatabase; +import org.apache.flink.table.catalog.CatalogDatabaseImpl; +import org.apache.flink.table.catalog.CatalogFunction; +import org.apache.flink.table.catalog.CatalogPartition; +import org.apache.flink.table.catalog.CatalogPartitionSpec; +import org.apache.flink.table.catalog.CatalogPropertiesUtil; +import org.apache.flink.table.catalog.CatalogTable; +import org.apache.flink.table.catalog.CatalogView; +import org.apache.flink.table.catalog.ObjectPath; +import org.apache.flink.table.catalog.exceptions.CatalogException; +import org.apache.flink.table.catalog.exceptions.DatabaseAlreadyExistException; +import org.apache.flink.table.catalog.exceptions.DatabaseNotEmptyException; +import org.apache.flink.table.catalog.exceptions.DatabaseNotExistException; +import org.apache.flink.table.catalog.exceptions.FunctionAlreadyExistException; +import org.apache.flink.table.catalog.exceptions.FunctionNotExistException; +import org.apache.flink.table.catalog.exceptions.PartitionAlreadyExistsException; +import org.apache.flink.table.catalog.exceptions.PartitionNotExistException; +import org.apache.flink.table.catalog.exceptions.PartitionSpecInvalidException; +import org.apache.flink.table.catalog.exceptions.TableAlreadyExistException; +import org.apache.flink.table.catalog.exceptions.TableNotExistException; +import org.apache.flink.table.catalog.exceptions.TableNotPartitionedException; +import org.apache.flink.table.catalog.exceptions.TablePartitionedException; +import org.apache.flink.table.catalog.stats.CatalogColumnStatistics; +import org.apache.flink.table.catalog.stats.CatalogTableStatistics; +import org.apache.flink.table.expressions.Expression; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.metastore.IMetaStoreClient; +import org.apache.hadoop.hive.metastore.api.AlreadyExistsException; +import org.apache.hadoop.hive.metastore.api.Database; +import org.apache.hadoop.hive.metastore.api.FieldSchema; +import org.apache.hadoop.hive.metastore.api.InvalidOperationException; +import org.apache.hadoop.hive.metastore.api.NoSuchObjectException; +import org.apache.hadoop.hive.metastore.api.PrincipalType; +import org.apache.hadoop.hive.metastore.api.SerDeInfo; +import org.apache.hadoop.hive.metastore.api.StorageDescriptor; +import org.apache.hadoop.hive.metastore.api.Table; 
+import org.apache.hadoop.hive.metastore.api.UnknownDBException; +import org.apache.hadoop.hive.ql.metadata.Hive; +import org.apache.hadoop.security.UserGroupInformation; +import org.apache.thrift.TException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; + +import static org.apache.flink.sql.parser.hive.ddl.SqlAlterHiveDatabase.ALTER_DATABASE_OP; +import static org.apache.flink.sql.parser.hive.ddl.SqlAlterHiveDatabaseOwner.DATABASE_OWNER_NAME; +import static org.apache.flink.sql.parser.hive.ddl.SqlAlterHiveDatabaseOwner.DATABASE_OWNER_TYPE; +import static org.apache.flink.table.factories.FactoryUtil.CONNECTOR; +import static org.apache.flink.util.Preconditions.checkArgument; +import static org.apache.flink.util.Preconditions.checkNotNull; +import static org.apache.flink.util.StringUtils.isNullOrWhitespaceOnly; +import static org.apache.hudi.configuration.FlinkOptions.PATH; +import static org.apache.hudi.table.catalog.HoodieCatalogFactoryOptions.DEFAULT_DB; +import static org.apache.hudi.table.catalog.TableOptionProperties.COMMENT; +import static org.apache.hudi.table.catalog.TableOptionProperties.PK_COLUMNS; +import static org.apache.hudi.table.catalog.TableOptionProperties.PK_CONSTRAINT_NAME; +import static org.apache.hudi.table.catalog.TableOptionProperties.SPARK_SOURCE_PROVIDER; + +/** + * A catalog implementation for Hoodie based on MetaStore. + */ +public class HoodieHiveCatalog extends AbstractCatalog { + private static final Logger LOG = LoggerFactory.getLogger(HoodieHiveCatalog.class); + + private final HiveConf hiveConf; + private IMetaStoreClient client; + + public HoodieHiveCatalog(String catalogName, String defaultDatabase, String hiveConf, String hadoopConf) { + this(catalogName, defaultDatabase, HoodieCatalogUtil.createHiveConf(hiveConf, hadoopConf), false); + } + + public HoodieHiveCatalog(String catalogName, String defaultDatabase, HiveConf hiveConf, boolean allowEmbedded) { + super(catalogName, defaultDatabase == null ? DEFAULT_DB : defaultDatabase); + this.hiveConf = hiveConf; + if (!allowEmbedded) { + checkArgument( + !HoodieCatalogUtil.isEmbeddedMetastore(this.hiveConf), + "Embedded metastore is not allowed. 
Make sure you have set a valid value for " + + HiveConf.ConfVars.METASTOREURIS.toString()); + } + LOG.info("Created HiveCatalog '{}'", catalogName); + } + + @Override + public void open() throws CatalogException { + if (this.client == null) { + try { + this.client = Hive.get(hiveConf).getMSC(); + } catch (Exception e) { + throw new HoodieCatalogException("Failed to create HiveMetaStoreClient", e); + } + LOG.info("Connected to Hive metastore"); + } + if (!databaseExists(getDefaultDatabase())) { + throw new HoodieCatalogException( + String.format( + "Configured default database %s doesn't exist in catalog %s.", + getDefaultDatabase(), getName())); + } + } + + @Override + public void close() throws CatalogException { + if (client != null) { + client.close(); + client = null; + LOG.info("Close connection to Hive metastore"); + } + } + + public HiveConf getHiveConf() { + return hiveConf; + } + + // ------ databases ------ + + @Override + public List<String> listDatabases() throws CatalogException { + try { + return client.getAllDatabases(); + } catch (TException e) { + throw new HoodieCatalogException( + String.format("Failed to list all databases in %s", getName()), e); + } + } + + public Database getHiveDatabase(String databaseName) throws DatabaseNotExistException { + try { + return client.getDatabase(databaseName); + } catch (NoSuchObjectException e) { + throw new DatabaseNotExistException(getName(), databaseName); + } catch (TException e) { + throw new HoodieCatalogException( + String.format("Failed to get database %s from %s", databaseName, getName()), e); + } + } + + @Override + public CatalogDatabase getDatabase(String databaseName) + throws DatabaseNotExistException, CatalogException { + Database hiveDatabase = getHiveDatabase(databaseName); + + Map<String, String> properties = new HashMap<>(hiveDatabase.getParameters()); + + properties.put(SqlCreateHiveDatabase.DATABASE_LOCATION_URI, hiveDatabase.getLocationUri()); + + return new CatalogDatabaseImpl(properties, hiveDatabase.getDescription()); + } + + @Override + public boolean databaseExists(String databaseName) throws CatalogException { + try { + return client.getDatabase(databaseName) != null; + } catch (NoSuchObjectException e) { + return false; + } catch (TException e) { + throw new HoodieCatalogException( + String.format( + "Failed to determine whether database %s exists or not", databaseName), + e); + } + } + + @Override + public void createDatabase( + String databaseName, CatalogDatabase database, boolean ignoreIfExists) + throws DatabaseAlreadyExistException, CatalogException { + checkArgument( + !isNullOrWhitespaceOnly(databaseName), "databaseName cannot be null or empty"); + checkNotNull(database, "database cannot be null"); + + Map<String, String> properties = database.getProperties(); + + String dbLocationUri = properties.remove(SqlCreateHiveDatabase.DATABASE_LOCATION_URI); + + Database hiveDatabase = + new Database(databaseName, database.getComment(), dbLocationUri, properties); + + try { + client.createDatabase(hiveDatabase); + } catch (AlreadyExistsException e) { + if (!ignoreIfExists) { + throw new DatabaseAlreadyExistException(getName(), hiveDatabase.getName()); + } + } catch (TException e) { + throw new HoodieCatalogException( + String.format("Failed to create database %s", hiveDatabase.getName()), e); + } + } + + @Override + public void dropDatabase(String name, boolean ignoreIfNotExists, boolean cascade) + throws DatabaseNotExistException, DatabaseNotEmptyException, CatalogException { + try { + 
client.dropDatabase(name, true, ignoreIfNotExists, cascade); + } catch (NoSuchObjectException e) { + if (!ignoreIfNotExists) { + throw new DatabaseNotExistException(getName(), name); + } + } catch (InvalidOperationException e) { + throw new DatabaseNotEmptyException(getName(), name); + } catch (TException e) { + throw new HoodieCatalogException(String.format("Failed to drop database %s", name), e); + } + } + + @Override + public void alterDatabase( + String databaseName, CatalogDatabase newDatabase, boolean ignoreIfNotExists) + throws DatabaseNotExistException, CatalogException { + checkArgument( + !isNullOrWhitespaceOnly(databaseName), "databaseName cannot be null or empty"); + checkNotNull(newDatabase, "newDatabase cannot be null"); + + // client.alterDatabase doesn't throw any exception if there is no existing database + Database hiveDB; + try { + hiveDB = getHiveDatabase(databaseName); + } catch (DatabaseNotExistException e) { + if (!ignoreIfNotExists) { + throw new DatabaseNotExistException(getName(), databaseName); + } + + return; + } + + try { + client.alterDatabase(databaseName, alterDatabase(hiveDB, newDatabase)); + } catch (TException e) { + throw new HoodieCatalogException( + String.format("Failed to alter database %s", databaseName), e); + } + } + + private static Database alterDatabase(Database hiveDB, CatalogDatabase newDatabase) { + Map<String, String> newParams = newDatabase.getProperties(); + String opStr = newParams.remove(ALTER_DATABASE_OP); + if (opStr == null) { + // by default is to alter db properties + opStr = SqlAlterHiveDatabase.AlterHiveDatabaseOp.CHANGE_PROPS.name(); + } + String newLocation = newParams.remove(SqlCreateHiveDatabase.DATABASE_LOCATION_URI); + SqlAlterHiveDatabase.AlterHiveDatabaseOp op = + SqlAlterHiveDatabase.AlterHiveDatabaseOp.valueOf(opStr); + switch (op) { + case CHANGE_PROPS: + hiveDB.setParameters(newParams); + break; + case CHANGE_LOCATION: + hiveDB.setLocationUri(newLocation); + break; + case CHANGE_OWNER: + String ownerName = newParams.remove(DATABASE_OWNER_NAME); + String ownerType = newParams.remove(DATABASE_OWNER_TYPE); + hiveDB.setOwnerName(ownerName); + switch (ownerType) { + case SqlAlterHiveDatabaseOwner.ROLE_OWNER: + hiveDB.setOwnerType(PrincipalType.ROLE); + break; + case SqlAlterHiveDatabaseOwner.USER_OWNER: + hiveDB.setOwnerType(PrincipalType.USER); + break; + default: + throw new CatalogException("Unsupported database owner type: " + ownerType); + } + break; + default: + throw new CatalogException("Unsupported alter database op:" + opStr); + } + // is_generic is deprecated, remove it + if (hiveDB.getParameters() != null) { + hiveDB.getParameters().remove(CatalogPropertiesUtil.IS_GENERIC); + } + return hiveDB; + } + + // ------ tables ------ + + private Table checkHoodieTable(Table hiveTable) { + if (!hiveTable.getParameters().getOrDefault(CONNECTOR.key(), "").equalsIgnoreCase("hudi") + && !hiveTable.getParameters().getOrDefault(SPARK_SOURCE_PROVIDER, "").equalsIgnoreCase("hudi")) { + throw new HoodieCatalogException(String.format("the %s is not hoodie table", hiveTable.getTableName())); + } + return hiveTable; + } + + private boolean isFlinkTable(Table hiveTable) { + return hiveTable.getParameters().getOrDefault(CONNECTOR.key(), "").equalsIgnoreCase("hudi"); + } + + private org.apache.avro.Schema getLatestTableSchema(String path) { + if (StringUtils.isNullOrEmpty(path) || !StreamerUtil.tableExists(path, hiveConf)) { + return null; + } + + try { + HoodieTableMetaClient metaClient = StreamerUtil.createMetaClient(path, 
hiveConf); + return new TableSchemaResolver(metaClient).getTableAvroSchema(false); + } catch (Exception e) { + LOG.warn("Error while resolving the latest table schema", e); + } + return null; + } + + @VisibleForTesting + public Table getHiveTable(ObjectPath tablePath) throws TableNotExistException { + try { + Table hiveTable = client.getTable(tablePath.getDatabaseName(), tablePath.getObjectName()); + return checkHoodieTable(hiveTable); + } catch (NoSuchObjectException e) { + throw new TableNotExistException(getName(), tablePath); + } catch (TException e) { + throw new HoodieCatalogException(String.format("Failed to get table %s from Hive metastore", tablePath.getObjectName())); + } + } + + private Table translateSparkTable2Flink(ObjectPath tablePath, Table hiveTable) { + if (!isFlinkTable(hiveTable)) { + try { + Map<String, String> parameters = hiveTable.getParameters(); + parameters.putAll(TableOptionProperties.translateSparkTableProperties2Flink(hiveTable)); + String path = hiveTable.getSd().getLocation(); + parameters.put(PATH.key(), path); + if (!parameters.containsKey(FlinkOptions.HIVE_STYLE_PARTITIONING.key())) { + Path hoodieTablePath = new Path(path); + boolean hiveStyle = Arrays.stream(FSUtils.getFs(hoodieTablePath, hiveConf).listStatus(hoodieTablePath)) + .map(fileStatus -> fileStatus.getPath().getName()) + .filter(f -> !f.equals(".hoodie") && !f.equals("default")) + .anyMatch(FilePathUtils::hiveStylePartitionMath); + parameters.put(FlinkOptions.HIVE_STYLE_PARTITIONING.key(), String.valueOf(hiveStyle)); + } + client.alter_table(tablePath.getDatabaseName(), tablePath.getObjectName(), hiveTable); + } catch (Exception e) { + throw new HoodieCatalogException("Failed to update table schema", e); + } + } + return hiveTable; + } + + @Override + public CatalogBaseTable getTable(ObjectPath tablePath) throws TableNotExistException, CatalogException { + checkNotNull(tablePath, "tablePath cannot be null"); + Table hiveTable = getHiveTable(tablePath); + hiveTable = translateSparkTable2Flink(tablePath, hiveTable); + String path = hiveTable.getSd().getLocation(); + Map<String, String> parameters = hiveTable.getParameters(); + Schema latestTableSchema = getLatestTableSchema(path); + org.apache.flink.table.api.Schema schema; + if (latestTableSchema != null) { + org.apache.flink.table.api.Schema.Builder builder = org.apache.flink.table.api.Schema.newBuilder() + .fromRowDataType(AvroSchemaConverter.convertToDataType(latestTableSchema)); + String pkConstraintName = parameters.get(PK_CONSTRAINT_NAME); + if (!StringUtils.isNullOrEmpty(pkConstraintName)) { + builder.primaryKeyNamed(pkConstraintName, StringUtils.split(parameters.get(PK_COLUMNS), ",")); + } + schema = builder.build(); + } else { + LOG.warn("{} does not have any hoodie schema, so we use the hive table to convert the catalogBaseTable", tablePath); + schema = TableOptionProperties.convertTableSchema(hiveTable); + } + return CatalogTable.of(schema, parameters.get(COMMENT), + TableOptionProperties.getFieldNames(hiveTable.getPartitionKeys()), parameters); + } + + @Override + public void createTable(ObjectPath tablePath, CatalogBaseTable table, boolean ignoreIfExists) + throws TableAlreadyExistException, DatabaseNotExistException, CatalogException { + checkNotNull(tablePath, "tablePath cannot be null"); + checkNotNull(table, "table cannot be null"); + + if (!databaseExists(tablePath.getDatabaseName())) { + throw new DatabaseNotExistException(getName(), tablePath.getDatabaseName()); + } + + if (!table.getOptions().getOrDefault(CONNECTOR.key(),
"").equalsIgnoreCase("hudi")) { + throw new HoodieCatalogException(String.format("The %s is not hoodie table", tablePath.getObjectName())); + } + + if (table instanceof CatalogView) { + throw new HoodieCatalogException("Hoodie catalog does not support to CREATE VIEW."); + } + + try { + boolean isMorTable = table.getOptions().getOrDefault(FlinkOptions.TABLE_TYPE.key(), + FlinkOptions.TABLE_TYPE.defaultValue()).equalsIgnoreCase(FlinkOptions.TABLE_TYPE_MERGE_ON_READ); Review Comment: `getOrDefault(FlinkOptions.TABLE_TYPE.key(), FlinkOptions.TABLE_TYPE.defaultValue())` -> use get directly is ok if the option already has a default value. ########## hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/table/catalog/TestHoodieHiveCatalog.java: ########## @@ -0,0 +1,147 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.catalog; + +import org.apache.hudi.configuration.FlinkOptions; +import org.apache.hudi.exception.HoodieCatalogException; + +import org.apache.flink.table.api.DataTypes; +import org.apache.flink.table.api.TableSchema; +import org.apache.flink.table.catalog.CatalogBaseTable; +import org.apache.flink.table.catalog.CatalogTable; +import org.apache.flink.table.catalog.CatalogTableImpl; +import org.apache.flink.table.catalog.ObjectPath; +import org.apache.flink.table.catalog.exceptions.DatabaseNotExistException; +import org.apache.flink.table.catalog.exceptions.TableAlreadyExistException; +import org.apache.flink.table.catalog.exceptions.TableNotExistException; +import org.apache.flink.table.factories.FactoryUtil; +import org.apache.hadoop.hive.metastore.api.Table; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; + +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; + +import static org.apache.flink.table.factories.FactoryUtil.CONNECTOR; +import static org.junit.jupiter.api.Assertions.assertEquals; + +/** + * Test cases for {@link HoodieHiveCatalog}. 
+ */ +public class TestHoodieHiveCatalog { + TableSchema schema = + TableSchema.builder() + .field("name", DataTypes.STRING()) + .field("age", DataTypes.INT()) + .build(); + private static HoodieHiveCatalog hoodieCatalog; + private final ObjectPath tablePath = new ObjectPath("default", "test"); + + @BeforeAll + public static void createCatalog() { + hoodieCatalog = TestHoodieCatalogUtils.createHiveCatalog(); + hoodieCatalog.open(); + } + + @AfterEach + public void dropTable() throws TableNotExistException { + hoodieCatalog.dropTable(tablePath, true); + } + + @AfterAll + public static void closeCatalog() { + if (hoodieCatalog != null) { + hoodieCatalog.close(); + } + } + + @Test + public void testCreateAndGetHoodieMORTable() throws Exception { + Map<String, String> originOptions = new HashMap<>(); + originOptions.put(FactoryUtil.CONNECTOR.key(), "hudi"); + originOptions.put(FlinkOptions.TABLE_TYPE.key(), FlinkOptions.TABLE_TYPE_MERGE_ON_READ); + CatalogTable table = + new CatalogTableImpl(schema, originOptions, "hudi table"); + hoodieCatalog.createTable(tablePath, table, false); + CatalogBaseTable table1 = hoodieCatalog.getTable(tablePath); + assertEquals(table1.getOptions().get(CONNECTOR.key()), "hudi"); + assertEquals(table1.getOptions().get(FlinkOptions.TABLE_TYPE.key()), FlinkOptions.TABLE_TYPE_MERGE_ON_READ); + } + + @Test + public void testCreateAndGetHoodieTable() throws Exception { + Map<String, String> originOptions = + Collections.singletonMap(FactoryUtil.CONNECTOR.key(), "hudi"); + CatalogTable table = + new CatalogTableImpl(schema, originOptions, "hudi table"); + hoodieCatalog.createTable(tablePath, table, false); + CatalogBaseTable table1 = hoodieCatalog.getTable(tablePath); + assertEquals(table1.getOptions().get(CONNECTOR.key()), "hudi"); Review Comment: We may need to inspect key options like partition keys, primary keys, preCombine keys, and custom keys.
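To make the comment above concrete, here is a minimal sketch of such assertions, assuming the catalog round-trips options verbatim through the metastore; the choice of `FlinkOptions.RECORD_KEY_FIELD` and `FlinkOptions.PRECOMBINE_FIELD` as the keys to verify is illustrative, not part of the PR:

  @Test
  public void testOptionsRoundTrip() throws Exception {
    Map<String, String> originOptions = new HashMap<>();
    originOptions.put(FactoryUtil.CONNECTOR.key(), "hudi");
    // Hypothetical option values; the point is that every key option written
    // through createTable should be readable back through getTable.
    originOptions.put(FlinkOptions.RECORD_KEY_FIELD.key(), "name");
    originOptions.put(FlinkOptions.PRECOMBINE_FIELD.key(), "age");
    hoodieCatalog.createTable(tablePath, new CatalogTableImpl(schema, originOptions, "hudi table"), false);

    Map<String, String> readOptions = hoodieCatalog.getTable(tablePath).getOptions();
    assertEquals("name", readOptions.get(FlinkOptions.RECORD_KEY_FIELD.key()));
    assertEquals("age", readOptions.get(FlinkOptions.PRECOMBINE_FIELD.key()));
  }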

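On the `hiveStylePartitionMath` -> `isHiveStylePartitioning` rename suggested above, a self-contained sketch of the renamed helper for reference; the regex is an assumed stand-in for the PR's `HIVE_PARTITION_NAME_PATTERN`, which recognizes hive-style partition directory names of the form `key=value`:

  import java.util.regex.Pattern;

  public final class HiveStylePartitionCheck {
    // Assumed pattern: a single path segment of the form "key=value",
    // e.g. "dt=2022-07-08"; the PR's actual pattern may differ.
    private static final Pattern HIVE_PARTITION_NAME_PATTERN =
        Pattern.compile("[^/]+=[^/]+");

    // Renamed per the review suggestion.
    public static boolean isHiveStylePartitioning(String path) {
      return HIVE_PARTITION_NAME_PATTERN.matcher(path).matches();
    }
  }

For example, isHiveStylePartitioning("dt=2022-07-08") returns true, while a non-hive-style partition directory name such as "20220708" returns false.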