xushiyan commented on a change in pull request #4175:
URL: https://github.com/apache/hudi/pull/4175#discussion_r824606045



##########
File path: 
hudi-sync/hudi-hive-sync/src/test/java/org/apache/hudi/hive/TestHiveSyncTool.java
##########
@@ -1115,31 +1036,40 @@ public void testTypeConverter(String syncMode) throws 
Exception {
   @ParameterizedTest
   @MethodSource("syncMode")
   public void testSyncWithoutDiffs(String syncMode) throws Exception {
-    hiveSyncConfig.syncMode = syncMode;
-    hiveSyncConfig.isConditionalSync = true;
-    HiveTestUtil.hiveSyncConfig.batchSyncNum = 2;
-    String tableName = HiveTestUtil.hiveSyncConfig.tableName + 
HiveSyncTool.SUFFIX_SNAPSHOT_TABLE;
+    String tableName = HiveTestUtil.TABLE_NAME + 
HiveSyncTool.SUFFIX_SNAPSHOT_TABLE;
+    hiveSyncProps.setProperty(HiveSyncConfig.HIVE_SYNC_MODE.key(), syncMode);
+    hiveSyncProps.setProperty(HiveSyncConfig.META_SYNC_CONDITIONAL_SYNC.key(), 
"true");
 
     String commitTime0 = "100";
     String commitTime1 = "101";
     String commitTime2 = "102";
     HiveTestUtil.createMORTable(commitTime0, commitTime1, 2, true, true);
 
-    HoodieHiveClient hiveClient =
-        new HoodieHiveClient(HiveTestUtil.hiveSyncConfig, 
HiveTestUtil.getHiveConf(), HiveTestUtil.fileSystem);
-
-    HiveSyncTool tool = new HiveSyncTool(HiveTestUtil.hiveSyncConfig, 
HiveTestUtil.getHiveConf(), HiveTestUtil.fileSystem);
-    tool.syncHoodieTable();
+    reinitHiveSyncClient();
+    reSyncHiveTable();
 
     assertTrue(hiveClient.doesTableExist(tableName));
     assertEquals(commitTime1, 
hiveClient.getLastCommitTimeSynced(tableName).get());
 
     HiveTestUtil.addMORPartitions(0, true, true, true, 
ZonedDateTime.now().plusDays(2), commitTime1, commitTime2);
 
-    tool = new HiveSyncTool(HiveTestUtil.hiveSyncConfig, 
HiveTestUtil.getHiveConf(), HiveTestUtil.fileSystem);
-    tool.syncHoodieTable();
-    hiveClient = new HoodieHiveClient(HiveTestUtil.hiveSyncConfig, 
HiveTestUtil.getHiveConf(), HiveTestUtil.fileSystem);
+    reSyncHiveTable();
     assertEquals(commitTime1, 
hiveClient.getLastCommitTimeSynced(tableName).get());
   }
 
+  private void reSyncHiveTable() {
+    hiveSyncTool.syncHoodieTable();
+    // we need renew the hiveclient after tool.syncHoodieTable(), because it 
will close hive
+    // session, then lead to connection retry, we can see there is a exception 
at log.
+    reinitHiveSyncClient();
+  }
+
+  private void reinitHiveSyncClient() {
+    hiveSyncTool = new HiveSyncTool(hiveSyncProps, HiveTestUtil.getHiveConf(), 
fileSystem);
+    hiveClient = hiveSyncTool.hoodieHiveClient;
+  }
+
+  private int getPartitionFieldSize() {
+    return 
hiveSyncProps.getString(HiveSyncConfig.META_SYNC_PARTITION_FIELDS.key()).split(",").length;
+  }

Review comment:
       optional but i'd prefer to have static helpers for good portability.

##########
File path: 
hudi-sync/hudi-dla-sync/src/main/java/org/apache/hudi/dla/DLASyncTool.java
##########
@@ -205,7 +206,8 @@ public static void main(String[] args) {
       cmd.usage();
       System.exit(1);
     }
-    FileSystem fs = FSUtils.getFs(cfg.basePath, new Configuration());
-    new DLASyncTool(Utils.configToProperties(cfg), fs).syncHoodieTable();
+    Configuration hadoopConf = new Configuration();
+    FileSystem fs = FSUtils.getFs(cfg.basePath, hadoopConf);
+    new DLASyncTool(Utils.configToProperties(cfg), hadoopConf, 
fs).syncHoodieTable();

Review comment:
       `FSUtils.getFS()` alters hadoopConf internally and the caller has no 
idea about it. This looks like passing an empty hadoopConf to it, though the 
conf was updated. So suggest passing `fs.getConf()` to DLASyncTool to avoid the 
confusion.

##########
File path: 
hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/AbstractSyncTool.java
##########
@@ -17,17 +17,31 @@
 
 package org.apache.hudi.sync.common;
 
+import org.apache.hudi.common.config.TypedProperties;
+
+import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 
 import java.util.Properties;
 
+/**
+ * Base class to sync Hudi meta data with Metastores to make
+ * Hudi table queryable through external systems.
+ */
 public abstract class AbstractSyncTool {
-  protected Properties props;
-  protected FileSystem fileSystem;
+  protected final Configuration conf;
+  protected final FileSystem fs;
+  protected TypedProperties props;
 
-  public AbstractSyncTool(Properties props, FileSystem fileSystem) {
+  public AbstractSyncTool(TypedProperties props, Configuration conf, 
FileSystem fs) {
     this.props = props;
-    this.fileSystem = fileSystem;
+    this.conf = conf;
+    this.fs = fs;
+  }
+
+  @Deprecated

Review comment:
       ```suggestion
     /**
      * @deprecated <some notes>
      */
     @Deprecated
   ```

##########
File path: 
hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/util/SyncUtilHelpers.java
##########
@@ -0,0 +1,67 @@
+package org.apache.hudi.sync.common.util;
+
+import org.apache.hudi.common.config.TypedProperties;
+import org.apache.hudi.common.util.ReflectionUtils;
+import org.apache.hudi.exception.HoodieException;
+import org.apache.hudi.sync.common.AbstractSyncTool;
+import org.apache.hudi.sync.common.HoodieSyncConfig;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+
+import java.util.Properties;
+
+/**
+ * Helper class for syncing Hudi commit data with external metastores.
+ */
+public class SyncUtilHelpers {
+
+  /**
+   * Create an instance of an implementation of {@link AbstractSyncTool} that 
will sync all the relevant meta information
+   * with an external metastore such as Hive etc. to ensure Hoodie tables can 
be queried or read via external systems.
+   *
+   * @param metaSyncClass  The class that implements the sync of the metadata.
+   * @param props          property map.
+   * @param hadoopConfig   Hadoop confs.
+   * @param fs             Filesystem used.
+   * @param targetBasePath The target base path that contains the hoodie table.
+   * @param baseFileFormat The file format used by the hoodie table (defaults 
to PARQUET).
+   * @return
+   */
+  public static void createAndSyncHoodieMeta(String metaSyncClass,
+                                             TypedProperties props,
+                                             Configuration hadoopConfig,
+                                             FileSystem fs,
+                                             String targetBasePath,
+                                             String baseFileFormat) {
+    try {
+      createMetaSyncClass(metaSyncClass, props, hadoopConfig, fs, 
targetBasePath, baseFileFormat).syncHoodieTable();
+    } catch (Throwable e) {
+      throw new HoodieException("Could not sync using the meta sync class " + 
metaSyncClass, e);
+    }
+  }
+
+  static AbstractSyncTool createMetaSyncClass(String metaSyncClass,

Review comment:
       IIUC "meta sync" and "hudi sync" or "hoodie sync" mean the same thing. 
Since we call the module `hudi-sync` then we should just stick to it and call 
`hoodieSync` throughout the codebase. no `metaSync` at all. WDYT?

##########
File path: 
hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/HoodieSyncConfig.java
##########
@@ -0,0 +1,185 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.sync.common;
+
+import org.apache.hudi.common.config.ConfigProperty;
+import org.apache.hudi.common.config.HoodieConfig;
+import org.apache.hudi.common.config.HoodieMetadataConfig;
+import org.apache.hudi.common.config.TypedProperties;
+import org.apache.hudi.common.table.HoodieTableConfig;
+import org.apache.hudi.common.util.Option;
+import org.apache.hudi.keygen.constant.KeyGeneratorOptions;
+
+import com.beust.jcommander.Parameter;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.function.Function;
+
+/**
+ * Configs needed to sync data into external meta stores, catalogs, etc.
+ */
+public class HoodieSyncConfig extends HoodieConfig {

Review comment:
       like other config classes, would be helpful to have builder support and 
`getProps()` API.

##########
File path: 
hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveSyncConfig.java
##########
@@ -240,41 +240,10 @@ public HiveSyncConfig(TypedProperties props) {
     this.syncAsSparkDataSourceTable = 
getBooleanOrDefault(HIVE_SYNC_AS_DATA_SOURCE_TABLE);
     this.sparkSchemaLengthThreshold = 
getIntOrDefault(HIVE_SYNC_SCHEMA_STRING_LENGTH_THRESHOLD);
     this.createManagedTable = getBooleanOrDefault(HIVE_CREATE_MANAGED_TABLE);
-    this.bucketSpec = props.getString(HIVE_SYNC_BUCKET_SYNC_SPEC.key(), null);
+    this.bucketSpec = getStringOrDefault(HIVE_SYNC_BUCKET_SYNC_SPEC);

Review comment:
       @rmahindra123 Not sure why this was particularly set to `null`. seems 
like the default `""` is ok.

##########
File path: 
hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveSyncConfig.java
##########
@@ -240,41 +240,10 @@ public HiveSyncConfig(TypedProperties props) {
     this.syncAsSparkDataSourceTable = 
getBooleanOrDefault(HIVE_SYNC_AS_DATA_SOURCE_TABLE);
     this.sparkSchemaLengthThreshold = 
getIntOrDefault(HIVE_SYNC_SCHEMA_STRING_LENGTH_THRESHOLD);
     this.createManagedTable = getBooleanOrDefault(HIVE_CREATE_MANAGED_TABLE);
-    this.bucketSpec = props.getString(HIVE_SYNC_BUCKET_SYNC_SPEC.key(), null);
+    this.bucketSpec = getStringOrDefault(HIVE_SYNC_BUCKET_SYNC_SPEC);
     this.syncComment = getBooleanOrDefault(HIVE_SYNC_COMMENT);
   }
 
-  // enhance the similar function in child class
-  public static HiveSyncConfig copy(HiveSyncConfig cfg) {

Review comment:
       this is not used.

##########
File path: 
hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/AbstractSyncTool.java
##########
@@ -41,7 +41,7 @@ public AbstractSyncTool(TypedProperties props, Configuration 
conf, FileSystem fs
 
   @Deprecated
   public AbstractSyncTool(Properties props, FileSystem fileSystem) {
-    this(new TypedProperties(props), new Configuration(), fileSystem);
+    this(new TypedProperties(props), fileSystem.getConf(), fileSystem);

Review comment:
       @rmahindra123 if user provides an fs, shall we always make use of its 
conf? Actually not sure why the new API allow passing a different conf rather 
than use the one from `fileSystem`

##########
File path: 
hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/replication/GlobalHiveSyncConfig.java
##########
@@ -18,15 +18,24 @@
 
 package org.apache.hudi.hive.replication;
 
-import com.beust.jcommander.Parameter;
+import org.apache.hudi.common.config.TypedProperties;
 import org.apache.hudi.hive.HiveSyncConfig;
 
+import com.beust.jcommander.Parameter;
+
 public class GlobalHiveSyncConfig extends HiveSyncConfig {
   @Parameter(names = {"--replicated-timestamp"}, description = "Add globally 
replicated timestamp to enable consistent reads across clusters")
   public String globallyReplicatedTimeStamp;
 
+  public GlobalHiveSyncConfig() {
+  }
+
+  public GlobalHiveSyncConfig(TypedProperties props) {
+    super(props);
+  }
+
   public static GlobalHiveSyncConfig copy(GlobalHiveSyncConfig cfg) {
-    GlobalHiveSyncConfig newConfig = new GlobalHiveSyncConfig();
+    GlobalHiveSyncConfig newConfig = new GlobalHiveSyncConfig(cfg.getProps());
     newConfig.basePath = cfg.basePath;
     newConfig.assumeDatePartitioning = cfg.assumeDatePartitioning;

Review comment:
       the existing copy does not seem to copy everything, like `props`. 




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to