[GitHub] [incubator-hudi] yanghua commented on a change in pull request #1554: [HUDI-704]Add test for RepairsCommand

2020-05-06 Thread GitBox


yanghua commented on a change in pull request #1554:
URL: https://github.com/apache/incubator-hudi/pull/1554#discussion_r420784920



##
File path: 
hudi-cli/src/main/java/org/apache/hudi/cli/commands/RepairsCommand.java
##
@@ -64,19 +69,35 @@ public String deduplicate(
   @CliOption(key = {"repairedOutputPath"}, help = "Location to place the 
repaired files",
   mandatory = true) final String repairedOutputPath,
   @CliOption(key = {"sparkProperties"}, help = "Spark Properties File 
Path",
-  mandatory = true) final String sparkPropertiesPath)
+  unspecifiedDefaultValue = "") String sparkPropertiesPath,
+  @CliOption(key = "sparkMaster", unspecifiedDefaultValue = "", help = 
"Spark Master ") String master,

Review comment:
   `"Spark Master "` -> `"Spark Master"`?

##
File path: 
hudi-cli/src/main/java/org/apache/hudi/cli/commands/RepairsCommand.java
##
@@ -64,19 +69,35 @@ public String deduplicate(
   @CliOption(key = {"repairedOutputPath"}, help = "Location to place the 
repaired files",
   mandatory = true) final String repairedOutputPath,
   @CliOption(key = {"sparkProperties"}, help = "Spark Properties File 
Path",
-  mandatory = true) final String sparkPropertiesPath)
+  unspecifiedDefaultValue = "") String sparkPropertiesPath,
+  @CliOption(key = "sparkMaster", unspecifiedDefaultValue = "", help = 
"Spark Master ") String master,
+  @CliOption(key = "sparkMemory", unspecifiedDefaultValue = "4G",
+  help = "Spark executor memory") final String sparkMemory,
+  @CliOption(key = {"dryrun"},
+  help = "Should we actually remove duplicates or just run and store 
result to repairedOutputPath",
+  unspecifiedDefaultValue = "true") final boolean dryRun)
   throws Exception {
+if (StringUtils.isNullOrEmpty(sparkPropertiesPath)) {
+  sparkPropertiesPath =
+  
Utils.getDefaultPropertiesFile(JavaConverters.mapAsScalaMapConverter(System.getenv()).asScala());
+}
+
 SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
-sparkLauncher.addAppArgs(SparkMain.SparkCommand.DEDUPLICATE.toString(), 
duplicatedPartitionPath, repairedOutputPath,
-HoodieCLI.getTableMetaClient().getBasePath());
+sparkLauncher.addAppArgs(SparkMain.SparkCommand.DEDUPLICATE.toString(), 
master, sparkMemory,

Review comment:
   The same suggestion as before: we should try to define a data structure 
for these arguments. We can refactor it later.

##
File path: hudi-cli/src/main/java/org/apache/hudi/cli/commands/SparkMain.java
##
@@ -73,8 +73,8 @@ public static void main(String[] args) throws Exception {
 returnCode = rollback(jsc, args[1], args[2]);
 break;
   case DEDUPLICATE:
-assert (args.length == 4);
-returnCode = deduplicatePartitionPath(jsc, args[1], args[2], args[3]);
+assert (args.length == 7);
+returnCode = deduplicatePartitionPath(jsc, args[3], args[4], args[5], 
args[6]);

Review comment:
   IMHO, we also need to refactor the arg parsing. But not in this PR.

##
File path: 
hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestRepairsCommand.java
##
@@ -0,0 +1,179 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.cli.integ;
+
+import org.apache.avro.Schema;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hudi.avro.HoodieAvroUtils;
+import org.apache.hudi.cli.AbstractShellIntegrationTest;
+import org.apache.hudi.cli.HoodieCLI;
+import org.apache.hudi.cli.commands.RepairsCommand;
+import org.apache.hudi.cli.commands.TableCommand;
+import org.apache.hudi.common.HoodieClientTestUtils;
+import org.apache.hudi.common.HoodieTestDataGenerator;
+import org.apache.hudi.common.fs.FSUtils;
+import org.apache.hudi.common.model.HoodieBaseFile;
+import org.apache.hudi.common.model.HoodieLogFile;
+import org.apache.hudi.common.model.HoodieRecord;
+import org.apache.hudi.common.model.HoodieTableType;
+import org.apache.hudi.common.table.HoodieTableMetaClient;
+import org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion;
+import 

[GitHub] [incubator-hudi] yanghua commented on a change in pull request #1554: [HUDI-704]Add test for RepairsCommand

2020-05-01 Thread GitBox


yanghua commented on a change in pull request #1554:
URL: https://github.com/apache/incubator-hudi/pull/1554#discussion_r418430527



##
File path: 
hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestRepairsCommand.java
##
@@ -0,0 +1,206 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.cli.commands;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hudi.cli.AbstractShellIntegrationTest;
+import org.apache.hudi.cli.HoodieCLI;
+import org.apache.hudi.cli.HoodiePrintHelper;
+import org.apache.hudi.cli.HoodieTableHeaderFields;
+import org.apache.hudi.cli.common.HoodieTestCommitMetadataGenerator;
+import org.apache.hudi.common.HoodieTestDataGenerator;
+import org.apache.hudi.common.fs.FSUtils;
+import org.apache.hudi.common.model.HoodieTableType;
+import org.apache.hudi.common.table.HoodieTableMetaClient;
+import org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion;
+
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+import org.springframework.shell.core.CommandResult;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.net.URL;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Map;
+import java.util.Properties;
+import java.util.stream.Collectors;
+
+import static org.junit.Assert.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+/**
+ * Test class for {@link RepairsCommand}.
+ */
+public class TestRepairsCommand extends AbstractShellIntegrationTest {
+
+  private String tablePath;
+
+  @Before
+  public void init() throws IOException {
+String tableName = "test_table";
+tablePath = basePath + File.separator + tableName;
+
+// Create table and connect
+new TableCommand().createTable(
+tablePath, "test_table", HoodieTableType.COPY_ON_WRITE.name(),
+"", TimelineLayoutVersion.VERSION_1, 
"org.apache.hudi.common.model.HoodieAvroPayload");
+  }
+
+  /**
+   * Test case for dry run 'repair addpartitionmeta'.
+   */
+  @Test
+  public void testAddPartitionMetaWithDryRun() throws IOException {
+// create commit instant
+new File(tablePath + "/.hoodie/100.commit").createNewFile();
+
+// create partition path
+String partition1 = tablePath + File.separator + 
HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH;
+String partition2 = tablePath + File.separator + 
HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH;
+String partition3 = tablePath + File.separator + 
HoodieTestDataGenerator.DEFAULT_THIRD_PARTITION_PATH;
+fs.mkdirs(new Path(partition1));
+fs.mkdirs(new Path(partition2));
+fs.mkdirs(new Path(partition3));
+
+// default is dry run.
+CommandResult cr = getShell().executeCommand("repair addpartitionmeta");
+Assert.assertTrue(cr.isSuccess());
+
+// expected all 'No'.
+String[][] rows = FSUtils.getAllPartitionFoldersThreeLevelsDown(fs, 
tablePath)
+.stream()
+.map(partition -> new String[]{partition, "No", "None"})
+.toArray(String[][]::new);
+String expected = HoodiePrintHelper.print(new String[] 
{HoodieTableHeaderFields.HEADER_PARTITION_PATH,
+HoodieTableHeaderFields.HEADER_METADATA_PRESENT, 
HoodieTableHeaderFields.HEADER_REPAIR_ACTION}, rows);
+
+Assert.assertEquals(expected, cr.getResult().toString());
+  }
+
+  /**
+   * Test case for real run 'repair addpartitionmeta'.
+   */
+  @Test
+  public void testAddPartitionMetaWithRealRun() throws IOException {
+// create commit instant
+new File(tablePath + "/.hoodie/100.commit").createNewFile();

Review comment:
   ditto

##
File path: 
hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestRepairsCommand.java
##
@@ -0,0 +1,206 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ *