xushiyan commented on a change in pull request #1404: [HUDI-344] Improve
exporter tests
URL: https://github.com/apache/incubator-hudi/pull/1404#discussion_r392603554
##########
File path:
hudi-utilities/src/test/java/org/apache/hudi/utilities/TestHoodieSnapshotExporter.java
##########
@@ -18,205 +18,144 @@
package org.apache.hudi.utilities;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hudi.DataSourceWriteOptions;
-import org.apache.hudi.common.HoodieCommonTestHarness;
+import org.apache.hudi.client.HoodieWriteClient;
+import org.apache.hudi.common.HoodieClientTestHarness;
import org.apache.hudi.common.HoodieTestDataGenerator;
-import org.apache.hudi.common.model.HoodieTestUtils;
-import org.apache.hudi.common.util.FSUtils;
+import org.apache.hudi.common.model.HoodieAvroPayload;
+import org.apache.hudi.common.model.HoodieRecord;
+import org.apache.hudi.common.model.HoodieTableType;
+import org.apache.hudi.common.table.HoodieTableMetaClient;
+import org.apache.hudi.config.HoodieIndexConfig;
import org.apache.hudi.config.HoodieWriteConfig;
-import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.sql.Dataset;
-import org.apache.spark.sql.Row;
-import org.apache.spark.sql.SaveMode;
-import org.apache.spark.sql.SparkSession;
+import org.apache.hudi.index.HoodieIndex.IndexType;
+import org.apache.hudi.utilities.HoodieSnapshotExporter.Config;
+import org.apache.hadoop.fs.LocatedFileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.RemoteIterator;
+import org.apache.log4j.LogManager;
+import org.apache.log4j.Logger;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.sql.SparkSession;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
-import org.junit.rules.TemporaryFolder;
-import java.io.File;
import java.io.IOException;
-import java.util.HashMap;
+import java.util.Arrays;
import java.util.List;
-import java.util.Map;
import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
-public class TestHoodieSnapshotExporter extends HoodieCommonTestHarness {
- private static String TEST_WRITE_TOKEN = "1-0-1";
+public class TestHoodieSnapshotExporter extends HoodieClientTestHarness {
- private SparkSession spark = null;
- private HoodieTestDataGenerator dataGen = null;
- private String outputPath = null;
- private String rootPath = null;
- private FileSystem fs = null;
- private Map commonOpts;
- private HoodieSnapshotExporter.Config cfg;
- private JavaSparkContext jsc = null;
+ private static final Logger LOG =
LogManager.getLogger(TestHoodieSnapshotExporter.class);
+ private static final int NUM_RECORDS = 100;
+ private static final String COMMIT_TIME = "20200101000000";
+ private static final String PARTITION_PATH = "2020/01/01";
+ private static final String TABLE_NAME = "testing";
+ private String sourcePath;
+ private String targetPath;
@Before
- public void initialize() throws IOException {
- spark = SparkSession.builder()
- .appName("Hoodie Datasource test")
- .master("local[2]")
- .config("spark.serializer",
"org.apache.spark.serializer.KryoSerializer")
- .getOrCreate();
- jsc = new JavaSparkContext(spark.sparkContext());
- dataGen = new HoodieTestDataGenerator();
- folder.create();
- basePath = folder.getRoot().getAbsolutePath();
- fs = FSUtils.getFs(basePath, spark.sparkContext().hadoopConfiguration());
- commonOpts = new HashMap();
-
- commonOpts.put("hoodie.insert.shuffle.parallelism", "4");
- commonOpts.put("hoodie.upsert.shuffle.parallelism", "4");
- commonOpts.put(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY(),
"_row_key");
- commonOpts.put(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY(),
"partition");
- commonOpts.put(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY(),
"timestamp");
- commonOpts.put(HoodieWriteConfig.TABLE_NAME, "hoodie_test");
-
-
- cfg = new HoodieSnapshotExporter.Config();
-
- cfg.sourceBasePath = basePath;
- cfg.targetOutputPath = outputPath = basePath + "/target";
- cfg.outputFormat = "json";
- cfg.outputPartitionField = "partition";
-
+ public void setUp() throws Exception {
+ initSparkContexts();
+ initDFS();
+ dataGen = new HoodieTestDataGenerator(new String[] {PARTITION_PATH});
+
+ // Initialize test data dirs
+ sourcePath = dfsBasePath + "/source/";
+ targetPath = dfsBasePath + "/target/";
+ dfs.mkdirs(new Path(sourcePath));
+ dfs.mkdirs(new Path(targetPath));
+ HoodieTableMetaClient
+ .initTableType(jsc.hadoopConfiguration(), sourcePath,
HoodieTableType.COPY_ON_WRITE, TABLE_NAME,
+ HoodieAvroPayload.class.getName());
+
+ // Prepare data as source Hudi dataset
+ HoodieWriteConfig cfg = getHoodieWriteConfig(sourcePath);
+ HoodieWriteClient hdfsWriteClient = new HoodieWriteClient(jsc, cfg);
+ hdfsWriteClient.startCommitWithTime(COMMIT_TIME);
+ List<HoodieRecord> records = dataGen.generateInserts(COMMIT_TIME,
NUM_RECORDS);
+ JavaRDD<HoodieRecord> recordsRDD = jsc.parallelize(records, 1);
+ hdfsWriteClient.bulkInsert(recordsRDD, COMMIT_TIME);
+ hdfsWriteClient.close();
+
+ RemoteIterator<LocatedFileStatus> itr = dfs.listFiles(new
Path(sourcePath), true);
+ while (itr.hasNext()) {
+ LOG.info(">>> Prepared test file: " + itr.next().getPath());
+ }
}
@After
- public void cleanup() {
- if (spark != null) {
- spark.stop();
- }
+ public void tearDown() throws Exception {
+ cleanupSparkContexts();
+ cleanupDFS();
+ cleanupTestDataGenerator();
}
- @Test
- public void testSnapshotExporter() throws IOException {
- // Insert Operation
- List<String> records =
DataSourceTestUtils.convertToStringList(dataGen.generateInserts("000", 100));
- Dataset<Row> inputDF = spark.read().json(new
JavaSparkContext(spark.sparkContext()).parallelize(records, 2));
- inputDF.write().format("hudi")
- .options(commonOpts)
- .option(DataSourceWriteOptions.OPERATION_OPT_KEY(),
DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL())
- .mode(SaveMode.Overwrite)
- .save(basePath);
- long sourceCount = inputDF.count();
-
- HoodieSnapshotExporter hoodieSnapshotExporter = new
HoodieSnapshotExporter();
- hoodieSnapshotExporter.export(spark, cfg);
-
- long targetCount = spark.read().json(outputPath).count();
-
- assertTrue(sourceCount == targetCount);
-
- // Test Invalid OutputFormat
- cfg.outputFormat = "foo";
- int isError = hoodieSnapshotExporter.export(spark, cfg);
- assertTrue(isError == -1);
+ private HoodieWriteConfig getHoodieWriteConfig(String basePath) {
+ return HoodieWriteConfig.newBuilder()
+ .withPath(basePath)
+ .withEmbeddedTimelineServerEnabled(false)
+ .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA)
+ .withParallelism(2, 2)
+ .withBulkInsertParallelism(2)
+ .forTable(TABLE_NAME)
+
.withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(IndexType.BLOOM).build())
+ .build();
}
- // for testEmptySnapshotCopy
- public void init() throws IOException {
- TemporaryFolder folder = new TemporaryFolder();
- folder.create();
- rootPath = "file://" + folder.getRoot().getAbsolutePath();
- basePath = rootPath + "/" + HoodieTestUtils.RAW_TRIPS_TEST_NAME;
- outputPath = rootPath + "/output";
-
- final Configuration hadoopConf = HoodieTestUtils.getDefaultHadoopConf();
- fs = FSUtils.getFs(basePath, hadoopConf);
- HoodieTestUtils.init(hadoopConf, basePath);
+ @Test
+ public void testExportAsParquet() throws IOException {
+ HoodieSnapshotExporter.Config cfg = new Config();
+ cfg.sourceBasePath = sourcePath;
+ cfg.targetOutputPath = targetPath;
+ cfg.outputFormat = "parquet";
+ new
HoodieSnapshotExporter().export(SparkSession.builder().config(jsc.getConf()).getOrCreate(),
cfg);
+ assertEquals(NUM_RECORDS, sqlContext.read().parquet(targetPath).count());
+ assertTrue(dfs.exists(new Path(targetPath + "/_SUCCESS")));
Review comment:
@leesf The exporter seems to have removed that logic for the hudi case, though the
non-hudi case handles it automatically via Spark. I'll ensure both cases create the
success tag.
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
With regards,
Apache Git Services