maytasm commented on a change in pull request #9714: More Hadoop integration 
tests
URL: https://github.com/apache/druid/pull/9714#discussion_r409922084
 
 

 ##########
 File path: 
integration-tests/src/test/java/org/apache/druid/tests/hadoop/ITHadoopIndexTest.java
 ##########
 @@ -19,87 +19,147 @@
 
 package org.apache.druid.tests.hadoop;
 
-import com.google.inject.Inject;
+import com.google.common.collect.ImmutableList;
+import org.apache.druid.indexer.partitions.DimensionBasedPartitionsSpec;
+import org.apache.druid.indexer.partitions.HashedPartitionsSpec;
+import org.apache.druid.indexer.partitions.SingleDimensionPartitionsSpec;
 import org.apache.druid.java.util.common.StringUtils;
 import org.apache.druid.java.util.common.logger.Logger;
-import org.apache.druid.testing.IntegrationTestingConfig;
 import org.apache.druid.testing.guice.DruidTestModuleFactory;
-import org.apache.druid.testing.utils.ITRetryUtil;
 import org.apache.druid.tests.TestNGGroup;
-import org.apache.druid.tests.indexer.AbstractIndexerTest;
-import org.testng.annotations.AfterClass;
-import org.testng.annotations.BeforeClass;
+import org.apache.druid.tests.indexer.AbstractITBatchIndexTest;
+import org.testng.annotations.DataProvider;
 import org.testng.annotations.Guice;
 import org.testng.annotations.Test;
 
+import java.io.Closeable;
+import java.util.UUID;
+import java.util.function.Function;
+
+/**
+ * IMPORTANT:
 + * To run this test, you must:
 + * 1) Copy wikipedia_index_data1.json, wikipedia_index_data2.json, and 
wikipedia_index_data3.json
 + *    located in integration-tests/src/test/resources/data/batch_index/json to 
your HDFS at the location configured via the override configs in step 2.
+ *    If using the Docker-based Hadoop container, this is automatically done 
by the integration tests.
+ * 2) Provide -Doverride.config.path=<PATH_TO_FILE> with HDFS configs set. See
+ *    integration-tests/docker/environment-configs/override-examples/hdfs for 
env vars to provide.
+ * 3) Run the test with -Dstart.hadoop.docker=true 
-Dextra.datasource.name.suffix='' in the mvn command
+ */
 @Test(groups = TestNGGroup.HADOOP_INDEX)
 @Guice(moduleFactory = DruidTestModuleFactory.class)
-public class ITHadoopIndexTest extends AbstractIndexerTest
+public class ITHadoopIndexTest extends AbstractITBatchIndexTest
 {
   private static final Logger LOG = new Logger(ITHadoopIndexTest.class);
+
   private static final String BATCH_TASK = "/hadoop/batch_hadoop_indexer.json";
   private static final String BATCH_QUERIES_RESOURCE = 
"/hadoop/batch_hadoop_queries.json";
   private static final String BATCH_DATASOURCE = "batchHadoop";
-  private boolean dataLoaded = false;
 
-  @Inject
-  private IntegrationTestingConfig config;
+  private static final String INDEX_TASK = 
"/hadoop/wikipedia_hadoop_index_task.json";
+  private static final String INDEX_QUERIES_RESOURCE = 
"/indexer/wikipedia_index_queries.json";
+  private static final String INDEX_DATASOURCE = "wikipedia_hadoop_index_test";
 
-  @BeforeClass
-  public void beforeClass()
-  {
-    loadData(config.getProperty("hadoopTestDir") + "/batchHadoop1");
-    dataLoaded = true;
-  }
+  private static final String REINDEX_TASK = 
"/hadoop/wikipedia_hadoop_reindex_task.json";
+  private static final String REINDEX_QUERIES_RESOURCE = 
"/indexer/wikipedia_reindex_queries.json";
+  private static final String REINDEX_DATASOURCE = 
"wikipedia_hadoop_reindex_test";
 
-  @Test
-  public void testHadoopIndex() throws Exception
+  @DataProvider
+  public static Object[][] resources()
   {
-    queryHelper.testQueriesFromFile(BATCH_QUERIES_RESOURCE, 2);
+    return new Object[][]{
+        {new HashedPartitionsSpec(3, null, null)},
+        {new HashedPartitionsSpec(null, 3, ImmutableList.of("page"))},
+        {new HashedPartitionsSpec(null, 3, ImmutableList.of("page", "user"))},
+        {new SingleDimensionPartitionsSpec(1000, null, null, false)},
+        {new SingleDimensionPartitionsSpec(1000, null, "page", false)},
+        {new SingleDimensionPartitionsSpec(1000, null, null, true)},
+
+        //{new HashedPartitionsSpec(null, 3, null)} // this results in a bug 
where the segments have 0 rows
+    };
   }
 
-  private void loadData(String hadoopDir)
+  @Test
+  public void testLegacyITHadoopIndexTest() throws Exception
   {
-    String indexerSpec;
+    try (
+        final Closeable ignored0 = unloader(BATCH_DATASOURCE + 
config.getExtraDatasourceNameSuffix());
+    ) {
+      final Function<String, String> specPathsTransform = spec -> {
+        try {
+          String path = "/batch_index/tsv";
 
 Review comment:
   currently /batch_index/tsv requires manual setup 
   From the integration-tests/README.md ... 
   ```
   Currently, ITHadoopIndexTest can only be run with your own Druid + Hadoop 
cluster by following the below steps:
   Create a directory called batchHadoop1 in the hadoop file system
   (anywhere you want) and put batch_hadoop.data 
(integration-tests/src/test/resources/hadoop/batch_hadoop.data) 
   into that directory (as its only file).
   ```
   We should automatically setup this dir for the hadoop docker container 
(similar to how we setup the wikipedia json files). You can create a new dir in 
integration-tests/src/test/resources/data/batch_index called tsv and copy 
integration-tests/src/test/resources/hadoop/batch_hadoop.data to 
integration-tests/src/test/resources/data/batch_index/tsv (the run-cluster 
script should handle the rest and create /batch_index/tsv with 
batch_hadoop.data inside)

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


With regards,
Apache Git Services

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to