kfaraz commented on code in PR #19158:
URL: https://github.com/apache/druid/pull/19158#discussion_r2938012208


##########
embedded-tests/src/test/java/org/apache/druid/testing/embedded/hdfs/HdfsToS3ParallelIndexTest.java:
##########
@@ -0,0 +1,92 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.druid.testing.embedded.hdfs;
+
+import org.apache.druid.java.util.common.Pair;
+import org.apache.druid.java.util.common.logger.Logger;
+import org.apache.druid.testing.embedded.EmbeddedDruidCluster;
+import org.apache.druid.testing.embedded.minio.MinIOStorageResource;
+import org.junit.jupiter.api.AfterAll;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.MethodSource;
+import software.amazon.awssdk.services.s3.model.DeleteObjectRequest;
+import software.amazon.awssdk.services.s3.model.ListObjectsRequest;
+
+/**
+ * Embedded parallel index test that reads input data from HDFS (in-process 
MiniDFSCluster)
+ * and stores segments in S3 (MinIO testcontainer).
+ *
+ * <p>The MinIO resource is registered after the HDFS resource so that the S3 
deep-storage
+ * configuration ({@code druid.storage.type=s3}) is applied last. The HDFS 
input-source
+ * connection properties ({@code hadoop.fs.defaultFS}) remain active and are 
used by the
+ * indexer to read data from HDFS.
+ */
+public class HdfsToS3ParallelIndexTest extends 
AbstractHdfsInputSourceParallelIndexTest
+{
+  private static final Logger LOG = new 
Logger(HdfsToS3ParallelIndexTest.class);
+
+  /** HDFS is the input source only — deep storage is S3/MinIO. */
+  private final HdfsStorageResource hdfsResource = new 
HdfsStorageResource(false);
+  private final MinIOStorageResource minIOResource = new 
MinIOStorageResource();
+
+  @Override
+  protected HdfsStorageResource getHdfsResource()
+  {
+    return hdfsResource;
+  }
+
+  @Override
+  protected void addResources(EmbeddedDruidCluster cluster)
+  {
+    // HDFS resource: starts the MiniDFSCluster and sets hadoop.fs.defaultFS 
so the indexer
+    // can read from HDFS. configureAsDeepStorage=false means it does NOT set 
druid.storage.type.
+    cluster.addResource(hdfsResource);
+
+    // MinIO resource: configures S3/MinIO as deep storage 
(druid.storage.type=s3, etc.).
+    // Adding it after the HDFS resource ensures the S3 deep-storage settings 
win.
+    // MinIOStorageResource.onStarted() registers S3StorageDruidModule 
automatically.

Review Comment:
   I think this comment line can be omitted.



##########
embedded-tests/src/test/java/org/apache/druid/testing/embedded/hdfs/HdfsToAzureParallelIndexTest.java:
##########
@@ -0,0 +1,83 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.druid.testing.embedded.hdfs;
+
+import org.apache.druid.java.util.common.Pair;
+import org.apache.druid.java.util.common.logger.Logger;
+import org.apache.druid.testing.embedded.EmbeddedDruidCluster;
+import org.apache.druid.testing.embedded.azure.AzureStorageResource;
+import org.junit.jupiter.api.AfterAll;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.MethodSource;
+
+/**
+ * Embedded parallel index test that reads input data from HDFS (in-process 
MiniDFSCluster)
+ * and stores segments in Azure Blob Storage (Azurite testcontainer).
+ */
+public class HdfsToAzureParallelIndexTest extends 
AbstractHdfsInputSourceParallelIndexTest
+{
+  private static final Logger LOG = new 
Logger(HdfsToAzureParallelIndexTest.class);
+
+  /** HDFS is the input source only — deep storage is Azure. */
+  private final HdfsStorageResource hdfsResource = new 
HdfsStorageResource(false);
+  private final AzureStorageResource azureResource = new 
AzureStorageResource();
+
+  @Override
+  protected HdfsStorageResource getHdfsResource()
+  {
+    return hdfsResource;
+  }
+
+  @Override
+  protected void addResources(EmbeddedDruidCluster cluster)
+  {
+    // HDFS resource: starts the MiniDFSCluster and sets hadoop.fs.defaultFS 
so the indexer
+    // can read from HDFS. It does NOT configure Azure-overriding deep-storage 
properties.
+    cluster.addResource(hdfsResource);
+
+    // Azure resource: configures Azure as deep storage 
(druid.storage.type=azure, etc.).
+    // Adding it after the HDFS resource ensures Azure's deep-storage settings 
are not
+    // overridden by anything the HDFS resource might set.
+    // AzureStorageResource.onStarted() registers AzureStorageDruidModule 
automatically.
+    cluster.addResource(azureResource);
+  }
+
+  @AfterAll
+  public void deleteSegmentsFromAzure()
+  {
+    try {
+      // The AzureStorageResource creates the container; deleting it cleans up 
all segments
+      // written during the test run.
+      azureResource.getStorageClient()
+                   
.getContainerReference(azureResource.getAzureContainerName())
+                   .deleteIfExists();

Review Comment:
   `AbstractAzureInputSourceParallelIndexTest` simply does 
`azureResource.deleteStorageContainer()` for this. Would that not suffice?



##########
embedded-tests/src/test/java/org/apache/druid/testing/embedded/hdfs/HdfsStorageResource.java:
##########
@@ -0,0 +1,142 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.druid.testing.embedded.hdfs;
+
+import org.apache.druid.java.util.common.FileUtils;
+import org.apache.druid.storage.hdfs.HdfsStorageDruidModule;
+import org.apache.druid.testing.embedded.EmbeddedDruidCluster;
+import org.apache.druid.testing.embedded.EmbeddedResource;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.hdfs.MiniDFSCluster;
+
+import java.io.File;
+import java.io.IOException;
+
+/**
+ * An {@link EmbeddedResource} that starts an in-process HDFS cluster using 
{@link MiniDFSCluster}.
+ * No Docker container is needed; MiniDFSCluster runs entirely within the test 
JVM.
+ *
+ * <p>When {@code configureAsDeepStorage} is {@code true} (the default), this 
resource also
+ * configures the embedded Druid cluster to use HDFS for segment deep storage 
and task logs.
+ * When {@code false}, it only provides HDFS connectivity (useful when HDFS is 
the input source
+ * but a different system is the deep storage).
+ */
+public class HdfsStorageResource implements EmbeddedResource
+{
+  private final boolean configureAsDeepStorage;
+  private MiniDFSCluster miniDFSCluster;

Review Comment:
   Nice!



##########
embedded-tests/src/test/java/org/apache/druid/testing/embedded/hdfs/HdfsToGcsParallelIndexTest.java:
##########
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.druid.testing.embedded.hdfs;
+
+import com.google.cloud.storage.Storage;
+import org.apache.druid.java.util.common.Pair;
+import org.apache.druid.java.util.common.logger.Logger;
+import org.apache.druid.testing.embedded.EmbeddedDruidCluster;
+import org.apache.druid.testing.embedded.gcs.GoogleCloudStorageResource;
+import org.apache.druid.testing.embedded.gcs.GoogleStorageTestModule;
+import org.junit.jupiter.api.AfterAll;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.MethodSource;
+
+/**
+ * Embedded parallel index test that reads input data from HDFS (in-process 
MiniDFSCluster)
+ * and stores segments in Google Cloud Storage (FakeGcsServer testcontainer).
+ */
+public class HdfsToGcsParallelIndexTest extends 
AbstractHdfsInputSourceParallelIndexTest
+{
+  private static final Logger LOG = new 
Logger(HdfsToGcsParallelIndexTest.class);
+
+  /** HDFS is the input source only — deep storage is GCS. */
+  private final HdfsStorageResource hdfsResource = new 
HdfsStorageResource(false);
+  private final GoogleCloudStorageResource gcsResource = new 
GoogleCloudStorageResource();
+
+  @Override
+  protected HdfsStorageResource getHdfsResource()
+  {
+    return hdfsResource;
+  }
+
+  @Override
+  protected void addResources(EmbeddedDruidCluster cluster)
+  {
+    // HDFS resource: starts the MiniDFSCluster and sets hadoop.fs.defaultFS.
+    // Does NOT configure HDFS as deep storage — GCS fills that role.
+    cluster.addResource(hdfsResource);
+
+    // GCS resource: configures GCS as deep storage 
(druid.storage.type=google, etc.)
+    // and creates the GCS bucket. GoogleCloudStorageResource.onStarted() sets 
the
+    // storageUrl from the running FakeGcsServer container.
+    cluster.addResource(gcsResource);
+  }
+
+  @AfterAll
+  public void deleteSegmentsFromGcs()
+  {
+    try (Storage storage = 
GoogleStorageTestModule.createStorageForTests(gcsResource.getUrl())) {
+      // Delete all blobs under the deep-storage prefix for this test run.
+      storage.list(gcsResource.getBucket()).iterateAll()
+             .forEach(blob -> blob.delete());
+    }

Review Comment:
   `AbstractGcsInputSourceParallelIndexTest` uses 
`gcsResource.deletePrefixFolderFromGcs(dataSource);` instead of this. I suspect 
that approach is insufficient, since it would end up deleting only the files 
for the last test datasource.
   
   Thinking about it though, I think we might even skip this deletion 
altogether since the cluster itself (along with the GCS testcontainer) will be 
torn down after this anyway.



##########
embedded-tests/src/test/java/org/apache/druid/testing/embedded/hdfs/HdfsToAzureParallelIndexTest.java:
##########
@@ -0,0 +1,83 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.druid.testing.embedded.hdfs;
+
+import org.apache.druid.java.util.common.Pair;
+import org.apache.druid.java.util.common.logger.Logger;
+import org.apache.druid.testing.embedded.EmbeddedDruidCluster;
+import org.apache.druid.testing.embedded.azure.AzureStorageResource;
+import org.junit.jupiter.api.AfterAll;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.MethodSource;
+
+/**
+ * Embedded parallel index test that reads input data from HDFS (in-process 
MiniDFSCluster)
+ * and stores segments in Azure Blob Storage (Azurite testcontainer).
+ */
+public class HdfsToAzureParallelIndexTest extends 
AbstractHdfsInputSourceParallelIndexTest
+{
+  private static final Logger LOG = new 
Logger(HdfsToAzureParallelIndexTest.class);
+
+  /** HDFS is the input source only — deep storage is Azure. */
+  private final HdfsStorageResource hdfsResource = new 
HdfsStorageResource(false);
+  private final AzureStorageResource azureResource = new 
AzureStorageResource();
+
+  @Override
+  protected HdfsStorageResource getHdfsResource()
+  {
+    return hdfsResource;
+  }
+
+  @Override
+  protected void addResources(EmbeddedDruidCluster cluster)
+  {
+    // HDFS resource: starts the MiniDFSCluster and sets hadoop.fs.defaultFS 
so the indexer
+    // can read from HDFS. It does NOT configure Azure-overriding deep-storage 
properties.
+    cluster.addResource(hdfsResource);
+
+    // Azure resource: configures Azure as deep storage 
(druid.storage.type=azure, etc.).
+    // Adding it after the HDFS resource ensures Azure's deep-storage settings 
are not
+    // overridden by anything the HDFS resource might set.
+    // AzureStorageResource.onStarted() registers AzureStorageDruidModule 
automatically.

Review Comment:
   I think this line can be omitted.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to