This is an automated email from the ASF dual-hosted git repository.
yqm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/druid.git
The following commit(s) were added to refs/heads/master by this push:
new 8a2d2ed3c12 Add embedded test for object-storage-encoding format for
nested field (#18818)
8a2d2ed3c12 is described below
commit 8a2d2ed3c12a6b4188d8d014b780ae99e4efeabb
Author: Cece Mei <[email protected]>
AuthorDate: Tue Dec 9 11:51:24 2025 -0800
Add embedded test for object-storage-encoding format for nested field
(#18818)
* nested-it
* checkstyle
* license
* fix-test
* emitter-config
* setup
* prefix
* small-update
* Update
embedded-tests/src/test/java/org/apache/druid/testing/embedded/indexing/NestedDataFormatsTest.java
Co-authored-by: Kashif Faraz <[email protected]>
* datasourceWithDefaultFormat
* prefix
* Update
embedded-tests/src/test/java/org/apache/druid/testing/embedded/indexing/NestedDataFormatsTest.java
Co-authored-by: Kashif Faraz <[email protected]>
* style
* sql
* nested
* client-sql
* format
---------
Co-authored-by: Kashif Faraz <[email protected]>
---
.../embedded/indexing/NestedDataFormatsTest.java | 139 +++++++++++++++++++++
.../druid/indexing/common/task/TaskBuilder.java | 12 +-
pom.xml | 4 +-
.../testing/embedded/EmbeddedClusterApis.java | 5 +-
.../druid/testing/embedded/indexing/Resources.java | 17 +++
5 files changed, 169 insertions(+), 8 deletions(-)
diff --git
a/embedded-tests/src/test/java/org/apache/druid/testing/embedded/indexing/NestedDataFormatsTest.java
b/embedded-tests/src/test/java/org/apache/druid/testing/embedded/indexing/NestedDataFormatsTest.java
new file mode 100644
index 00000000000..2f892e86d02
--- /dev/null
+++
b/embedded-tests/src/test/java/org/apache/druid/testing/embedded/indexing/NestedDataFormatsTest.java
@@ -0,0 +1,139 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.druid.testing.embedded.indexing;
+
+import org.apache.druid.indexing.common.task.TaskBuilder;
+import org.apache.druid.java.util.common.StringUtils;
+import org.apache.druid.segment.IndexSpec;
+import org.apache.druid.segment.nested.NestedCommonFormatColumnFormatSpec;
+import org.apache.druid.segment.nested.ObjectStorageEncoding;
+import org.apache.druid.testing.embedded.EmbeddedBroker;
+import org.apache.druid.testing.embedded.EmbeddedClusterApis;
+import org.apache.druid.testing.embedded.EmbeddedCoordinator;
+import org.apache.druid.testing.embedded.EmbeddedDruidCluster;
+import org.apache.druid.testing.embedded.EmbeddedHistorical;
+import org.apache.druid.testing.embedded.EmbeddedIndexer;
+import org.apache.druid.testing.embedded.EmbeddedOverlord;
+import org.apache.druid.testing.embedded.EmbeddedRouter;
+import org.apache.druid.testing.embedded.junit5.EmbeddedClusterTestBase;
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.Test;
+
+/**
+ * Embedded tests for nested data ingested with different
+ * {@link NestedCommonFormatColumnFormatSpec} settings.
+ * <p>
+ * The same input ({@code Resources.HttpData.kttmNested1Day()}) is ingested twice:
+ * once without any explicit index spec into {@code datasourceWithDefaultFormat}
+ * (see {@link #ingestWithDefaultFormat()}), and once with
+ * {@link ObjectStorageEncoding#NONE} inside the test method, so that the two
+ * datasources can be compared against each other.
+ */
+public class NestedDataFormatsTest extends EmbeddedClusterTestBase
+{
+ private final EmbeddedBroker broker = new EmbeddedBroker();
+ private final EmbeddedOverlord overlord = new EmbeddedOverlord();
+ private final EmbeddedCoordinator coordinator = new EmbeddedCoordinator();
+
+ // Baseline datasource, populated once by ingestWithDefaultFormat().
+ private final String datasourceWithDefaultFormat =
EmbeddedClusterApis.createTestDatasourceName();
+
+ /**
+ * Builds the cluster for this test via
+ * {@code EmbeddedDruidCluster.withEmbeddedDerbyAndZookeeper()}, with a latchable
+ * emitter and one overlord, coordinator, indexer, historical, broker and router.
+ * <p>
+ * NOTE(review): the value 120 passed to
+ * {@code useDefaultTimeoutForLatchableEmitter} is presumably in seconds - confirm.
+ */
+ @Override
+ protected EmbeddedDruidCluster createCluster()
+ {
+ return EmbeddedDruidCluster.withEmbeddedDerbyAndZookeeper()
+ .useLatchableEmitter()
+ .useDefaultTimeoutForLatchableEmitter(120)
+ .addServer(overlord)
+ .addServer(coordinator)
+ .addServer(new EmbeddedIndexer())
+ .addServer(new EmbeddedHistorical())
+ .addServer(broker)
+ .addServer(new EmbeddedRouter());
+ }
+
+ /**
+ * Ingests the kttm nested data into {@code datasourceWithDefaultFormat} using an
+ * index_parallel task with schema discovery and no explicit index spec, then
+ * waits until all segments of that datasource are available.
+ * <p>
+ * NOTE(review): {@code @BeforeAll} on a non-static method requires the per-class
+ * test instance lifecycle; presumably configured by
+ * {@link EmbeddedClusterTestBase} - confirm.
+ */
+ @BeforeAll
+ protected void ingestWithDefaultFormat()
+ {
+ final TaskBuilder.IndexParallel indexTask =
+ TaskBuilder.ofTypeIndexParallel()
+ .dataSource(datasourceWithDefaultFormat)
+ .timestampColumn("timestamp")
+ .jsonInputFormat()
+ .inputSource(Resources.HttpData.kttmNested1Day())
+ .schemaDiscovery();
+
+ final String taskId =
EmbeddedClusterApis.newTaskId(datasourceWithDefaultFormat);
+ cluster.callApi().runTask(indexTask.withId(taskId), overlord);
+
cluster.callApi().waitForAllSegmentsToBeAvailable(datasourceWithDefaultFormat,
coordinator, broker);
+ }
+
+ /**
+ * Ingests the same data with {@link ObjectStorageEncoding#NONE} and compares it
+ * with the default-format datasource on three points: segment metadata (row
+ * count and size), a group-by on a nested field, and reconstruction of the
+ * nested JSON columns.
+ * <p>
+ * NOTE(review): {@code dataSource} is not declared in this class; presumably it
+ * is a per-test datasource name inherited from {@link EmbeddedClusterTestBase}
+ * - confirm.
+ */
+ @Test
+ public void test_objectStorageEncoding()
+ {
+ // Ingest the kttm data again, this time skipping the raw JSON smile format
+ // (ObjectStorageEncoding.NONE), to compare against datasourceWithDefaultFormat.
+ NestedCommonFormatColumnFormatSpec spec =
+
NestedCommonFormatColumnFormatSpec.builder().setObjectStorageEncoding(ObjectStorageEncoding.NONE).build();
+ final TaskBuilder.IndexParallel indexTask =
+ TaskBuilder.ofTypeIndexParallel()
+ .dataSource(dataSource)
+ .timestampColumn("timestamp")
+ .jsonInputFormat()
+ .inputSource(Resources.HttpData.kttmNested1Day())
+ .schemaDiscovery()
+ .tuningConfig(t ->
t.withIndexSpec(IndexSpec.builder().withAutoColumnFormatSpec(spec).build()));
+ final String taskId = EmbeddedClusterApis.newTaskId(dataSource);
+ cluster.callApi().runTask(indexTask.withId(taskId), overlord);
+ cluster.callApi().waitForAllSegmentsToBeAvailable(dataSource, coordinator,
broker);
+
+ // Ingestion without the raw JSON smile format must work: same row count as
+ // the default format, with roughly 20% less segment storage
+ // (53,000,804 bytes vs 41,938,750 bytes).
+ // NOTE(review): runSql appears to substitute the datasource into the '%s'
+ // placeholder and return the result as CSV text - confirm.
+ final String metadataSql = "select sum(num_rows), sum(size) from
sys.segments where datasource = '%s'";
+ final String defaultFormatResult = cluster.runSql(metadataSql,
datasourceWithDefaultFormat);
+ final String noneObjectStorageFormatResult = cluster.runSql(metadataSql,
dataSource);
+ Assertions.assertEquals(StringUtils.format("%d,%d", 465_346, 53_000_804),
defaultFormatResult);
+ Assertions.assertEquals(StringUtils.format("%d,%d", 465_346, 41_938_750),
noneObjectStorageFormatResult);
+
+ // Querying a nested field must return the same result for both datasources.
+ final String groupByQuery =
+ """
+ select json_value(event, '$.type') as event_type, count(*) as
total from %s
+ group by 1 order by 2 desc, 1 asc limit 10
+ """;
+ final String queryResultDefaultFormat = cluster.runSql(groupByQuery,
datasourceWithDefaultFormat);
+ final String queryResultNoneObjectStorage = cluster.runSql(groupByQuery,
dataSource);
+ Assertions.assertEquals(queryResultDefaultFormat,
queryResultNoneObjectStorage);
+
+ // Reconstructing the JSON columns must work: the ordering of the fields
+ // differs between the two formats, but all values are preserved.
+ final String scanQuery =
+ """
+ select event, to_json_string(agent) as agent from %s
+ where json_value(event, '$.type') = 'PercentClear' and
json_value(agent, '$.os') = 'Android'
+ order by __time asc limit 1
+ """;
+ final String scanQueryResultDefaultFormat = cluster.runSql(scanQuery,
datasourceWithDefaultFormat);
+ final String scanQueryResultNoneObjectStorage = cluster.runSql(scanQuery,
dataSource);
+ // CHECKSTYLE: text blocks not supported in current Checkstyle version
+ // Default format: fields come back in the order shown in the expected text below.
+ Assertions.assertEquals(
+ """
+
"{""type"":""PercentClear"",""percentage"":85}","{""type"":""Mobile
Browser"",""category"":""Smartphone"",""browser"":""Chrome
Mobile"",""browser_version"":""50.0.2661.89"",""os"":""Android"",""platform"":""Android""}"
+ """.trim(),
+ scanQueryResultDefaultFormat
+ );
+ // ObjectStorageEncoding.NONE: same values, but the expected field order is alphabetical.
+ Assertions.assertEquals(
+ """
+
"{""percentage"":85,""type"":""PercentClear""}","{""browser"":""Chrome
Mobile"",""browser_version"":""50.0.2661.89"",""category"":""Smartphone"",""os"":""Android"",""platform"":""Android"",""type"":""Mobile
Browser""}"
+ """.trim(),
+ scanQueryResultNoneObjectStorage
+ );
+ }
+}
diff --git
a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/TaskBuilder.java
b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/TaskBuilder.java
index 15f0d62ac45..31a8d783603 100644
---
a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/TaskBuilder.java
+++
b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/TaskBuilder.java
@@ -62,9 +62,9 @@ import java.util.stream.Stream;
* explicitly.
*
* @param <Self> Type of this builder itself
- * @param <C> Type of tuning config used by this builder.
- * @param <T> Type of task created by this builder.
- * @param <CB> Type of tuning config builder
+ * @param <C> Type of tuning config used by this builder.
+ * @param <T> Type of task created by this builder.
+ * @param <CB> Type of tuning config builder
* @see #ofTypeIndex()
* @see #tuningConfig(Consumer) to specify the {@code tuningConfig}.
*/
@@ -286,6 +286,12 @@ public abstract class TaskBuilder<
return (Self) this;
}
+ /**
+ * Sets the dimensions of the data schema to a {@code DimensionsSpec} built with
+ * {@code useSchemaDiscovery(true)} and no other builder options.
+ *
+ * @return this builder, cast to {@code Self}, for call chaining
+ */
+ public Self schemaDiscovery()
+ {
+
dataSchema.withDimensions(DimensionsSpec.builder().useSchemaDiscovery(true).build());
+ return (Self) this;
+ }
+
public Self metricAggregates(AggregatorFactory... aggregators)
{
dataSchema.withAggregators(aggregators);
diff --git a/pom.xml b/pom.xml
index 13ce80b8f2a..55c65a262d4 100644
--- a/pom.xml
+++ b/pom.xml
@@ -70,7 +70,7 @@
</scm>
<properties>
- <java.version>11</java.version>
+ <java.version>17</java.version>
<maven.compiler.release>${java.version}</maven.compiler.release>
<project.build.resourceEncoding>UTF-8</project.build.resourceEncoding>
<aether.version>0.9.0.M2</aether.version>
@@ -1572,7 +1572,7 @@
<consoleOutput>true</consoleOutput>
<failsOnError>true</failsOnError>
<excludes>
- *com/fasterxml/jackson/databind/*
+
*com/fasterxml/jackson/databind/*,**/NestedDataFormatsTest.java
</excludes>
</configuration>
<dependencies>
diff --git
a/services/src/test/java/org/apache/druid/testing/embedded/EmbeddedClusterApis.java
b/services/src/test/java/org/apache/druid/testing/embedded/EmbeddedClusterApis.java
index d93ae182f4b..0a1a1051f2e 100644
---
a/services/src/test/java/org/apache/druid/testing/embedded/EmbeddedClusterApis.java
+++
b/services/src/test/java/org/apache/druid/testing/embedded/EmbeddedClusterApis.java
@@ -41,7 +41,6 @@ import org.apache.druid.java.util.common.guava.Comparators;
import org.apache.druid.query.DruidMetrics;
import org.apache.druid.query.http.ClientSqlQuery;
import org.apache.druid.rpc.indexing.OverlordClient;
-import org.apache.druid.segment.TestDataSource;
import org.apache.druid.segment.TestHelper;
import org.apache.druid.server.metrics.LatchableEmitter;
import org.apache.druid.sql.http.ResultFormat;
@@ -421,11 +420,11 @@ public class EmbeddedClusterApis implements
EmbeddedResource
// STATIC UTILITY METHODS
/**
- * Creates a random datasource name prefixed with {@link
TestDataSource#WIKI}.
+ * Creates a random datasource name prefixed with {@code datasource_}.
*/
public static String createTestDatasourceName()
{
- return TestDataSource.WIKI + "_" + IdUtils.getRandomId();
+ return "datasource_" + IdUtils.getRandomId();
}
/**
diff --git
a/services/src/test/java/org/apache/druid/testing/embedded/indexing/Resources.java
b/services/src/test/java/org/apache/druid/testing/embedded/indexing/Resources.java
index 1e707aa8835..9070a90ac9d 100644
---
a/services/src/test/java/org/apache/druid/testing/embedded/indexing/Resources.java
+++
b/services/src/test/java/org/apache/druid/testing/embedded/indexing/Resources.java
@@ -131,6 +131,23 @@ public class Resources
throw new RuntimeException(e);
}
}
+
+ /**
+ * Creates an {@code HttpInputSource} for a single gzipped JSON file,
+ * {@code kttm-nested-v2-2019-08-25.json.gz}, served from
+ * {@code https://static.imply.io/example-data/kttm-nested-v2/}.
+ * <p>
+ * The four {@code null} constructor arguments and the
+ * {@code HttpInputSourceConfig(null, null)} leave all optional settings unset.
+ * NOTE(review): the meaning of each positional {@code null} is not visible
+ * here - confirm against the {@code HttpInputSource} constructor.
+ *
+ * @return an input source with exactly one URI
+ * @throws RuntimeException wrapping the {@code URISyntaxException} if the
+ * URI cannot be built
+ */
+ public static HttpInputSource kttmNested1Day()
+ {
+ try {
+ return new HttpInputSource(
+ List.of(new
URIBuilder("https://static.imply.io/example-data/kttm-nested-v2/kttm-nested-v2-2019-08-25.json.gz").build()),
+ null,
+ null,
+ null,
+ null,
+ new HttpInputSourceConfig(null, null)
+ );
+ }
+ catch (URISyntaxException e) {
+ throw new RuntimeException(e);
+ }
+ }
}
/**
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]