[ https://issues.apache.org/jira/browse/DRILL-5956?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17141649#comment-17141649 ]
ASF GitHub Bot commented on DRILL-5956: --------------------------------------- cgivre commented on a change in pull request #1888: URL: https://github.com/apache/drill/pull/1888#discussion_r443290498 ########## File path: contrib/storage-druid/src/test/java/org/apache/drill/exec/store/druid/rest/DruidQueryClientTest.java ########## @@ -0,0 +1,112 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.drill.exec.store.druid.rest; + +import org.apache.drill.exec.store.druid.druid.DruidSelectResponse; +import org.apache.http.HttpStatus; +import org.apache.http.HttpResponse; +import org.apache.http.StatusLine; +import org.apache.http.HttpEntity; +import org.apache.http.Header; +import org.apache.http.HttpHeaders; +import org.apache.http.message.BasicHeader; +import org.junit.Before; +import org.junit.Test; +import org.mockito.Mock; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.StandardCharsets; + +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +import static org.assertj.core.api.Assertions.assertThat; + +public class DruidQueryClientTest { + + @Mock + private RestClient restClient; + + @Mock + private HttpResponse httpResponse; + + @Mock + private StatusLine statusLine; + + @Mock + private HttpEntity httpEntity; + + private DruidQueryClient druidQueryClient; + private static String BROKER_URI = "some broker uri"; + private static String QUERY = "some query"; + private static Header ENCODING_HEADER = Review comment: Nit: These three variables can be `final`. ########## File path: contrib/storage-druid/src/test/java/org/apache/drill/exec/store/druid/DruidStoragePluginConfigTest.java ########## @@ -31,7 +32,7 @@ public class DruidStoragePluginConfigTest { @Test - public void testDruidStoragePluginConfigSuccessfullyParsed() + public void test_druid_storage_plugin_config_successfully_parsed() Review comment: @akkapur It is a Drill standard to use camelCase for class and variable names. Please revert back to original name. ########## File path: contrib/storage-druid/src/main/java/org/apache/drill/exec/store/druid/DruidRecordReader.java ########## @@ -0,0 +1,193 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.drill.exec.store.druid; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.node.ObjectNode; +import org.apache.drill.common.exceptions.DrillRuntimeException; +import org.apache.drill.common.expression.SchemaPath; +import org.apache.drill.exec.ops.FragmentContext; +import org.apache.drill.exec.ops.OperatorContext; +import org.apache.drill.exec.physical.impl.OutputMutator; +import org.apache.drill.exec.store.AbstractRecordReader; +import org.apache.drill.exec.store.druid.common.DruidFilter; +import org.apache.drill.exec.store.druid.druid.DruidSelectResponse; +import org.apache.drill.exec.store.druid.druid.PagingIdentifier; +import org.apache.drill.exec.store.druid.druid.PagingSpec; +import org.apache.drill.exec.store.druid.druid.SelectQuery; +import org.apache.drill.exec.store.druid.druid.SelectQueryBuilder; +import org.apache.drill.exec.store.druid.rest.DruidQueryClient; +import org.apache.drill.exec.vector.BaseValueVector; +import org.apache.drill.exec.vector.complex.fn.JsonReader; +import org.apache.drill.exec.vector.complex.impl.VectorContainerWriter; +import org.apache.drill.shaded.guava.com.google.common.base.Stopwatch; +import 
org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList; +import org.apache.drill.shaded.guava.com.google.common.collect.Sets; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.TimeUnit; +import java.util.stream.Collectors; + +public class DruidRecordReader extends AbstractRecordReader { + + private static final Logger logger = LoggerFactory.getLogger(DruidRecordReader.class); + private static final ObjectMapper objectMapper = new ObjectMapper(); + private final DruidStoragePlugin plugin; + private final DruidSubScan.DruidSubScanSpec scanSpec; + private final List<String> dimensions; + private final DruidFilter filter; + private ArrayList<PagingIdentifier> pagingIdentifiers = new ArrayList<>(); + private int maxRecordsToRead = -1; + + private JsonReader jsonReader; + private VectorContainerWriter writer; + + private final FragmentContext fragmentContext; + + public DruidRecordReader(DruidSubScan.DruidSubScanSpec subScanSpec, + List<SchemaPath> projectedColumns, + int maxRecordsToRead, + FragmentContext context, + DruidStoragePlugin plugin) { + dimensions = new ArrayList<>(); + setColumns(projectedColumns); + this.maxRecordsToRead = maxRecordsToRead; + this.plugin = plugin; + scanSpec = subScanSpec; + fragmentContext = context; + this.filter = subScanSpec.getFilter(); + } + + @Override + protected Collection<SchemaPath> transformColumns(Collection<SchemaPath> projectedColumns) { + Set<SchemaPath> transformed = Sets.newLinkedHashSet(); + if (isStarQuery()) { + transformed.add(SchemaPath.STAR_COLUMN); + } else { + for (SchemaPath column : projectedColumns) { + String fieldName = column.getRootSegment().getPath(); + transformed.add(column); + this.dimensions.add(fieldName); + } + } + return transformed; + } + + @Override + public void setup(OperatorContext 
context, OutputMutator output) { + this.writer = new VectorContainerWriter(output); + + this.jsonReader = + new JsonReader.Builder(fragmentContext.getManagedBuffer()) + .schemaPathColumns(ImmutableList.copyOf(getColumns())) + .skipOuterList(true) + .build(); + } + + @Override + public int next() { + writer.allocate(); + writer.reset(); + DruidQueryClient druidQueryClient = plugin.getDruidQueryClient(); + Stopwatch watch = Stopwatch.createStarted(); + try { + DruidSelectResponse druidSelectResponse = druidQueryClient.executeQuery(getQuery()); + setNextPagingIdentifiers(druidSelectResponse); + + int docCount = 0; + for (ObjectNode eventNode : druidSelectResponse.getEvents()) { + writer.setPosition(docCount); + jsonReader.setSource(eventNode); + try { + jsonReader.write(writer); + } catch (IOException e) { + String msg = "Failure while reading document. - Parser was at record: " + eventNode.toString(); + logger.error(msg, e); + throw new DrillRuntimeException(msg, e); + } + docCount++; + } + + writer.setValueCount(docCount); + logger.debug("Took {} ms to get {} records", watch.elapsed(TimeUnit.MILLISECONDS), docCount); + return docCount; + } catch (Exception e) { + String msg = "Failure while reading documents"; + logger.error(msg, e); + throw new DrillRuntimeException(msg, e); Review comment: Here and above, Can you please throw a `UserException` rather than a `DrillRuntimeException`? Something like this: ``` throw UserException .dataReadError(e) .message(msg) .build(logger); ``` If you can access the `errorContext` it is also helpful (but not necessary) to add that as well via the `addContext()` method. That way the user gets helpful error messages if something goes wrong. ########## File path: contrib/storage-druid/src/main/java/org/apache/drill/exec/store/druid/DruidRecordReader.java ########## @@ -0,0 +1,193 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.drill.exec.store.druid; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.node.ObjectNode; +import org.apache.drill.common.exceptions.DrillRuntimeException; +import org.apache.drill.common.expression.SchemaPath; +import org.apache.drill.exec.ops.FragmentContext; +import org.apache.drill.exec.ops.OperatorContext; +import org.apache.drill.exec.physical.impl.OutputMutator; +import org.apache.drill.exec.store.AbstractRecordReader; +import org.apache.drill.exec.store.druid.common.DruidFilter; +import org.apache.drill.exec.store.druid.druid.DruidSelectResponse; +import org.apache.drill.exec.store.druid.druid.PagingIdentifier; +import org.apache.drill.exec.store.druid.druid.PagingSpec; +import org.apache.drill.exec.store.druid.druid.SelectQuery; +import org.apache.drill.exec.store.druid.druid.SelectQueryBuilder; +import org.apache.drill.exec.store.druid.rest.DruidQueryClient; +import org.apache.drill.exec.vector.BaseValueVector; +import org.apache.drill.exec.vector.complex.fn.JsonReader; +import org.apache.drill.exec.vector.complex.impl.VectorContainerWriter; +import org.apache.drill.shaded.guava.com.google.common.base.Stopwatch; +import 
org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList; +import org.apache.drill.shaded.guava.com.google.common.collect.Sets; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.TimeUnit; +import java.util.stream.Collectors; + +public class DruidRecordReader extends AbstractRecordReader { + + private static final Logger logger = LoggerFactory.getLogger(DruidRecordReader.class); + private static final ObjectMapper objectMapper = new ObjectMapper(); + private final DruidStoragePlugin plugin; + private final DruidSubScan.DruidSubScanSpec scanSpec; + private final List<String> dimensions; + private final DruidFilter filter; + private ArrayList<PagingIdentifier> pagingIdentifiers = new ArrayList<>(); + private int maxRecordsToRead = -1; + + private JsonReader jsonReader; + private VectorContainerWriter writer; + + private final FragmentContext fragmentContext; + + public DruidRecordReader(DruidSubScan.DruidSubScanSpec subScanSpec, + List<SchemaPath> projectedColumns, + int maxRecordsToRead, + FragmentContext context, + DruidStoragePlugin plugin) { + dimensions = new ArrayList<>(); + setColumns(projectedColumns); + this.maxRecordsToRead = maxRecordsToRead; + this.plugin = plugin; + scanSpec = subScanSpec; + fragmentContext = context; + this.filter = subScanSpec.getFilter(); + } + + @Override + protected Collection<SchemaPath> transformColumns(Collection<SchemaPath> projectedColumns) { + Set<SchemaPath> transformed = Sets.newLinkedHashSet(); + if (isStarQuery()) { + transformed.add(SchemaPath.STAR_COLUMN); + } else { + for (SchemaPath column : projectedColumns) { + String fieldName = column.getRootSegment().getPath(); + transformed.add(column); + this.dimensions.add(fieldName); + } + } + return transformed; + } + + @Override + public void setup(OperatorContext 
context, OutputMutator output) { + this.writer = new VectorContainerWriter(output); + + this.jsonReader = + new JsonReader.Builder(fragmentContext.getManagedBuffer()) + .schemaPathColumns(ImmutableList.copyOf(getColumns())) + .skipOuterList(true) + .build(); + } + + @Override + public int next() { + writer.allocate(); + writer.reset(); + DruidQueryClient druidQueryClient = plugin.getDruidQueryClient(); Review comment: Is it necessary to do this in every batch? I'd recommend moving this to the `setup()` function so that this is only called once. ########## File path: contrib/storage-druid/src/main/java/org/apache/drill/exec/store/druid/DruidGroupScan.java ########## @@ -0,0 +1,249 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.drill.exec.store.druid; + +import com.fasterxml.jackson.annotation.JacksonInject; +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; +import com.fasterxml.jackson.annotation.JsonTypeName; +import com.fasterxml.jackson.annotation.JsonIgnore; + +import org.apache.drill.common.PlanStringBuilder; +import org.apache.drill.common.expression.SchemaPath; +import org.apache.drill.exec.physical.EndpointAffinity; +import org.apache.drill.exec.physical.base.AbstractGroupScan; +import org.apache.drill.exec.physical.base.GroupScan; +import org.apache.drill.exec.physical.base.PhysicalOperator; +import org.apache.drill.exec.physical.base.ScanStats; +import org.apache.drill.exec.proto.CoordinationProtos; +import org.apache.drill.exec.store.StoragePluginRegistry; + +import org.apache.drill.exec.store.schedule.AffinityCreator; +import org.apache.drill.exec.store.schedule.AssignmentCreator; +import org.apache.drill.exec.store.schedule.CompleteWork; +import org.apache.drill.exec.store.schedule.EndpointByteMap; +import org.apache.drill.exec.store.schedule.EndpointByteMapImpl; +import org.apache.drill.shaded.guava.com.google.common.base.Preconditions; +import org.apache.drill.shaded.guava.com.google.common.collect.ListMultimap; +import org.apache.drill.shaded.guava.com.google.common.collect.Lists; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.ArrayList; +import java.util.List; + +@JsonTypeName("druid-scan") +public class DruidGroupScan extends AbstractGroupScan { + + private static final Logger logger = LoggerFactory.getLogger(DruidGroupScan.class); + private static final long DEFAULT_TABLET_SIZE = 1000; + private final DruidScanSpec scanSpec; + private final DruidStoragePlugin storagePlugin; + + private List<SchemaPath> columns; + private boolean filterPushedDown = false; + private List<DruidWork> druidWorkList = new ArrayList<>(); + private ListMultimap<Integer,DruidWork> 
assignments; + private List<EndpointAffinity> affinities; + + @JsonCreator + public DruidGroupScan(@JsonProperty("userName") String userName, + @JsonProperty("scanSpec") DruidScanSpec scanSpec, + @JsonProperty("storagePluginConfig") DruidStoragePluginConfig storagePluginConfig, + @JsonProperty("columns") List<SchemaPath> columns, + @JacksonInject StoragePluginRegistry pluginRegistry) { + this(userName, + pluginRegistry.resolve(storagePluginConfig, DruidStoragePlugin.class), + scanSpec, + columns); + } + + public DruidGroupScan(String userName, DruidStoragePlugin storagePlugin, DruidScanSpec scanSpec, + List<SchemaPath> columns) { + super(userName); + this.storagePlugin = storagePlugin; + this.scanSpec = scanSpec; + this.columns = columns == null || columns.size() == 0? ALL_COLUMNS : columns; + init(); + } + + /** + * Private constructor, used for cloning. + * @param that The DruidGroupScan to clone + */ + private DruidGroupScan(DruidGroupScan that) { + super(that); + this.columns = that.columns; + this.scanSpec = that.scanSpec; + this.storagePlugin = that.storagePlugin; + this.filterPushedDown = that.filterPushedDown; + this.druidWorkList = that.druidWorkList; + this.assignments = that.assignments; + } + + @Override + public GroupScan clone(List<SchemaPath> columns) { + DruidGroupScan newScan = new DruidGroupScan(this); + newScan.columns = columns; + return newScan; + } + + @Override + public List<EndpointAffinity> getOperatorAffinity() { + if (affinities == null) { + affinities = AffinityCreator.getAffinityMap(druidWorkList); + } + return affinities; + } + + @Override + public boolean canPushdownProjects(List<SchemaPath> columns) { + return true; + } + + @JsonIgnore + public boolean isFilterPushedDown() { + return filterPushedDown; + } + + @JsonIgnore + public void setFilterPushedDown(boolean filterPushedDown) { + this.filterPushedDown = filterPushedDown; + } + + private void init() { + logger.debug("Adding Druid Work for Table - {}. 
Filter - {}", getTableName(), getScanSpec().getFilter()); + + DruidWork druidWork = + new DruidWork( + new DruidSubScan.DruidSubScanSpec( + getTableName(), + getScanSpec().getFilter() + ) + ); + druidWorkList.add(druidWork); + } + + private static class DruidWork implements CompleteWork { + private final EndpointByteMapImpl byteMap = new EndpointByteMapImpl(); + private final DruidSubScan.DruidSubScanSpec druidSubScanSpec; + + public DruidWork(DruidSubScan.DruidSubScanSpec druidSubScanSpec) { + this.druidSubScanSpec = druidSubScanSpec; + } + + public DruidSubScan.DruidSubScanSpec getDruidSubScanSpec() { + return druidSubScanSpec; + } + + @Override + public long getTotalBytes() { + return DEFAULT_TABLET_SIZE; + } + + @Override + public EndpointByteMap getByteMap() { + return byteMap; + } + + @Override + public int compareTo(CompleteWork o) { + return 0; + } + } + + //TODO - MAY GET MORE PRECISE COUNT FROM DRUID ITSELF. + public ScanStats getScanStats() { + long recordCount = 100000 * druidWorkList.size(); + return new ScanStats( + ScanStats.GroupScanProperty.NO_EXACT_ROW_COUNT, + recordCount, + 1, + recordCount * storagePlugin.getConfig().getAverageRowSizeBytes()); + } + + @Override + public void applyAssignments(List<CoordinationProtos.DrillbitEndpoint> endpoints) { + assignments = AssignmentCreator.getMappings(endpoints, druidWorkList); + } + + @Override + public DruidSubScan getSpecificScan(int minorFragmentId) { + + List<DruidWork> workList = assignments.get(minorFragmentId); + + List<DruidSubScan.DruidSubScanSpec> scanSpecList = Lists.newArrayList(); + for (DruidWork druidWork : workList) { + scanSpecList + .add( + new DruidSubScan.DruidSubScanSpec( + druidWork.getDruidSubScanSpec().getDataSourceName(), + druidWork.getDruidSubScanSpec().getFilter() + ) + ); + } + + return new DruidSubScan(getUserName(), storagePlugin, scanSpecList, this.columns); + } + + @JsonIgnore + public String getTableName() { + return getScanSpec().getDataSourceName(); + } + + @Override + 
public int getMaxParallelizationWidth() { + return druidWorkList.size(); + } + Review comment: @akkapur One thing more... I recently figured out how to do a `LIMIT` pushdown which can be a big performance improvement. Basically you have to implement two methods here and add a field called `maxRecordsToRead`. ``` @Override public boolean supportsLimitPushdown() { return true; } @Override public GroupScan applyLimit(int maxRecords) { if (maxRecordsToRead == maxRecords) { return null; } return clone(this, maxRecords); } ``` Once these methods have been implemented, the next step is to read that variable when you issue the query to Druid and push the limit down to Druid. ########## File path: contrib/storage-druid/README.md ########## @@ -28,4 +31,28 @@ Following is the default registration configuration. ### Developer -Building - `mvn install -pl contrib/storage-druid` +* Building the plugin + + `mvn install -pl contrib/storage-druid` + +* Building DRILL Review comment: @akkapur Thanks for updating the docs. Just an FYI, once this is committed, it will be part of Drill, so they don't really need to build anything separate to get it to work. ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
For queries about this service, please contact Infrastructure at: us...@infra.apache.org > Add Storage Plugin for Apache Druid > ----------------------------------- > > Key: DRILL-5956 > URL: https://issues.apache.org/jira/browse/DRILL-5956 > Project: Apache Drill > Issue Type: Wish > Components: Storage - Other > Reporter: Jiaqi Liu > Priority: Major > Labels: Enhancement, Storage-Plugin > Fix For: 1.18.0 > > > As more and more companies are using Druid for mission-critical industrial > products, Drill could gain much more popularity with Druid as one of its > supported storage plugins so that users could easily bind a Druid cluster to > a running Drill instance -- This message was sent by Atlassian Jira (v8.3.4#803005)