[ https://issues.apache.org/jira/browse/DRILL-4779?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16256352#comment-16256352 ]
ASF GitHub Bot commented on DRILL-4779:
---------------------------------------
Github user paul-rogers commented on a diff in the pull request:
https://github.com/apache/drill/pull/1027#discussion_r151587030
--- Diff: contrib/storage-kafka/src/main/java/org/apache/drill/exec/store/kafka/KafkaGroupScan.java ---
@@ -0,0 +1,320 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.drill.exec.store.kafka;
+
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.commons.lang3.StringUtils;
+import org.apache.drill.common.exceptions.ExecutionSetupException;
+import org.apache.drill.common.exceptions.UserException;
+import org.apache.drill.common.expression.SchemaPath;
+import org.apache.drill.exec.physical.EndpointAffinity;
+import org.apache.drill.exec.physical.base.AbstractGroupScan;
+import org.apache.drill.exec.physical.base.GroupScan;
+import org.apache.drill.exec.physical.base.PhysicalOperator;
+import org.apache.drill.exec.physical.base.ScanStats;
+import org.apache.drill.exec.physical.base.ScanStats.GroupScanProperty;
+import org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint;
+import org.apache.drill.exec.store.StoragePluginRegistry;
+import org.apache.drill.exec.store.kafka.KafkaSubScan.KafkaSubScanSpec;
+import org.apache.drill.exec.store.schedule.AffinityCreator;
+import org.apache.drill.exec.store.schedule.AssignmentCreator;
+import org.apache.drill.exec.store.schedule.CompleteWork;
+import org.apache.drill.exec.store.schedule.EndpointByteMap;
+import org.apache.drill.exec.store.schedule.EndpointByteMapImpl;
+import org.apache.kafka.clients.consumer.KafkaConsumer;
+import org.apache.kafka.common.Node;
+import org.apache.kafka.common.PartitionInfo;
+import org.apache.kafka.common.TopicPartition;
+import org.apache.kafka.common.serialization.ByteArrayDeserializer;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.fasterxml.jackson.annotation.JacksonInject;
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import com.google.common.base.Preconditions;
+import com.google.common.collect.ListMultimap;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
+
+@JsonTypeName("kafka-scan")
+public class KafkaGroupScan extends AbstractGroupScan {
+
+ private static final Logger logger = LoggerFactory.getLogger(KafkaGroupScan.class);
+
+ // Assuming default average topic message size as 1KB, which will be used to
+ // compute the stats and work assignments
+ private static final long MSG_SIZE = 1024;
+
+ private final KafkaStoragePlugin kafkaStoragePlugin;
+ private final KafkaStoragePluginConfig kafkaStoragePluginConfig;
+ private List<SchemaPath> columns;
+ private final KafkaScanSpec kafkaScanSpec;
+
+ private List<PartitionScanWork> partitionWorkList;
+ private ListMultimap<Integer, PartitionScanWork> assignments;
+ private List<EndpointAffinity> affinities;
+
+ @JsonCreator
+ public KafkaGroupScan(@JsonProperty("userName") String userName,
+ @JsonProperty("kafkaStoragePluginConfig") KafkaStoragePluginConfig
kafkaStoragePluginConfig,
+ @JsonProperty("columns") List<SchemaPath> columns,
@JsonProperty("scanSpec") KafkaScanSpec scanSpec,
+ @JacksonInject StoragePluginRegistry pluginRegistry) {
+ this(userName, kafkaStoragePluginConfig, columns, scanSpec,
(KafkaStoragePlugin) pluginRegistry);
+ }
+
+ public KafkaGroupScan(KafkaStoragePlugin kafkaStoragePlugin, KafkaScanSpec kafkaScanSpec, List<SchemaPath> columns) {
+ super(StringUtils.EMPTY);
+ this.kafkaStoragePlugin = kafkaStoragePlugin;
+ this.kafkaStoragePluginConfig = (KafkaStoragePluginConfig) kafkaStoragePlugin.getConfig();
+ this.columns = columns;
+ this.kafkaScanSpec = kafkaScanSpec;
+ init();
+ }
+
+ public KafkaGroupScan(String userName, KafkaStoragePluginConfig kafkaStoragePluginConfig, List<SchemaPath> columns,
+ KafkaScanSpec kafkaScanSpec, KafkaStoragePlugin pluginRegistry) {
+ super(userName);
+ this.kafkaStoragePluginConfig = kafkaStoragePluginConfig;
+ this.columns = columns;
+ this.kafkaScanSpec = kafkaScanSpec;
+ this.kafkaStoragePlugin = pluginRegistry;
+ init();
+ }
+
+ public KafkaGroupScan(KafkaGroupScan that) {
+ super(that);
+ this.kafkaStoragePluginConfig = that.kafkaStoragePluginConfig;
+ this.columns = that.columns;
+ this.kafkaScanSpec = that.kafkaScanSpec;
+ this.kafkaStoragePlugin = that.kafkaStoragePlugin;
+ this.partitionWorkList = that.partitionWorkList;
+ this.assignments = that.assignments;
+ }
+
+ private static class PartitionScanWork implements CompleteWork {
+
+ private final EndpointByteMapImpl byteMap = new EndpointByteMapImpl();
+
+ private final TopicPartition topicPartition;
+ private final long beginOffset;
+ private final long latestOffset;
+
+ public PartitionScanWork(TopicPartition topicPartition, long beginOffset, long latestOffset) {
+ this.topicPartition = topicPartition;
+ this.beginOffset = beginOffset;
+ this.latestOffset = latestOffset;
+ }
+
+ public TopicPartition getTopicPartition() {
+ return topicPartition;
+ }
+
+ public long getBeginOffset() {
+ return beginOffset;
+ }
+
+ public long getLatestOffset() {
+ return latestOffset;
+ }
+
+ @Override
+ public int compareTo(CompleteWork o) {
+ return Long.compare(getTotalBytes(), o.getTotalBytes());
+ }
+
+ @Override
+ public long getTotalBytes() {
+ return (latestOffset - beginOffset) * MSG_SIZE;
+ }
+
+ @Override
+ public EndpointByteMap getByteMap() {
+ return byteMap;
+ }
+
+ }
+
+ /**
+ * Computes work per topic partition, based on start and end offset of each
+ * corresponding topicPartition
+ */
+ private void init() {
+ partitionWorkList = Lists.newArrayList();
+ Collection<DrillbitEndpoint> endpoints = kafkaStoragePlugin.getContext().getBits();
+ Map<String, DrillbitEndpoint> endpointMap = Maps.newHashMap();
+ for (DrillbitEndpoint endpoint : endpoints) {
+ endpointMap.put(endpoint.getAddress(), endpoint);
+ }
+
+ Map<TopicPartition, Long> startOffsetsMap = Maps.newHashMap();
+ Map<TopicPartition, Long> endOffsetsMap = Maps.newHashMap();
+ List<PartitionInfo> topicPartitions = null;
+ String topicName = kafkaScanSpec.getTopicName();
+
+ try (KafkaConsumer<?, ?> kafkaConsumer = new KafkaConsumer<>(kafkaStoragePlugin.getConfig().getKafkaConsumerProps(),
+ new ByteArrayDeserializer(), new ByteArrayDeserializer())) {
+ if (!kafkaConsumer.listTopics().keySet().contains(topicName)) {
+ throw UserException.permissionError()
+ .message("Table '%s' does not exist", topicName)
+ .build(logger);
+ }
+
+ kafkaConsumer.subscribe(Arrays.asList(topicName));
+ // based on the KafkaConsumer JavaDoc, the seekToBeginning/seekToEnd functions
+ // evaluate lazily, seeking to the first/last offset in all partitions only
+ // when poll(long) or position(TopicPartition) is called
+ kafkaConsumer.poll(0);
+ Set<TopicPartition> assignments = kafkaConsumer.assignment();
+ topicPartitions = kafkaConsumer.partitionsFor(topicName);
+
+ // fetch start offsets for each topicPartition
+ kafkaConsumer.seekToBeginning(assignments);
+ for (TopicPartition topicPartition : assignments) {
+ startOffsetsMap.put(topicPartition, kafkaConsumer.position(topicPartition));
+ }
+
+ // fetch end offsets for each topicPartition
+ kafkaConsumer.seekToEnd(assignments);
+ for (TopicPartition topicPartition : assignments) {
+ endOffsetsMap.put(topicPartition, kafkaConsumer.position(topicPartition));
+ }
+ } catch (Exception e) {
+ logger.error(e.getMessage(), e);
+ throw UserException.dataReadError(e).message("Failed to fetch start/end offsets of the topic %s", topicName)
+ .addContext(e.getMessage()).build(logger);
+ }
+
+ // computes work for each end point
+ for (PartitionInfo partitionInfo : topicPartitions) {
+ TopicPartition topicPartition = new TopicPartition(topicName, partitionInfo.partition());
+ long lastCommittedOffset = startOffsetsMap.get(topicPartition);
+ long latestOffset = endOffsetsMap.get(topicPartition);
+ logger.debug("Latest offset of {} is {}", topicPartition,
latestOffset);
+ logger.debug("Last committed offset of {} is {}", topicPartition,
lastCommittedOffset);
+ PartitionScanWork work = new PartitionScanWork(topicPartition,
lastCommittedOffset, latestOffset);
+ Node[] inSyncReplicas = partitionInfo.inSyncReplicas();
+ for (Node isr : inSyncReplicas) {
+ String host = isr.host();
+ DrillbitEndpoint ep = endpointMap.get(host);
+ if (ep != null) {
+ work.getByteMap().add(ep, work.getTotalBytes());
+ }
+ }
+ partitionWorkList.add(work);
+ }
+ }
+
+ @Override
+ public void applyAssignments(List<DrillbitEndpoint> incomingEndpoints) {
+ assignments = AssignmentCreator.getMappings(incomingEndpoints, partitionWorkList);
+ }
+
+ @Override
+ public KafkaSubScan getSpecificScan(int minorFragmentId) {
+ List<PartitionScanWork> workList = assignments.get(minorFragmentId);
+ List<KafkaSubScanSpec> scanSpecList = Lists.newArrayList();
+
+ for (PartitionScanWork work : workList) {
+ scanSpecList.add(new KafkaSubScanSpec(work.getTopicPartition().topic(), work.getTopicPartition().partition(),
+ work.getBeginOffset(), work.getLatestOffset()));
+ }
+
+ return new KafkaSubScan(getUserName(), kafkaStoragePlugin, kafkaStoragePluginConfig, columns, scanSpecList);
+ }
+
+ @Override
+ public int getMaxParallelizationWidth() {
+ return partitionWorkList.size();
+ }
+
+ @Override
+ public ScanStats getScanStats() {
+ long messageCount = 0;
+ for (PartitionScanWork work : partitionWorkList) {
+ messageCount += (work.getLatestOffset() - work.getBeginOffset());
+ }
+ return new ScanStats(GroupScanProperty.EXACT_ROW_COUNT, messageCount, 1, messageCount * MSG_SIZE);
+ }
+
+ @Override
+ public String getDigest() {
+ return toString();
+ }
+
+ @Override
+ public PhysicalOperator getNewWithChildren(List<PhysicalOperator> children) throws ExecutionSetupException {
+ Preconditions.checkArgument(children.isEmpty());
+ return new KafkaGroupScan(this);
+ }
+
+ @Override
+ public List<EndpointAffinity> getOperatorAffinity() {
+ if (affinities == null) {
+ affinities = AffinityCreator.getAffinityMap(partitionWorkList);
+ }
+ return affinities;
+ }
+
+ @Override
+ @JsonIgnore
+ public boolean canPushdownProjects(List<SchemaPath> columns) {
+ return true;
--- End diff --
By setting this to `true`, your reader must handle projection: read only the
requested columns and fill in null columns for any that are missing.
Has this been tested to ensure that the JSON reader being used here does
the needed work?
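
For reference, "handling projection" means the reader honors the projected column
list: a requested column present in the message is copied, a requested column that
is missing is filled with NULL, and unrequested columns are skipped. Below is a
minimal sketch of that contract, using made-up names (`RowWriter`, `writeRow`)
rather than the classes in this patch:

```java
import java.util.List;
import java.util.Map;

public class ProjectionContractSketch {

  /** Minimal stand-in for a per-column output writer; Drill's vector writers differ. */
  interface RowWriter {
    void write(String column, Object value);
    void writeNull(String column);
  }

  /**
   * Emits one output row for a decoded message: requested-and-present columns
   * are copied, requested-but-missing columns become NULL, all other columns
   * in the message are skipped.
   */
  static void writeRow(Map<String, Object> decodedMessage,
                       List<String> projectedColumns,
                       RowWriter out) {
    for (String column : projectedColumns) {
      if (decodedMessage.containsKey(column)) {
        out.write(column, decodedMessage.get(column));
      } else {
        out.writeNull(column);
      }
    }
  }
}
```

Whatever JSON reader the plugin delegates to must provide equivalent behavior for
this override to be safe.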
> Kafka storage plugin support
> ----------------------------
>
> Key: DRILL-4779
> URL: https://issues.apache.org/jira/browse/DRILL-4779
> Project: Apache Drill
> Issue Type: New Feature
> Components: Storage - Other
> Affects Versions: 1.11.0
> Reporter: B Anil Kumar
> Assignee: B Anil Kumar
> Labels: doc-impacting
> Fix For: 1.12.0
>
>
> Implementing a Kafka storage plugin will enable strong SQL support for Kafka.
> Initially, the implementation can target JSON and Avro message types.
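
For context, the kind of configuration this feature is meant to enable might look
roughly like the following. The plugin type name (`kafka`), the property values, and
the topic are illustrative assumptions based on the `kafkaConsumerProps` field
referenced in the patch, not text taken from this issue:

```json
{
  "type": "kafka",
  "kafkaConsumerProps": {
    "bootstrap.servers": "localhost:9092",
    "group.id": "drill-consumer",
    "key.deserializer": "org.apache.kafka.common.serialization.ByteArrayDeserializer",
    "value.deserializer": "org.apache.kafka.common.serialization.ByteArrayDeserializer"
  },
  "enabled": true
}
```

A query would then address a topic as a table under the plugin's schema, for
example selecting from kafka.`some_topic`.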