cgivre commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r467667133



##########
File path: contrib/storage-ipfs/src/test/java/org/apache/drill/exec/store/ipfs/TestIPFSGroupScan.java
##########
@@ -0,0 +1,162 @@
+package org.apache.drill.exec.store.ipfs;
+
+import io.ipfs.api.IPFS;
+import io.ipfs.api.JSONParser;
+import io.ipfs.api.MerkleNode;
+import io.ipfs.multiaddr.MultiAddress;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.categories.IPFSStorageTest;
+import org.apache.drill.categories.SlowTest;
+import org.apache.drill.shaded.guava.com.google.common.cache.CacheBuilder;
+import org.apache.drill.shaded.guava.com.google.common.cache.CacheLoader;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableMap;
+import org.apache.drill.shaded.guava.com.google.common.io.Resources;
+import org.junit.Before;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+import org.mockito.Mock;
+import org.mockito.Mockito;
+
+import java.io.File;
+import java.io.IOException;
+import java.nio.file.Files;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+
+import static org.apache.drill.exec.store.ipfs.IPFSStoragePluginConfig.IPFSTimeOut.*;

Review comment:
       The star import is a checkstyle violation.
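
A sketch of the fix, assuming the test only needs the `IPFSTimeOut` members referenced elsewhere in this PR (`FETCH_DATA`, `FIND_PEER_INFO`, `FIND_PROV`); the list should match whatever the test actually uses:

```java
// Explicit static imports instead of the wildcard, to satisfy checkstyle.
import static org.apache.drill.exec.store.ipfs.IPFSStoragePluginConfig.IPFSTimeOut.FETCH_DATA;
import static org.apache.drill.exec.store.ipfs.IPFSStoragePluginConfig.IPFSTimeOut.FIND_PEER_INFO;
import static org.apache.drill.exec.store.ipfs.IPFSStoragePluginConfig.IPFSTimeOut.FIND_PROV;
```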

##########
File path: contrib/storage-ipfs/src/test/java/org/apache/drill/exec/store/ipfs/IPFSTestSuit.java
##########
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import org.apache.drill.categories.IPFSStorageTest;
+import org.apache.drill.categories.SlowTest;
+import org.apache.drill.shaded.guava.com.google.common.io.Resources;
+import org.junit.BeforeClass;
+import org.junit.experimental.categories.Category;
+import org.junit.runner.RunWith;
+import org.junit.runners.Suite;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.File;
+
+@RunWith(Suite.class)
+@Suite.SuiteClasses({TestIPFSQueries.class, TestIPFSGroupScan.class})

Review comment:
       This is missing the scan spec test.
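
A sketch of the suggested addition; `TestIPFSScanSpec` is a hypothetical name for the missing scan spec test class, which is not visible in this diff:

```java
@RunWith(Suite.class)
@Suite.SuiteClasses({
    TestIPFSQueries.class,
    TestIPFSGroupScan.class,
    TestIPFSScanSpec.class  // hypothetical class name for the scan spec test
})
public class IPFSTestSuit {
}
```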

##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSCompat.java
##########
@@ -0,0 +1,284 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import io.ipfs.api.IPFS;
+import io.ipfs.api.JSONParser;
+import io.ipfs.multihash.Multihash;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.HttpURLConnection;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.concurrent.CompletableFuture;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+import java.util.concurrent.atomic.AtomicReference;
+import java.util.function.Consumer;
+import java.util.function.Predicate;
+
+/**
+ * Compatibility fixes for java-ipfs-http-client library
+ *
+ * Supports IPFS up to version v0.4.23, due to new restrictions enforcing all API calls to be made with POST method.
+ * Upstream issue tracker: https://github.com/ipfs-shipyard/java-ipfs-http-client/issues/157
+ */

Review comment:
       It looks as if 
https://github.com/ipfs-shipyard/java-ipfs-http-client/issues/157 has been 
closed.  Can you remove this restriction?
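
If the upstream fix has indeed shipped in a released client version, the javadoc could shrink to something like this hedged sketch (whether `IPFSCompat` itself can then be retired depends on which client version the plugin pins):

```java
/**
 * Compatibility fixes for the java-ipfs-http-client library.
 *
 * Historical note: client versions before the fix for
 * https://github.com/ipfs-shipyard/java-ipfs-http-client/issues/157 only
 * worked with IPFS daemons up to v0.4.23, which still accepted GET requests.
 */
```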

##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSGroupScan.java
##########
@@ -0,0 +1,462 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+
+import com.fasterxml.jackson.annotation.JacksonInject;
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import io.ipfs.api.MerkleNode;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.PlanStringBuilder;
+import org.apache.drill.common.exceptions.ExecutionSetupException;
+import org.apache.drill.common.expression.SchemaPath;
+import org.apache.drill.common.util.DrillVersionInfo;
+import org.apache.drill.exec.coord.ClusterCoordinator;
+import org.apache.drill.exec.physical.EndpointAffinity;
+import org.apache.drill.exec.physical.base.AbstractGroupScan;
+import org.apache.drill.exec.physical.base.PhysicalOperator;
+import org.apache.drill.exec.physical.base.ScanStats;
+import org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint;
+import org.apache.drill.exec.store.StoragePluginRegistry;
+import org.apache.drill.exec.store.schedule.AffinityCreator;
+import org.apache.drill.exec.store.schedule.AssignmentCreator;
+import org.apache.drill.exec.store.schedule.CompleteWork;
+import org.apache.drill.exec.store.schedule.EndpointByteMap;
+import org.apache.drill.exec.store.schedule.EndpointByteMapImpl;
+import org.apache.drill.shaded.guava.com.google.common.base.Preconditions;
+import org.apache.drill.shaded.guava.com.google.common.base.Stopwatch;
+import org.apache.drill.shaded.guava.com.google.common.cache.LoadingCache;
+import org.apache.drill.shaded.guava.com.google.common.collect.ArrayListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
+import org.apache.drill.shaded.guava.com.google.common.collect.ListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.Lists;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Random;
+import java.util.concurrent.ForkJoinPool;
+import java.util.concurrent.RecursiveTask;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+
+
+@JsonTypeName("ipfs-scan")
+public class IPFSGroupScan extends AbstractGroupScan {
+  private static final Logger logger = LoggerFactory.getLogger(IPFSGroupScan.class);
+  private IPFSContext ipfsContext;
+  private IPFSScanSpec ipfsScanSpec;
+  private IPFSStoragePluginConfig config;
+  private List<SchemaPath> columns;
+
+  private static long DEFAULT_NODE_SIZE = 1000l;
+
+  private ListMultimap<Integer, IPFSWork> assignments;
+  private List<IPFSWork> ipfsWorkList = Lists.newArrayList();
+  private Map<String, List<IPFSWork>> endpointWorksMap;
+  private List<EndpointAffinity> affinities;
+
+  @JsonCreator
+  public IPFSGroupScan(@JsonProperty("IPFSScanSpec") IPFSScanSpec ipfsScanSpec,
+                       @JsonProperty("IPFSStoragePluginConfig") IPFSStoragePluginConfig ipfsStoragePluginConfig,
+                       @JsonProperty("columns") List<SchemaPath> columns,
+                       @JacksonInject StoragePluginRegistry pluginRegistry) throws IOException, ExecutionSetupException {
+    this(
+        ((IPFSStoragePlugin) pluginRegistry.getPlugin(ipfsStoragePluginConfig)).getIPFSContext(),
+        ipfsScanSpec,
+        columns
+    );
+  }
+
+  public IPFSGroupScan(IPFSContext ipfsContext,
+                       IPFSScanSpec ipfsScanSpec,
+                       List<SchemaPath> columns) {
+    super((String) null);
+    this.ipfsContext = ipfsContext;
+    this.ipfsScanSpec = ipfsScanSpec;
+    this.config = ipfsContext.getStoragePluginConfig();
+    logger.debug("GroupScan constructor called with columns {}", columns);
+    this.columns = columns == null || columns.size() == 0? ALL_COLUMNS : columns;
+    init();
+  }
+
+  private void init() {
+    IPFSHelper ipfsHelper = ipfsContext.getIPFSHelper();
+    endpointWorksMap = new HashMap<>();
+
+    Multihash topHash = ipfsScanSpec.getTargetHash(ipfsHelper);
+    try {
+      Map<Multihash, String> leafAddrMap = getLeafAddrMappings(topHash);
+      logger.debug("Iterating on {} leaves...", leafAddrMap.size());
+      ClusterCoordinator coordinator = ipfsContext.getStoragePlugin().getContext().getClusterCoordinator();
+      for (Multihash leaf : leafAddrMap.keySet()) {
+        String peerHostname = leafAddrMap.get(leaf);
+
+        Optional<DrillbitEndpoint> oep = coordinator.getAvailableEndpoints()
+            .stream()
+            .filter(a -> a.getAddress().equals(peerHostname))
+            .findAny();
+        DrillbitEndpoint ep;
+        if (oep.isPresent()) {
+          ep = oep.get();
+          logger.debug("Using existing endpoint {}", ep.getAddress());
+        } else {
+          logger.debug("created new endpoint on the fly {}", peerHostname);
+          //DRILL-7754: read ports & version info from IPFS instead of hard-coded
+          ep = DrillbitEndpoint.newBuilder()
+              .setAddress(peerHostname)
+              .setUserPort(31010)
+              .setControlPort(31011)
+              .setDataPort(31012)
+              .setHttpPort(8047)
+              .setVersion(DrillVersionInfo.getVersion())
+              .setState(DrillbitEndpoint.State.ONLINE)
+              .build();
+          //TODO how to safely remove endpoints that are no longer needed once the query is completed?
+          ClusterCoordinator.RegistrationHandle handle = coordinator.register(ep);
+        }
+
+        IPFSWork work = new IPFSWork(leaf.toBase58());
+        logger.debug("added endpoint {} to work {}", ep.getAddress(), work);
+        work.getByteMap().add(ep, DEFAULT_NODE_SIZE);
+        work.setOnEndpoint(ep);
+
+        if(endpointWorksMap.containsKey(ep.getAddress())) {
+          endpointWorksMap.get(ep.getAddress()).add(work);
+        } else {
+          List<IPFSWork> ipfsWorks = Lists.newArrayList();
+          ipfsWorks.add(work);
+          endpointWorksMap.put(ep.getAddress(), ipfsWorks);
+        }
+        ipfsWorkList.add(work);
+      }
+    }catch (Exception e) {
+      logger.debug("exception in init");
+      throw new RuntimeException(e);
+    }
+  }
+
+  Map<Multihash, String> getLeafAddrMappings(Multihash topHash) {
+    logger.debug("start to recursively expand nested IPFS hashes, topHash={}", topHash);
+    Stopwatch watch = Stopwatch.createStarted();
+    ForkJoinPool forkJoinPool = new ForkJoinPool(config.getNumWorkerThreads());
+    IPFSTreeFlattener topTask = new IPFSTreeFlattener(topHash, false, ipfsContext);
+    Map<Multihash, String> leafAddrMap = forkJoinPool.invoke(topTask);
+    logger.debug("Took {} ms to expand hash leaves", watch.elapsed(TimeUnit.MILLISECONDS));
+
+    return leafAddrMap;
+  }
+
+  private IPFSGroupScan(IPFSGroupScan that) {
+    super(that);
+    this.ipfsContext = that.ipfsContext;
+    this.ipfsScanSpec = that.ipfsScanSpec;
+    this.config = that.config;
+    this.assignments = that.assignments;
+    this.ipfsWorkList = that.ipfsWorkList;
+    this.endpointWorksMap = that.endpointWorksMap;
+    this.columns = that.columns;
+  }
+
+  @JsonProperty
+  public List<SchemaPath> getColumns() {
+    return columns;
+  }
+
+  @JsonIgnore
+  public IPFSStoragePlugin getStoragePlugin() {
+    return ipfsContext.getStoragePlugin();
+  }
+
+  @JsonProperty
+  public IPFSScanSpec getIPFSScanSpec() {
+    return ipfsScanSpec;
+  }
+
+  @Override
+  public List<EndpointAffinity> getOperatorAffinity() {
+    if (affinities == null) {
+      affinities = AffinityCreator.getAffinityMap(ipfsWorkList);
+    }
+    return affinities;
+  }
+
+  @Override
+  public int getMaxParallelizationWidth() {
+    DrillbitEndpoint myself = ipfsContext.getStoragePlugin().getContext().getEndpoint();
+    int width;
+    if (endpointWorksMap.containsKey(myself.getAddress())) {
+      // the foreman is also going to be a minor fragment worker under a UnionExchange operator
+      width = ipfsWorkList.size();
+    } else {
+      // the foreman does not hold data, so we have to force parallelization
+      // to make sure there is a UnionExchange operator
+      width = ipfsWorkList.size() + 1;
+    }
+    logger.debug("getMaxParallelizationWidth: {}", width);
+    return width;
+  }
+
+  @Override
+  public void applyAssignments(List<DrillbitEndpoint> incomingEndpoints) {
+    logger.debug("ipfsWorkList.size() = {}", ipfsWorkList.size());
+    logger.debug("endpointWorksMap: {}", endpointWorksMap);
+    if (endpointWorksMap.size() > 1) {
+      logger.debug("Use manual assignment");
+      assignments = ArrayListMultimap.create();
+      for (int fragmentId = 0; fragmentId < incomingEndpoints.size(); fragmentId++) {
+        String address = incomingEndpoints.get(fragmentId).getAddress();
+        if (endpointWorksMap.containsKey(address)) {
+          for (IPFSWork work : endpointWorksMap.get(address)) {
+            assignments.put(fragmentId, work);
+          }
+        }
+      }
+    } else {
+      logger.debug("Use AssignmentCreator");
+      assignments = AssignmentCreator.getMappings(incomingEndpoints, ipfsWorkList);
+    }
+  }
+
+  @Override
+  public IPFSSubScan getSpecificScan(int minorFragmentId) {
+    logger.debug(String.format("getSpecificScan: minorFragmentId = %d", minorFragmentId));
+    List<IPFSWork> workList = assignments.get(minorFragmentId);
+    logger.debug("workList == null: " + (workList == null? "true": "false"));
+    logger.debug(String.format("workList.size(): %d", workList.size()));

Review comment:
       Here, if `workList` is null, `workList.size()` will throw an NPE.
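
A minimal sketch of a guard that keeps the sub-scan construction below safe. (One nuance worth checking: if `assignments` is a Guava `ListMultimap`, its `get()` returns an empty list rather than null, so this mainly guards against `applyAssignments()` not having been called.)

```java
List<IPFSWork> workList = assignments.get(minorFragmentId);
if (workList == null) {
  // Defensive fallback: treat a missing assignment as an empty work list.
  workList = ImmutableList.of();
}
// SLF4J placeholders avoid the eager String.format of the original lines.
logger.debug("getSpecificScan: minorFragmentId = {}, workList.size() = {}",
    minorFragmentId, workList.size());
```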

##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSGroupScan.java
##########
@@ -0,0 +1,462 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+
+import com.fasterxml.jackson.annotation.JacksonInject;
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import io.ipfs.api.MerkleNode;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.PlanStringBuilder;
+import org.apache.drill.common.exceptions.ExecutionSetupException;
+import org.apache.drill.common.expression.SchemaPath;
+import org.apache.drill.common.util.DrillVersionInfo;
+import org.apache.drill.exec.coord.ClusterCoordinator;
+import org.apache.drill.exec.physical.EndpointAffinity;
+import org.apache.drill.exec.physical.base.AbstractGroupScan;
+import org.apache.drill.exec.physical.base.PhysicalOperator;
+import org.apache.drill.exec.physical.base.ScanStats;
+import org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint;
+import org.apache.drill.exec.store.StoragePluginRegistry;
+import org.apache.drill.exec.store.schedule.AffinityCreator;
+import org.apache.drill.exec.store.schedule.AssignmentCreator;
+import org.apache.drill.exec.store.schedule.CompleteWork;
+import org.apache.drill.exec.store.schedule.EndpointByteMap;
+import org.apache.drill.exec.store.schedule.EndpointByteMapImpl;
+import org.apache.drill.shaded.guava.com.google.common.base.Preconditions;
+import org.apache.drill.shaded.guava.com.google.common.base.Stopwatch;
+import org.apache.drill.shaded.guava.com.google.common.cache.LoadingCache;
+import org.apache.drill.shaded.guava.com.google.common.collect.ArrayListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
+import org.apache.drill.shaded.guava.com.google.common.collect.ListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.Lists;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Random;
+import java.util.concurrent.ForkJoinPool;
+import java.util.concurrent.RecursiveTask;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+
+
+@JsonTypeName("ipfs-scan")
+public class IPFSGroupScan extends AbstractGroupScan {
+  private static final Logger logger = LoggerFactory.getLogger(IPFSGroupScan.class);
+  private IPFSContext ipfsContext;
+  private IPFSScanSpec ipfsScanSpec;
+  private IPFSStoragePluginConfig config;
+  private List<SchemaPath> columns;
+
+  private static long DEFAULT_NODE_SIZE = 1000l;
+
+  private ListMultimap<Integer, IPFSWork> assignments;
+  private List<IPFSWork> ipfsWorkList = Lists.newArrayList();
+  private Map<String, List<IPFSWork>> endpointWorksMap;
+  private List<EndpointAffinity> affinities;
+
+  @JsonCreator
+  public IPFSGroupScan(@JsonProperty("IPFSScanSpec") IPFSScanSpec ipfsScanSpec,
+                       @JsonProperty("IPFSStoragePluginConfig") IPFSStoragePluginConfig ipfsStoragePluginConfig,
+                       @JsonProperty("columns") List<SchemaPath> columns,
+                       @JacksonInject StoragePluginRegistry pluginRegistry) throws IOException, ExecutionSetupException {
+    this(
+        ((IPFSStoragePlugin) pluginRegistry.getPlugin(ipfsStoragePluginConfig)).getIPFSContext(),
+        ipfsScanSpec,
+        columns
+    );
+  }
+
+  public IPFSGroupScan(IPFSContext ipfsContext,
+                       IPFSScanSpec ipfsScanSpec,
+                       List<SchemaPath> columns) {
+    super((String) null);
+    this.ipfsContext = ipfsContext;
+    this.ipfsScanSpec = ipfsScanSpec;
+    this.config = ipfsContext.getStoragePluginConfig();
+    logger.debug("GroupScan constructor called with columns {}", columns);
+    this.columns = columns == null || columns.size() == 0? ALL_COLUMNS : columns;
+    init();
+  }
+
+  private void init() {
+    IPFSHelper ipfsHelper = ipfsContext.getIPFSHelper();
+    endpointWorksMap = new HashMap<>();
+
+    Multihash topHash = ipfsScanSpec.getTargetHash(ipfsHelper);
+    try {
+      Map<Multihash, String> leafAddrMap = getLeafAddrMappings(topHash);
+      logger.debug("Iterating on {} leaves...", leafAddrMap.size());
+      ClusterCoordinator coordinator = ipfsContext.getStoragePlugin().getContext().getClusterCoordinator();
+      for (Multihash leaf : leafAddrMap.keySet()) {
+        String peerHostname = leafAddrMap.get(leaf);
+
+        Optional<DrillbitEndpoint> oep = coordinator.getAvailableEndpoints()
+            .stream()
+            .filter(a -> a.getAddress().equals(peerHostname))
+            .findAny();
+        DrillbitEndpoint ep;
+        if (oep.isPresent()) {
+          ep = oep.get();
+          logger.debug("Using existing endpoint {}", ep.getAddress());
+        } else {
+          logger.debug("created new endpoint on the fly {}", peerHostname);
+          //DRILL-7754: read ports & version info from IPFS instead of hard-coded
+          ep = DrillbitEndpoint.newBuilder()
+              .setAddress(peerHostname)

Review comment:
       I would set these as constants at the top of the class.  Then once DRILL-7754 is committed, it will be easier to fix this.
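
A sketch of the suggestion, lifting the literals from the builder above into named constants so DRILL-7754 has a single place to replace:

```java
// Hard-coded Drillbit defaults until DRILL-7754 supplies real values from IPFS.
private static final int DEFAULT_USER_PORT = 31010;
private static final int DEFAULT_CONTROL_PORT = 31011;
private static final int DEFAULT_DATA_PORT = 31012;
private static final int DEFAULT_HTTP_PORT = 8047;

// ... then in init():
ep = DrillbitEndpoint.newBuilder()
    .setAddress(peerHostname)
    .setUserPort(DEFAULT_USER_PORT)
    .setControlPort(DEFAULT_CONTROL_PORT)
    .setDataPort(DEFAULT_DATA_PORT)
    .setHttpPort(DEFAULT_HTTP_PORT)
    .setVersion(DrillVersionInfo.getVersion())
    .setState(DrillbitEndpoint.State.ONLINE)
    .build();
```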

##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSCompat.java
##########
@@ -0,0 +1,284 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import io.ipfs.api.IPFS;
+import io.ipfs.api.JSONParser;
+import io.ipfs.multihash.Multihash;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.HttpURLConnection;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.concurrent.CompletableFuture;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+import java.util.concurrent.atomic.AtomicReference;
+import java.util.function.Consumer;
+import java.util.function.Predicate;
+
+/**
+ * Compatibility fixes for java-ipfs-http-client library
+ *
+ * Supports IPFS up to version v0.4.23, due to new restrictions enforcing all API calls to be made with POST method.
+ * Upstream issue tracker: https://github.com/ipfs-shipyard/java-ipfs-http-client/issues/157
+ */
+public class IPFSCompat {
+  public final String host;
+  public final int port;
+  private final String version;
+  public final String protocol;
+  public final int readTimeout;
+  public static final int DEFAULT_READ_TIMEOUT = 0;
+
+  public final DHT dht = new DHT();
+  public final Name name = new Name();
+
+  public IPFSCompat(IPFS ipfs) {
+    this(ipfs.host, ipfs.port);
+  }
+
+  public IPFSCompat(String host, int port) {
+    this(host, port, "/api/v0/", false, DEFAULT_READ_TIMEOUT);
+  }
+
+  public IPFSCompat(String host, int port, String version, boolean ssl, int readTimeout) {
+    this.host = host;
+    this.port = port;
+
+    if(ssl) {
+      this.protocol = "https";
+    } else {
+      this.protocol = "http";
+    }
+
+    this.version = version;
+    this.readTimeout = readTimeout;
+  }
+
+  /**
+   * Resolve names to IPFS CIDs.
+   * See <a href="https://docs.ipfs.io/reference/http/api/#api-v0-resolve">resolve in IPFS doc</a>.
+   * @param scheme the scheme of the name to resolve, usually IPFS or IPNS
+   * @param path the path to the object
+   * @param recursive whether recursively resolve names until it is a IPFS CID
+   * @return a Map of JSON object, with the result as the value of key "Path"
+   */
+  public Map resolve(String scheme, String path, boolean recursive) {
+    AtomicReference<Map> ret = new AtomicReference<>();
+    getObjectStream(
+        "resolve?arg=/" + scheme+"/"+path +"&r="+recursive,
+        res -> {
+          ret.set((Map) res);
+          return true;
+        },
+        err -> {
+          throw new RuntimeException(err);
+        }
+    );
+    return ret.get();
+  }
+
+  public class DHT {
+    /**
+     * Find internet addresses of a given peer.
+     * See <a href="https://docs.ipfs.io/reference/http/api/#api-v0-dht-findpeer">dht/findpeer in IPFS doc</a>.
+     * @param id the id of the peer to query
+     * @param timeout timeout value in seconds
+     * @param executor executor
+     * @return List of Multiaddresses of the peer
+     */
+    public List<String> findpeerListTimeout(Multihash id, int timeout, ExecutorService executor) {
+      AtomicReference<List<String>> ret = new AtomicReference<>();
+      timeLimitedExec(
+          "name/resolve?arg=" + id,
+          timeout,
+          res -> {
+            Map peer = (Map) res;

Review comment:
       Can we specify the type parameters?  i.e.
   ```
   Map<String, Object>
   ```
   or whatever the case may be?  My IDE isn't liking this.
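
A sketch of one way to do that, assuming the parsed JSON is a string-keyed object (the IPFS HTTP API returns JSON, so `Map<String, Object>` is the natural shape; exact value types depend on the response):

```java
// The JSON parser returns raw types; narrow to a string-keyed map explicitly.
@SuppressWarnings("unchecked")
Map<String, Object> peer = (Map<String, Object>) res;
```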

##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSGroupScan.java
##########
@@ -0,0 +1,462 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+
+import com.fasterxml.jackson.annotation.JacksonInject;
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import io.ipfs.api.MerkleNode;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.PlanStringBuilder;
+import org.apache.drill.common.exceptions.ExecutionSetupException;
+import org.apache.drill.common.expression.SchemaPath;
+import org.apache.drill.common.util.DrillVersionInfo;
+import org.apache.drill.exec.coord.ClusterCoordinator;
+import org.apache.drill.exec.physical.EndpointAffinity;
+import org.apache.drill.exec.physical.base.AbstractGroupScan;
+import org.apache.drill.exec.physical.base.PhysicalOperator;
+import org.apache.drill.exec.physical.base.ScanStats;
+import org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint;
+import org.apache.drill.exec.store.StoragePluginRegistry;
+import org.apache.drill.exec.store.schedule.AffinityCreator;
+import org.apache.drill.exec.store.schedule.AssignmentCreator;
+import org.apache.drill.exec.store.schedule.CompleteWork;
+import org.apache.drill.exec.store.schedule.EndpointByteMap;
+import org.apache.drill.exec.store.schedule.EndpointByteMapImpl;
+import org.apache.drill.shaded.guava.com.google.common.base.Preconditions;
+import org.apache.drill.shaded.guava.com.google.common.base.Stopwatch;
+import org.apache.drill.shaded.guava.com.google.common.cache.LoadingCache;
+import org.apache.drill.shaded.guava.com.google.common.collect.ArrayListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
+import org.apache.drill.shaded.guava.com.google.common.collect.ListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.Lists;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Random;
+import java.util.concurrent.ForkJoinPool;
+import java.util.concurrent.RecursiveTask;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+
+
+@JsonTypeName("ipfs-scan")
+public class IPFSGroupScan extends AbstractGroupScan {
+  private static final Logger logger = LoggerFactory.getLogger(IPFSGroupScan.class);
+  private IPFSContext ipfsContext;

Review comment:
       These three variables can be `final`. 
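
A sketch of the change; note that `columns` is reassigned in `clone()`, so only the first three can be `final` without further restructuring:

```java
private final IPFSContext ipfsContext;
private final IPFSScanSpec ipfsScanSpec;
private final IPFSStoragePluginConfig config;
private List<SchemaPath> columns;  // mutated by clone(), so left non-final
```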

##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSGroupScan.java
##########
@@ -0,0 +1,462 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+
+import com.fasterxml.jackson.annotation.JacksonInject;
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import io.ipfs.api.MerkleNode;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.PlanStringBuilder;
+import org.apache.drill.common.exceptions.ExecutionSetupException;
+import org.apache.drill.common.expression.SchemaPath;
+import org.apache.drill.common.util.DrillVersionInfo;
+import org.apache.drill.exec.coord.ClusterCoordinator;
+import org.apache.drill.exec.physical.EndpointAffinity;
+import org.apache.drill.exec.physical.base.AbstractGroupScan;
+import org.apache.drill.exec.physical.base.PhysicalOperator;
+import org.apache.drill.exec.physical.base.ScanStats;
+import org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint;
+import org.apache.drill.exec.store.StoragePluginRegistry;
+import org.apache.drill.exec.store.schedule.AffinityCreator;
+import org.apache.drill.exec.store.schedule.AssignmentCreator;
+import org.apache.drill.exec.store.schedule.CompleteWork;
+import org.apache.drill.exec.store.schedule.EndpointByteMap;
+import org.apache.drill.exec.store.schedule.EndpointByteMapImpl;
+import org.apache.drill.shaded.guava.com.google.common.base.Preconditions;
+import org.apache.drill.shaded.guava.com.google.common.base.Stopwatch;
+import org.apache.drill.shaded.guava.com.google.common.cache.LoadingCache;
+import org.apache.drill.shaded.guava.com.google.common.collect.ArrayListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
+import org.apache.drill.shaded.guava.com.google.common.collect.ListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.Lists;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Random;
+import java.util.concurrent.ForkJoinPool;
+import java.util.concurrent.RecursiveTask;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+
+
+@JsonTypeName("ipfs-scan")
+public class IPFSGroupScan extends AbstractGroupScan {
+  private static final Logger logger = LoggerFactory.getLogger(IPFSGroupScan.class);
+  private IPFSContext ipfsContext;
+  private IPFSScanSpec ipfsScanSpec;
+  private IPFSStoragePluginConfig config;
+  private List<SchemaPath> columns;
+
+  private static long DEFAULT_NODE_SIZE = 1000l;
+
+  private ListMultimap<Integer, IPFSWork> assignments;
+  private List<IPFSWork> ipfsWorkList = Lists.newArrayList();
+  private Map<String, List<IPFSWork>> endpointWorksMap;
+  private List<EndpointAffinity> affinities;
+
+  @JsonCreator
+  public IPFSGroupScan(@JsonProperty("IPFSScanSpec") IPFSScanSpec ipfsScanSpec,
+                       @JsonProperty("IPFSStoragePluginConfig") IPFSStoragePluginConfig ipfsStoragePluginConfig,
+                       @JsonProperty("columns") List<SchemaPath> columns,
+                       @JacksonInject StoragePluginRegistry pluginRegistry) throws IOException, ExecutionSetupException {
+    this(
+        ((IPFSStoragePlugin) pluginRegistry.getPlugin(ipfsStoragePluginConfig)).getIPFSContext(),

Review comment:
       `getPlugin()` is deprecated.  Use `resolve()` instead.
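
A sketch of the suggested replacement, assuming the `resolve(StoragePluginConfig, Class)` overload; it also drops the explicit cast:

```java
this(
    pluginRegistry.resolve(ipfsStoragePluginConfig, IPFSStoragePlugin.class).getIPFSContext(),
    ipfsScanSpec,
    columns
);
```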

##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSHelper.java
##########
@@ -0,0 +1,325 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import io.ipfs.api.IPFS;
+import io.ipfs.api.MerkleNode;
+import io.ipfs.multiaddr.MultiAddress;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.exceptions.UserException;
+import org.apache.drill.exec.store.ipfs.IPFSStoragePluginConfig.IPFSTimeOut;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
+import org.bouncycastle.util.Strings;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.net.InetAddress;
+import java.net.UnknownHostException;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.concurrent.Callable;
+import java.util.concurrent.CancellationException;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+import java.util.stream.Collectors;
+
+import static org.apache.drill.exec.store.ipfs.IPFSStoragePluginConfig.IPFSTimeOut.FIND_PEER_INFO;
+
+/**
+ * Helper class with some utilities that are specific to Drill with an IPFS storage
+ */
+public class IPFSHelper {
+  private static final Logger logger = LoggerFactory.getLogger(IPFSHelper.class);
+
+  public static final String IPFS_NULL_OBJECT_HASH = "QmdfTbBqBPQ7VNxZEYEj14VmRuZBkqFbiwReogJgS1zR1n";
+  public static final Multihash IPFS_NULL_OBJECT = Multihash.fromBase58(IPFS_NULL_OBJECT_HASH);
+
+  private ExecutorService executorService;
+  private final IPFS client;
+  private final IPFSCompat clientCompat;
+  private IPFSPeer myself;
+  private int maxPeersPerLeaf;
+  private Map<IPFSTimeOut, Integer> timeouts;
+
+  public IPFSHelper(IPFS ipfs) {
+    this.client = ipfs;
+    this.clientCompat = new IPFSCompat(ipfs);
+  }
+
+  public IPFSHelper(IPFS ipfs, ExecutorService executorService) {
+    this(ipfs);
+    this.executorService = executorService;
+  }
+
+  public void setTimeouts(Map<IPFSTimeOut, Integer> timeouts) {
+    this.timeouts = timeouts;
+  }
+
+  public void setMyself(IPFSPeer myself) {
+    this.myself = myself;
+  }
+
+  /**
+   * Set maximum number of providers per leaf node. The more providers, the more time it takes to do DHT queries, while
+   * it is more likely we can find an optimal peer.
+   * @param maxPeersPerLeaf max number of providers to search per leaf node
+   */
+  public void setMaxPeersPerLeaf(int maxPeersPerLeaf) {
+    this.maxPeersPerLeaf = maxPeersPerLeaf;
+  }
+
+  public IPFS getClient() {
+    return client;
+  }
+
+  public IPFSCompat getClientCompat() {
+    return clientCompat;
+  }
+
+  public List<Multihash> findprovsTimeout(Multihash id) {
+    List<String> providers;
+    providers = clientCompat.dht.findprovsListTimeout(id, maxPeersPerLeaf, timeouts.get(IPFSTimeOut.FIND_PROV), executorService);
+
+    List<Multihash> ret = providers.stream().map(str -> Multihash.fromBase58(str)).collect(Collectors.toList());
+    return ret;
+  }
+
+  public List<MultiAddress> findpeerTimeout(Multihash peerId) {
+    // trying to resolve addresses of a node itself will always hang
+    // so we treat it specially
+    if(peerId.equals(myself.getId())) {
+      return myself.getMultiAddresses();
+    }
+
+    List<String> addrs;
+    addrs = clientCompat.dht.findpeerListTimeout(peerId, timeouts.get(IPFSTimeOut.FIND_PEER_INFO), executorService);
+    List<MultiAddress>
+        ret = addrs
+        .stream()
+        .filter(addr -> !addr.equals(""))
+        .map(str -> new MultiAddress(str)).collect(Collectors.toList());
+    return ret;
+  }
+
+  public byte[] getObjectDataTimeout(Multihash object) throws IOException {
+    return timedFailure(client.object::data, object, timeouts.get(IPFSTimeOut.FETCH_DATA));
+  }
+
+  public MerkleNode getObjectLinksTimeout(Multihash object) throws IOException {
+    return timedFailure(client.object::links, object, timeouts.get(IPFSTimeOut.FETCH_DATA));
+  }
+
+  public IPFSPeer getMyself() throws IOException {
+    if (this.myself != null) {
+      return this.myself;
+    }
+
+    Map res = timedFailure(client::id, timeouts.get(FIND_PEER_INFO));
+    Multihash myID = Multihash.fromBase58((String) res.get("ID"));
+    // Rule out any non-local addresses as they might be NAT-ed external
+    // addresses that are not always reachable from the inside.
+    // But is it safe to assume IPFS always listens on loopback and local addresses?
+    List<MultiAddress> myAddrs = ((List<String>) res.get("Addresses"))
+        .stream()
+        .map(addr -> new MultiAddress(addr))
+        .filter(addr -> {
+          try {
+            InetAddress inetAddress = InetAddress.getByName(addr.getHost());
+            return inetAddress.isSiteLocalAddress()
+                || inetAddress.isLinkLocalAddress()
+                || inetAddress.isLoopbackAddress();
+          } catch (UnknownHostException e) {
+            return false;
+          }
+        })
+        .collect(Collectors.toList());
+    this.myself = new IPFSPeer(this, myID, myAddrs);
+
+    return this.myself;
+  }
+
+  public Multihash resolve(String prefix, String path, boolean recursive) {
+    Map<String, String> result = timedFailure(
+        (args) -> clientCompat.resolve((String) args.get(0), (String) args.get(1), (boolean) args.get(2)),
+        ImmutableList.<Object>of(prefix, path, recursive),
+        timeouts.get(IPFSTimeOut.FIND_PEER_INFO)
+    );
+    if (!result.containsKey("Path")) {
+      return null;
+    }
+
+    // the path returned is of form /ipfs/Qma...
+    String hashString = result.get("Path").split("/")[2];
+    return Multihash.fromBase58(hashString);
+  }
+
+  @FunctionalInterface
+  public interface ThrowingFunction<T, R, E extends Exception>{
+    R apply(final T in) throws E;
+  }
+
+  @FunctionalInterface
+  public interface ThrowingSupplier<R, E extends Exception> {
+    R get() throws E;
+  }
+
+  /**
+   * Execute a time-critical operation op within time timeout. Causes the query to fail completely
+   * if the operation times out.
+   * @param op a Function that represents the operation to perform
+   * @param in the parameter for op
+   * @param timeout consider the execution has timed out after this amount of time in seconds
+   * @param <T>
+   * @param <R>
+   * @param <E>
+   * @return R the result of the operation
+   * @throws E
+   */
+  public <T, R, E extends Exception> R timedFailure(ThrowingFunction<T, R, E> op, T in, int timeout) throws E {
+    Callable<R> task = () -> op.apply(in);
+    return timedFailure(task, timeout, TimeUnit.SECONDS);
+  }
+
+  public <R, E extends Exception> R timedFailure(ThrowingSupplier<R, E> op, int timeout) throws E {
+    Callable<R> task = op::get;
+    return timedFailure(task, timeout, TimeUnit.SECONDS);
+  }
+
+  private <R, E extends Exception> R timedFailure(Callable<R> task, int timeout, TimeUnit timeUnit) throws E {
+    Future<R> res = executorService.submit(task);
+    try {
+      return res.get(timeout, timeUnit);
+    } catch (ExecutionException e) {
+      throw (E) e.getCause();
+    } catch (TimeoutException e) {
+      throw UserException.executionError(e).message("IPFS operation timed out").build(logger);
+    } catch (CancellationException | InterruptedException e) {
+      throw UserException.executionError(e).build(logger);
+    }
+  }
+
+  /*
+   * DRILL-7753: implement a more advanced algorithm that picks optimal addresses. Maybe check reachability, latency
+   * and bandwidth?
+   */
+  /**
+   * Choose a peer's network address from its advertised Multiaddresses.
+   * Prefer globally routable address over local addresses.
+   * @param peerAddrs Multiaddresses obtained from IPFS.DHT.findprovs
+   * @return network address
+   */
+  public static Optional<String> pickPeerHost(List<MultiAddress> peerAddrs) {
+    String localAddr = null;
+    for (MultiAddress addr : peerAddrs) {
+      String host = addr.getHost();
+      try {
+        InetAddress inetAddress = InetAddress.getByName(host);
+        if (inetAddress.isSiteLocalAddress() || inetAddress.isLinkLocalAddress()) {
+          localAddr = host;
+        } else {
+          return Optional.of(host);
+        }
+      } catch (UnknownHostException e) {
+        continue;

Review comment:
       `continue` is not necessary here.  Should we log this?
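
A sketch of the suggested cleanup: since the `catch` is the last statement in the loop body, the `continue` can simply be dropped, and a debug log preserves visibility into skipped addresses:

```java
} catch (UnknownHostException e) {
  // The loop moves on to the next address anyway; just record the failure.
  logger.debug("Cannot resolve host {} from peer multiaddress {}", host, addr, e);
}
```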

##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSGroupScan.java
##########
@@ -0,0 +1,462 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+
+import com.fasterxml.jackson.annotation.JacksonInject;
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import io.ipfs.api.MerkleNode;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.PlanStringBuilder;
+import org.apache.drill.common.exceptions.ExecutionSetupException;
+import org.apache.drill.common.expression.SchemaPath;
+import org.apache.drill.common.util.DrillVersionInfo;
+import org.apache.drill.exec.coord.ClusterCoordinator;
+import org.apache.drill.exec.physical.EndpointAffinity;
+import org.apache.drill.exec.physical.base.AbstractGroupScan;
+import org.apache.drill.exec.physical.base.PhysicalOperator;
+import org.apache.drill.exec.physical.base.ScanStats;
+import org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint;
+import org.apache.drill.exec.store.StoragePluginRegistry;
+import org.apache.drill.exec.store.schedule.AffinityCreator;
+import org.apache.drill.exec.store.schedule.AssignmentCreator;
+import org.apache.drill.exec.store.schedule.CompleteWork;
+import org.apache.drill.exec.store.schedule.EndpointByteMap;
+import org.apache.drill.exec.store.schedule.EndpointByteMapImpl;
+import org.apache.drill.shaded.guava.com.google.common.base.Preconditions;
+import org.apache.drill.shaded.guava.com.google.common.base.Stopwatch;
+import org.apache.drill.shaded.guava.com.google.common.cache.LoadingCache;
+import org.apache.drill.shaded.guava.com.google.common.collect.ArrayListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
+import org.apache.drill.shaded.guava.com.google.common.collect.ListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.Lists;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Random;
+import java.util.concurrent.ForkJoinPool;
+import java.util.concurrent.RecursiveTask;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+
+
+@JsonTypeName("ipfs-scan")
+public class IPFSGroupScan extends AbstractGroupScan {
+  private static final Logger logger = LoggerFactory.getLogger(IPFSGroupScan.class);
+  private IPFSContext ipfsContext;
+  private IPFSScanSpec ipfsScanSpec;
+  private IPFSStoragePluginConfig config;
+  private List<SchemaPath> columns;
+
+  private static long DEFAULT_NODE_SIZE = 1000l;
+
+  private ListMultimap<Integer, IPFSWork> assignments;
+  private List<IPFSWork> ipfsWorkList = Lists.newArrayList();
+  private Map<String, List<IPFSWork>> endpointWorksMap;
+  private List<EndpointAffinity> affinities;
+
+  @JsonCreator
+  public IPFSGroupScan(@JsonProperty("IPFSScanSpec") IPFSScanSpec ipfsScanSpec,
+                       @JsonProperty("IPFSStoragePluginConfig") IPFSStoragePluginConfig ipfsStoragePluginConfig,
+                       @JsonProperty("columns") List<SchemaPath> columns,
+                       @JacksonInject StoragePluginRegistry pluginRegistry) throws IOException, ExecutionSetupException {
+    this(
+        ((IPFSStoragePlugin) pluginRegistry.getPlugin(ipfsStoragePluginConfig)).getIPFSContext(),
+        ipfsScanSpec,
+        columns
+    );
+  }
+
+  public IPFSGroupScan(IPFSContext ipfsContext,
+                       IPFSScanSpec ipfsScanSpec,
+                       List<SchemaPath> columns) {
+    super((String) null);
+    this.ipfsContext = ipfsContext;
+    this.ipfsScanSpec = ipfsScanSpec;
+    this.config = ipfsContext.getStoragePluginConfig();
+    logger.debug("GroupScan constructor called with columns {}", columns);
+    this.columns = columns == null || columns.size() == 0? ALL_COLUMNS : columns;
+    init();
+  }
+
+  private void init() {
+    IPFSHelper ipfsHelper = ipfsContext.getIPFSHelper();
+    endpointWorksMap = new HashMap<>();
+
+    Multihash topHash = ipfsScanSpec.getTargetHash(ipfsHelper);
+    try {
+      Map<Multihash, String> leafAddrMap = getLeafAddrMappings(topHash);
+      logger.debug("Iterating on {} leaves...", leafAddrMap.size());
+      ClusterCoordinator coordinator = ipfsContext.getStoragePlugin().getContext().getClusterCoordinator();
+      for (Multihash leaf : leafAddrMap.keySet()) {
+        String peerHostname = leafAddrMap.get(leaf);
+
+        Optional<DrillbitEndpoint> oep = coordinator.getAvailableEndpoints()
+            .stream()
+            .filter(a -> a.getAddress().equals(peerHostname))
+            .findAny();
+        DrillbitEndpoint ep;
+        if (oep.isPresent()) {
+          ep = oep.get();
+          logger.debug("Using existing endpoint {}", ep.getAddress());
+        } else {
+          logger.debug("created new endpoint on the fly {}", peerHostname);
+          //DRILL-7754: read ports & version info from IPFS instead of hard-coded
+          ep = DrillbitEndpoint.newBuilder()
+              .setAddress(peerHostname)
+              .setUserPort(31010)
+              .setControlPort(31011)
+              .setDataPort(31012)
+              .setHttpPort(8047)
+              .setVersion(DrillVersionInfo.getVersion())
+              .setState(DrillbitEndpoint.State.ONLINE)
+              .build();
+          //TODO how to safely remove endpoints that are no longer needed once the query is completed?
+          ClusterCoordinator.RegistrationHandle handle = coordinator.register(ep);
+        }
+
+        IPFSWork work = new IPFSWork(leaf.toBase58());
+        logger.debug("added endpoint {} to work {}", ep.getAddress(), work);
+        work.getByteMap().add(ep, DEFAULT_NODE_SIZE);
+        work.setOnEndpoint(ep);
+
+        if(endpointWorksMap.containsKey(ep.getAddress())) {
+          endpointWorksMap.get(ep.getAddress()).add(work);
+        } else {
+          List<IPFSWork> ipfsWorks = Lists.newArrayList();
+          ipfsWorks.add(work);
+          endpointWorksMap.put(ep.getAddress(), ipfsWorks);
+        }
+        ipfsWorkList.add(work);
+      }
+    }catch (Exception e) {
+      logger.debug("exception in init");
+      throw new RuntimeException(e);
+    }
+  }
+
+  Map<Multihash, String> getLeafAddrMappings(Multihash topHash) {
+    logger.debug("start to recursively expand nested IPFS hashes, topHash={}", topHash);
+    Stopwatch watch = Stopwatch.createStarted();
+    ForkJoinPool forkJoinPool = new ForkJoinPool(config.getNumWorkerThreads());
+    IPFSTreeFlattener topTask = new IPFSTreeFlattener(topHash, false, ipfsContext);
+    Map<Multihash, String> leafAddrMap = forkJoinPool.invoke(topTask);
+    logger.debug("Took {} ms to expand hash leaves", watch.elapsed(TimeUnit.MILLISECONDS));
+
+    return leafAddrMap;
+  }
+
+  private IPFSGroupScan(IPFSGroupScan that) {
+    super(that);
+    this.ipfsContext = that.ipfsContext;
+    this.ipfsScanSpec = that.ipfsScanSpec;
+    this.config = that.config;
+    this.assignments = that.assignments;
+    this.ipfsWorkList = that.ipfsWorkList;
+    this.endpointWorksMap = that.endpointWorksMap;
+    this.columns = that.columns;
+  }
+
+  @JsonProperty
+  public List<SchemaPath> getColumns() {
+    return columns;
+  }
+
+  @JsonIgnore
+  public IPFSStoragePlugin getStoragePlugin() {
+    return ipfsContext.getStoragePlugin();
+  }
+
+  @JsonProperty
+  public IPFSScanSpec getIPFSScanSpec() {
+    return ipfsScanSpec;
+  }
+
+  @Override
+  public List<EndpointAffinity> getOperatorAffinity() {
+    if (affinities == null) {
+      affinities = AffinityCreator.getAffinityMap(ipfsWorkList);
+    }
+    return affinities;
+  }
+
+  @Override
+  public int getMaxParallelizationWidth() {
+    DrillbitEndpoint myself = ipfsContext.getStoragePlugin().getContext().getEndpoint();
+    int width;
+    if (endpointWorksMap.containsKey(myself.getAddress())) {
+      // the foreman is also going to be a minor fragment worker under a UnionExchange operator
+      width = ipfsWorkList.size();
+    } else {
+      // the foreman does not hold data, so we have to force parallelization
+      // to make sure there is a UnionExchange operator
+      width = ipfsWorkList.size() + 1;
+    }
+    logger.debug("getMaxParallelizationWidth: {}", width);
+    return width;
+  }
+
+  @Override
+  public void applyAssignments(List<DrillbitEndpoint> incomingEndpoints) {
+    logger.debug("ipfsWorkList.size() = {}", ipfsWorkList.size());
+    logger.debug("endpointWorksMap: {}", endpointWorksMap);
+    if (endpointWorksMap.size() > 1) {
+      logger.debug("Use manual assignment");
+      assignments = ArrayListMultimap.create();
+      for (int fragmentId = 0; fragmentId < incomingEndpoints.size(); fragmentId++) {
+        String address = incomingEndpoints.get(fragmentId).getAddress();
+        if (endpointWorksMap.containsKey(address)) {
+          for (IPFSWork work : endpointWorksMap.get(address)) {
+            assignments.put(fragmentId, work);
+          }
+        }
+      }
+    } else {
+      logger.debug("Use AssignmentCreator");
+      assignments = AssignmentCreator.getMappings(incomingEndpoints, ipfsWorkList);
+    }
+  }
+
+  @Override
+  public IPFSSubScan getSpecificScan(int minorFragmentId) {
+    logger.debug(String.format("getSpecificScan: minorFragmentId = %d", minorFragmentId));
+    List<IPFSWork> workList = assignments.get(minorFragmentId);
+    logger.debug("workList == null: " + (workList == null? "true": "false"));
+    logger.debug(String.format("workList.size(): %d", workList.size()));
+
+    List<Multihash> scanSpecList = Lists.newArrayList();
+
+    for (IPFSWork work : workList) {
+      scanSpecList.add(work.getPartialRootHash());
+    }
+
+    return new IPFSSubScan(ipfsContext, scanSpecList, ipfsScanSpec.getFormatExtension(), columns);
+  }
+
+  @Override
+  public ScanStats getScanStats() {
+    long recordCount = 100000 * endpointWorksMap.size();
+    return new ScanStats(ScanStats.GroupScanProperty.NO_EXACT_ROW_COUNT, recordCount, 1, recordCount);
+  }
+
+  @Override
+  public IPFSGroupScan clone(List<SchemaPath> columns){
+    logger.debug("IPFSGroupScan clone {}", columns);
+    IPFSGroupScan cloned = new IPFSGroupScan(this);
+    cloned.columns = columns;
+    return cloned;
+  }
+
+  @Override
+  @JsonIgnore
+  public boolean canPushdownProjects(List<SchemaPath> columns) {
+    return true;
+  }
+
+  @Override
+  @JsonIgnore
+  public PhysicalOperator getNewWithChildren(List<PhysicalOperator> children) {
+    Preconditions.checkArgument(children.isEmpty());
+    logger.debug("getNewWithChildren called");
+    return new IPFSGroupScan(this);
+  }
+
+  @Override
+  public String getDigest() {
+    return toString();
+  }
+
+  @Override
+  public String toString() {
+    return new PlanStringBuilder(this)
+        .field("scan spec", ipfsScanSpec)
+        .field("columns", columns)
+        .toString();
+  }
+
+  private class IPFSWork implements CompleteWork {
+    private EndpointByteMapImpl byteMap = new EndpointByteMapImpl();
+    private Multihash partialRoot;
+    private DrillbitEndpoint onEndpoint = null;
+
+
+    public IPFSWork(String root) {
+      this.partialRoot = Multihash.fromBase58(root);
+    }
+
+    public IPFSWork(Multihash root) {
+      this.partialRoot = root;
+    }
+
+    public Multihash getPartialRootHash() {return partialRoot;}
+
+    public void setOnEndpoint(DrillbitEndpoint endpointAddress) {
+      this.onEndpoint = endpointAddress;
+    }
+
+    @Override
+    public long getTotalBytes() {
+      return DEFAULT_NODE_SIZE;
+    }
+
+    @Override
+    public EndpointByteMap getByteMap() {
+      return byteMap;
+    }
+
+    @Override
+    public int compareTo(CompleteWork o) {

Review comment:
       Is this correct? Returning 0 unconditionally makes every pair of work units compare as equal.
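
       A minimal sketch of one possible ordering, for illustration only (comparing by estimated size is an assumption, not necessarily the author's intent):

       ```java
       @Override
       public int compareTo(CompleteWork o) {
         // Hypothetical ordering: compare work units by their estimated
         // byte size so that code sorting work units gets a stable order.
         return Long.compare(getTotalBytes(), o.getTotalBytes());
       }
       ```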

##########
File path: 
contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSGroupScan.java
##########
@@ -0,0 +1,462 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+
+import com.fasterxml.jackson.annotation.JacksonInject;
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import io.ipfs.api.MerkleNode;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.PlanStringBuilder;
+import org.apache.drill.common.exceptions.ExecutionSetupException;
+import org.apache.drill.common.expression.SchemaPath;
+import org.apache.drill.common.util.DrillVersionInfo;
+import org.apache.drill.exec.coord.ClusterCoordinator;
+import org.apache.drill.exec.physical.EndpointAffinity;
+import org.apache.drill.exec.physical.base.AbstractGroupScan;
+import org.apache.drill.exec.physical.base.PhysicalOperator;
+import org.apache.drill.exec.physical.base.ScanStats;
+import org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint;
+import org.apache.drill.exec.store.StoragePluginRegistry;
+import org.apache.drill.exec.store.schedule.AffinityCreator;
+import org.apache.drill.exec.store.schedule.AssignmentCreator;
+import org.apache.drill.exec.store.schedule.CompleteWork;
+import org.apache.drill.exec.store.schedule.EndpointByteMap;
+import org.apache.drill.exec.store.schedule.EndpointByteMapImpl;
+import org.apache.drill.shaded.guava.com.google.common.base.Preconditions;
+import org.apache.drill.shaded.guava.com.google.common.base.Stopwatch;
+import org.apache.drill.shaded.guava.com.google.common.cache.LoadingCache;
+import org.apache.drill.shaded.guava.com.google.common.collect.ArrayListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
+import org.apache.drill.shaded.guava.com.google.common.collect.ListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.Lists;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Random;
+import java.util.concurrent.ForkJoinPool;
+import java.util.concurrent.RecursiveTask;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+
+
+@JsonTypeName("ipfs-scan")
+public class IPFSGroupScan extends AbstractGroupScan {
+  private static final Logger logger = LoggerFactory.getLogger(IPFSGroupScan.class);
+  private IPFSContext ipfsContext;
+  private IPFSScanSpec ipfsScanSpec;
+  private IPFSStoragePluginConfig config;
+  private List<SchemaPath> columns;
+
+  private static long DEFAULT_NODE_SIZE = 1000l;
+
+  private ListMultimap<Integer, IPFSWork> assignments;
+  private List<IPFSWork> ipfsWorkList = Lists.newArrayList();
+  private Map<String, List<IPFSWork>> endpointWorksMap;
+  private List<EndpointAffinity> affinities;
+
+  @JsonCreator
+  public IPFSGroupScan(@JsonProperty("IPFSScanSpec") IPFSScanSpec ipfsScanSpec,
+                       @JsonProperty("IPFSStoragePluginConfig") IPFSStoragePluginConfig ipfsStoragePluginConfig,
+                       @JsonProperty("columns") List<SchemaPath> columns,
+                       @JacksonInject StoragePluginRegistry pluginRegistry) throws IOException, ExecutionSetupException {
+    this(
+        ((IPFSStoragePlugin) pluginRegistry.getPlugin(ipfsStoragePluginConfig)).getIPFSContext(),
+        ipfsScanSpec,
+        columns
+    );
+  }
+
+  public IPFSGroupScan(IPFSContext ipfsContext,
+                       IPFSScanSpec ipfsScanSpec,
+                       List<SchemaPath> columns) {
+    super((String) null);
+    this.ipfsContext = ipfsContext;
+    this.ipfsScanSpec = ipfsScanSpec;
+    this.config = ipfsContext.getStoragePluginConfig();
+    logger.debug("GroupScan constructor called with columns {}", columns);
+    this.columns = columns == null || columns.size() == 0? ALL_COLUMNS : columns;
+    init();
+  }
+
+  private void init() {
+    IPFSHelper ipfsHelper = ipfsContext.getIPFSHelper();
+    endpointWorksMap = new HashMap<>();
+
+    Multihash topHash = ipfsScanSpec.getTargetHash(ipfsHelper);
+    try {
+      Map<Multihash, String> leafAddrMap = getLeafAddrMappings(topHash);
+      logger.debug("Iterating on {} leaves...", leafAddrMap.size());
+      ClusterCoordinator coordinator = ipfsContext.getStoragePlugin().getContext().getClusterCoordinator();
+      for (Multihash leaf : leafAddrMap.keySet()) {
+        String peerHostname = leafAddrMap.get(leaf);
+
+        Optional<DrillbitEndpoint> oep = coordinator.getAvailableEndpoints()
+            .stream()
+            .filter(a -> a.getAddress().equals(peerHostname))
+            .findAny();
+        DrillbitEndpoint ep;
+        if (oep.isPresent()) {
+          ep = oep.get();
+          logger.debug("Using existing endpoint {}", ep.getAddress());
+        } else {
+          logger.debug("created new endpoint on the fly {}", peerHostname);
+          //DRILL-7754: read ports & version info from IPFS instead of hard-coded
+          ep = DrillbitEndpoint.newBuilder()
+              .setAddress(peerHostname)
+              .setUserPort(31010)
+              .setControlPort(31011)
+              .setDataPort(31012)
+              .setHttpPort(8047)
+              .setVersion(DrillVersionInfo.getVersion())
+              .setState(DrillbitEndpoint.State.ONLINE)
+              .build();
+          //TODO how to safely remove endpoints that are no longer needed once the query is completed?
+          ClusterCoordinator.RegistrationHandle handle = coordinator.register(ep);
+        }
+
+        IPFSWork work = new IPFSWork(leaf.toBase58());
+        logger.debug("added endpoint {} to work {}", ep.getAddress(), work);
+        work.getByteMap().add(ep, DEFAULT_NODE_SIZE);
+        work.setOnEndpoint(ep);
+
+        if(endpointWorksMap.containsKey(ep.getAddress())) {
+          endpointWorksMap.get(ep.getAddress()).add(work);
+        } else {
+          List<IPFSWork> ipfsWorks = Lists.newArrayList();
+          ipfsWorks.add(work);
+          endpointWorksMap.put(ep.getAddress(), ipfsWorks);
+        }
+        ipfsWorkList.add(work);
+      }
+    }catch (Exception e) {
+      logger.debug("exception in init");
+      throw new RuntimeException(e);
+    }
+  }
+
+  Map<Multihash, String> getLeafAddrMappings(Multihash topHash) {
+    logger.debug("start to recursively expand nested IPFS hashes, topHash={}", topHash);
+    Stopwatch watch = Stopwatch.createStarted();
+    ForkJoinPool forkJoinPool = new ForkJoinPool(config.getNumWorkerThreads());
+    IPFSTreeFlattener topTask = new IPFSTreeFlattener(topHash, false, ipfsContext);
+    Map<Multihash, String> leafAddrMap = forkJoinPool.invoke(topTask);
+    logger.debug("Took {} ms to expand hash leaves", watch.elapsed(TimeUnit.MILLISECONDS));
+
+    return leafAddrMap;
+  }
+
+  private IPFSGroupScan(IPFSGroupScan that) {
+    super(that);
+    this.ipfsContext = that.ipfsContext;
+    this.ipfsScanSpec = that.ipfsScanSpec;
+    this.config = that.config;
+    this.assignments = that.assignments;
+    this.ipfsWorkList = that.ipfsWorkList;
+    this.endpointWorksMap = that.endpointWorksMap;
+    this.columns = that.columns;
+  }
+
+  @JsonProperty
+  public List<SchemaPath> getColumns() {
+    return columns;
+  }
+
+  @JsonIgnore
+  public IPFSStoragePlugin getStoragePlugin() {
+    return ipfsContext.getStoragePlugin();
+  }
+
+  @JsonProperty
+  public IPFSScanSpec getIPFSScanSpec() {
+    return ipfsScanSpec;
+  }
+
+  @Override
+  public List<EndpointAffinity> getOperatorAffinity() {
+    if (affinities == null) {
+      affinities = AffinityCreator.getAffinityMap(ipfsWorkList);
+    }
+    return affinities;
+  }
+
+  @Override
+  public int getMaxParallelizationWidth() {
+    DrillbitEndpoint myself = ipfsContext.getStoragePlugin().getContext().getEndpoint();
+    int width;
+    if (endpointWorksMap.containsKey(myself.getAddress())) {
+      // the foreman is also going to be a minor fragment worker under a UnionExchange operator
+      width = ipfsWorkList.size();
+    } else {
+      // the foreman does not hold data, so we have to force parallelization
+      // to make sure there is a UnionExchange operator
+      width = ipfsWorkList.size() + 1;
+    }
+    logger.debug("getMaxParallelizationWidth: {}", width);
+    return width;
+  }
+
+  @Override
+  public void applyAssignments(List<DrillbitEndpoint> incomingEndpoints) {
+    logger.debug("ipfsWorkList.size() = {}", ipfsWorkList.size());
+    logger.debug("endpointWorksMap: {}", endpointWorksMap);
+    if (endpointWorksMap.size() > 1) {
+      logger.debug("Use manual assignment");
+      assignments = ArrayListMultimap.create();
+      for (int fragmentId = 0; fragmentId < incomingEndpoints.size(); fragmentId++) {
+        String address = incomingEndpoints.get(fragmentId).getAddress();
+        if (endpointWorksMap.containsKey(address)) {
+          for (IPFSWork work : endpointWorksMap.get(address)) {
+            assignments.put(fragmentId, work);
+          }
+        }
+      }
+    } else {
+      logger.debug("Use AssignmentCreator");
+      assignments = AssignmentCreator.getMappings(incomingEndpoints, ipfsWorkList);
+    }
+  }
+
+  @Override
+  public IPFSSubScan getSpecificScan(int minorFragmentId) {
+    logger.debug(String.format("getSpecificScan: minorFragmentId = %d", minorFragmentId));
+    List<IPFSWork> workList = assignments.get(minorFragmentId);
+    logger.debug("workList == null: " + (workList == null? "true": "false"));
+    logger.debug(String.format("workList.size(): %d", workList.size()));
+
+    List<Multihash> scanSpecList = Lists.newArrayList();
+
+    for (IPFSWork work : workList) {
+      scanSpecList.add(work.getPartialRootHash());
+    }
+
+    return new IPFSSubScan(ipfsContext, scanSpecList, ipfsScanSpec.getFormatExtension(), columns);
+  }
+
+  @Override
+  public ScanStats getScanStats() {
+    long recordCount = 100000 * endpointWorksMap.size();
+    return new ScanStats(ScanStats.GroupScanProperty.NO_EXACT_ROW_COUNT, recordCount, 1, recordCount);
+  }
+
+  @Override
+  public IPFSGroupScan clone(List<SchemaPath> columns){
+    logger.debug("IPFSGroupScan clone {}", columns);
+    IPFSGroupScan cloned = new IPFSGroupScan(this);
+    cloned.columns = columns;
+    return cloned;
+  }
+
+  @Override
+  @JsonIgnore
+  public boolean canPushdownProjects(List<SchemaPath> columns) {
+    return true;
+  }
+
+  @Override
+  @JsonIgnore
+  public PhysicalOperator getNewWithChildren(List<PhysicalOperator> children) {
+    Preconditions.checkArgument(children.isEmpty());
+    logger.debug("getNewWithChildren called");
+    return new IPFSGroupScan(this);
+  }
+
+  @Override
+  public String getDigest() {
+    return toString();
+  }
+
+  @Override
+  public String toString() {
+    return new PlanStringBuilder(this)
+        .field("scan spec", ipfsScanSpec)
+        .field("columns", columns)
+        .toString();
+  }
+
+  private class IPFSWork implements CompleteWork {
+    private EndpointByteMapImpl byteMap = new EndpointByteMapImpl();
+    private Multihash partialRoot;
+    private DrillbitEndpoint onEndpoint = null;
+
+
+    public IPFSWork(String root) {
+      this.partialRoot = Multihash.fromBase58(root);
+    }
+
+    public IPFSWork(Multihash root) {
+      this.partialRoot = root;
+    }
+
+    public Multihash getPartialRootHash() {return partialRoot;}
+
+    public void setOnEndpoint(DrillbitEndpoint endpointAddress) {
+      this.onEndpoint = endpointAddress;
+    }
+
+    @Override
+    public long getTotalBytes() {
+      return DEFAULT_NODE_SIZE;
+    }
+
+    @Override
+    public EndpointByteMap getByteMap() {
+      return byteMap;
+    }
+
+    @Override
+    public int compareTo(CompleteWork o) {
+      return 0;
+    }
+
+    @Override
+    public String toString() {
+      return "IPFSWork [root = " + partialRoot.toString() + "]";
+    }
+  }
+
+  //DRILL-7756: detect and warn about loops/recursions in case of a malformed 
tree
+  static class IPFSTreeFlattener extends RecursiveTask<Map<Multihash, String>> {
+    private final Multihash hash;
+    private final boolean isProvider;
+    private final Map<Multihash, String> ret = new LinkedHashMap<>();
+    private final IPFSPeer myself;
+    private final IPFSHelper helper;
+    private final LoadingCache<Multihash, IPFSPeer> peerCache;
+
+    public IPFSTreeFlattener(Multihash hash, boolean isProvider, IPFSContext context) {
+      this(
+        hash,
+        isProvider,
+        context.getMyself(),
+        context.getIPFSHelper(),
+        context.getIPFSPeerCache()
+      );
+    }
+
+    IPFSTreeFlattener(Multihash hash, boolean isProvider, IPFSPeer myself, IPFSHelper ipfsHelper, LoadingCache<Multihash, IPFSPeer> peerCache) {
+      this.hash = hash;
+      this.isProvider = isProvider;
+      this.myself = myself;
+      this.helper = ipfsHelper;
+      this.peerCache = peerCache;
+    }
+
+    public IPFSTreeFlattener(IPFSTreeFlattener reference, Multihash hash, boolean isProvider) {
+      this(hash, isProvider, reference.myself, reference.helper, reference.peerCache);
+    }
+
+    @Override
+    public Map<Multihash, String> compute() {
+      try {
+        if (isProvider) {
+          IPFSPeer peer = peerCache.getUnchecked(hash);
+          ret.put(hash, peer.hasDrillbitAddress() ? peer.getDrillbitAddress().get() : null);

Review comment:
       Recommend an `isPresent()` check here to prevent errors.
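
       A minimal sketch of the suggested guard, assuming `getDrillbitAddress()` returns an `Optional<String>`:

       ```java
       // Check isPresent() before calling get() to avoid a
       // NoSuchElementException if the Optional is empty.
       Optional<String> addr = peer.getDrillbitAddress();
       ret.put(hash, addr.isPresent() ? addr.get() : null);
       // Equivalently: ret.put(hash, addr.orElse(null));
       ```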

##########
File path: 
contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSGroupScan.java
##########
@@ -0,0 +1,462 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+
+import com.fasterxml.jackson.annotation.JacksonInject;
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import io.ipfs.api.MerkleNode;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.PlanStringBuilder;
+import org.apache.drill.common.exceptions.ExecutionSetupException;
+import org.apache.drill.common.expression.SchemaPath;
+import org.apache.drill.common.util.DrillVersionInfo;
+import org.apache.drill.exec.coord.ClusterCoordinator;
+import org.apache.drill.exec.physical.EndpointAffinity;
+import org.apache.drill.exec.physical.base.AbstractGroupScan;
+import org.apache.drill.exec.physical.base.PhysicalOperator;
+import org.apache.drill.exec.physical.base.ScanStats;
+import org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint;
+import org.apache.drill.exec.store.StoragePluginRegistry;
+import org.apache.drill.exec.store.schedule.AffinityCreator;
+import org.apache.drill.exec.store.schedule.AssignmentCreator;
+import org.apache.drill.exec.store.schedule.CompleteWork;
+import org.apache.drill.exec.store.schedule.EndpointByteMap;
+import org.apache.drill.exec.store.schedule.EndpointByteMapImpl;
+import org.apache.drill.shaded.guava.com.google.common.base.Preconditions;
+import org.apache.drill.shaded.guava.com.google.common.base.Stopwatch;
+import org.apache.drill.shaded.guava.com.google.common.cache.LoadingCache;
+import org.apache.drill.shaded.guava.com.google.common.collect.ArrayListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
+import org.apache.drill.shaded.guava.com.google.common.collect.ListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.Lists;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Random;
+import java.util.concurrent.ForkJoinPool;
+import java.util.concurrent.RecursiveTask;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+
+
+@JsonTypeName("ipfs-scan")
+public class IPFSGroupScan extends AbstractGroupScan {
+  private static final Logger logger = LoggerFactory.getLogger(IPFSGroupScan.class);
+  private IPFSContext ipfsContext;
+  private IPFSScanSpec ipfsScanSpec;
+  private IPFSStoragePluginConfig config;
+  private List<SchemaPath> columns;
+
+  private static long DEFAULT_NODE_SIZE = 1000l;
+
+  private ListMultimap<Integer, IPFSWork> assignments;
+  private List<IPFSWork> ipfsWorkList = Lists.newArrayList();
+  private Map<String, List<IPFSWork>> endpointWorksMap;
+  private List<EndpointAffinity> affinities;
+
+  @JsonCreator
+  public IPFSGroupScan(@JsonProperty("IPFSScanSpec") IPFSScanSpec ipfsScanSpec,
+                       @JsonProperty("IPFSStoragePluginConfig") IPFSStoragePluginConfig ipfsStoragePluginConfig,
+                       @JsonProperty("columns") List<SchemaPath> columns,
+                       @JacksonInject StoragePluginRegistry pluginRegistry) throws IOException, ExecutionSetupException {
+    this(
+        ((IPFSStoragePlugin) pluginRegistry.getPlugin(ipfsStoragePluginConfig)).getIPFSContext(),
+        ipfsScanSpec,
+        columns
+    );
+  }
+
+  public IPFSGroupScan(IPFSContext ipfsContext,
+                       IPFSScanSpec ipfsScanSpec,
+                       List<SchemaPath> columns) {
+    super((String) null);
+    this.ipfsContext = ipfsContext;
+    this.ipfsScanSpec = ipfsScanSpec;
+    this.config = ipfsContext.getStoragePluginConfig();
+    logger.debug("GroupScan constructor called with columns {}", columns);
+    this.columns = columns == null || columns.size() == 0? ALL_COLUMNS : columns;
+    init();
+  }
+
+  private void init() {
+    IPFSHelper ipfsHelper = ipfsContext.getIPFSHelper();
+    endpointWorksMap = new HashMap<>();
+
+    Multihash topHash = ipfsScanSpec.getTargetHash(ipfsHelper);
+    try {
+      Map<Multihash, String> leafAddrMap = getLeafAddrMappings(topHash);
+      logger.debug("Iterating on {} leaves...", leafAddrMap.size());
+      ClusterCoordinator coordinator = ipfsContext.getStoragePlugin().getContext().getClusterCoordinator();
+      for (Multihash leaf : leafAddrMap.keySet()) {
+        String peerHostname = leafAddrMap.get(leaf);
+
+        Optional<DrillbitEndpoint> oep = coordinator.getAvailableEndpoints()
+            .stream()
+            .filter(a -> a.getAddress().equals(peerHostname))
+            .findAny();
+        DrillbitEndpoint ep;
+        if (oep.isPresent()) {
+          ep = oep.get();
+          logger.debug("Using existing endpoint {}", ep.getAddress());
+        } else {
+          logger.debug("created new endpoint on the fly {}", peerHostname);
+          //DRILL-7754: read ports & version info from IPFS instead of hard-coded
+          ep = DrillbitEndpoint.newBuilder()
+              .setAddress(peerHostname)
+              .setUserPort(31010)
+              .setControlPort(31011)
+              .setDataPort(31012)
+              .setHttpPort(8047)
+              .setVersion(DrillVersionInfo.getVersion())
+              .setState(DrillbitEndpoint.State.ONLINE)
+              .build();
+          //TODO how to safely remove endpoints that are no longer needed once the query is completed?
+          ClusterCoordinator.RegistrationHandle handle = coordinator.register(ep);
+        }
+
+        IPFSWork work = new IPFSWork(leaf.toBase58());
+        logger.debug("added endpoint {} to work {}", ep.getAddress(), work);
+        work.getByteMap().add(ep, DEFAULT_NODE_SIZE);
+        work.setOnEndpoint(ep);
+
+        if(endpointWorksMap.containsKey(ep.getAddress())) {
+          endpointWorksMap.get(ep.getAddress()).add(work);
+        } else {
+          List<IPFSWork> ipfsWorks = Lists.newArrayList();
+          ipfsWorks.add(work);
+          endpointWorksMap.put(ep.getAddress(), ipfsWorks);
+        }
+        ipfsWorkList.add(work);
+      }
+    }catch (Exception e) {
+      logger.debug("exception in init");
+      throw new RuntimeException(e);
+    }
+  }
+
+  Map<Multihash, String> getLeafAddrMappings(Multihash topHash) {
+    logger.debug("start to recursively expand nested IPFS hashes, topHash={}", topHash);
+    Stopwatch watch = Stopwatch.createStarted();
+    ForkJoinPool forkJoinPool = new ForkJoinPool(config.getNumWorkerThreads());
+    IPFSTreeFlattener topTask = new IPFSTreeFlattener(topHash, false, ipfsContext);
+    Map<Multihash, String> leafAddrMap = forkJoinPool.invoke(topTask);
+    logger.debug("Took {} ms to expand hash leaves", watch.elapsed(TimeUnit.MILLISECONDS));
+
+    return leafAddrMap;
+  }
+
+  private IPFSGroupScan(IPFSGroupScan that) {
+    super(that);
+    this.ipfsContext = that.ipfsContext;
+    this.ipfsScanSpec = that.ipfsScanSpec;
+    this.config = that.config;
+    this.assignments = that.assignments;
+    this.ipfsWorkList = that.ipfsWorkList;
+    this.endpointWorksMap = that.endpointWorksMap;
+    this.columns = that.columns;
+  }
+
+  @JsonProperty
+  public List<SchemaPath> getColumns() {
+    return columns;
+  }
+
+  @JsonIgnore
+  public IPFSStoragePlugin getStoragePlugin() {
+    return ipfsContext.getStoragePlugin();
+  }
+
+  @JsonProperty
+  public IPFSScanSpec getIPFSScanSpec() {
+    return ipfsScanSpec;
+  }
+
+  @Override
+  public List<EndpointAffinity> getOperatorAffinity() {
+    if (affinities == null) {
+      affinities = AffinityCreator.getAffinityMap(ipfsWorkList);
+    }
+    return affinities;
+  }
+
+  @Override
+  public int getMaxParallelizationWidth() {
+    DrillbitEndpoint myself = ipfsContext.getStoragePlugin().getContext().getEndpoint();
+    int width;
+    if (endpointWorksMap.containsKey(myself.getAddress())) {
+      // the foreman is also going to be a minor fragment worker under a UnionExchange operator
+      width = ipfsWorkList.size();
+    } else {
+      // the foreman does not hold data, so we have to force parallelization
+      // to make sure there is a UnionExchange operator
+      width = ipfsWorkList.size() + 1;
+    }
+    logger.debug("getMaxParallelizationWidth: {}", width);
+    return width;
+  }
+
+  @Override
+  public void applyAssignments(List<DrillbitEndpoint> incomingEndpoints) {
+    logger.debug("ipfsWorkList.size() = {}", ipfsWorkList.size());
+    logger.debug("endpointWorksMap: {}", endpointWorksMap);
+    if (endpointWorksMap.size() > 1) {
+      logger.debug("Use manual assignment");
+      assignments = ArrayListMultimap.create();
+      for (int fragmentId = 0; fragmentId < incomingEndpoints.size(); fragmentId++) {
+        String address = incomingEndpoints.get(fragmentId).getAddress();
+        if (endpointWorksMap.containsKey(address)) {
+          for (IPFSWork work : endpointWorksMap.get(address)) {
+            assignments.put(fragmentId, work);
+          }
+        }
+      }
+    } else {
+      logger.debug("Use AssignmentCreator");
+      assignments = AssignmentCreator.getMappings(incomingEndpoints, ipfsWorkList);
+    }
+  }
+
+  @Override
+  public IPFSSubScan getSpecificScan(int minorFragmentId) {
+    logger.debug(String.format("getSpecificScan: minorFragmentId = %d", minorFragmentId));
+    List<IPFSWork> workList = assignments.get(minorFragmentId);
+    logger.debug("workList == null: " + (workList == null? "true": "false"));
+    logger.debug(String.format("workList.size(): %d", workList.size()));
+
+    List<Multihash> scanSpecList = Lists.newArrayList();
+
+    for (IPFSWork work : workList) {
+      scanSpecList.add(work.getPartialRootHash());
+    }
+
+    return new IPFSSubScan(ipfsContext, scanSpecList, ipfsScanSpec.getFormatExtension(), columns);
+  }
+
+  @Override
+  public ScanStats getScanStats() {
+    long recordCount = 100000 * endpointWorksMap.size();
+    return new ScanStats(ScanStats.GroupScanProperty.NO_EXACT_ROW_COUNT, recordCount, 1, recordCount);
+  }
+
+  @Override
+  public IPFSGroupScan clone(List<SchemaPath> columns){
+    logger.debug("IPFSGroupScan clone {}", columns);
+    IPFSGroupScan cloned = new IPFSGroupScan(this);
+    cloned.columns = columns;
+    return cloned;
+  }
+
+  @Override
+  @JsonIgnore
+  public boolean canPushdownProjects(List<SchemaPath> columns) {
+    return true;
+  }
+
+  @Override
+  @JsonIgnore
+  public PhysicalOperator getNewWithChildren(List<PhysicalOperator> children) {
+    Preconditions.checkArgument(children.isEmpty());
+    logger.debug("getNewWithChildren called");
+    return new IPFSGroupScan(this);
+  }
+
+  @Override
+  public String getDigest() {
+    return toString();
+  }
+
+  @Override
+  public String toString() {
+    return new PlanStringBuilder(this)
+        .field("scan spec", ipfsScanSpec)
+        .field("columns", columns)
+        .toString();
+  }
+
+  private class IPFSWork implements CompleteWork {
+    private EndpointByteMapImpl byteMap = new EndpointByteMapImpl();

Review comment:
       These three variables can be `final`.
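
       For example (a sketch; `onEndpoint` could only be made `final` as well if it were assigned in the constructors rather than through `setOnEndpoint()`):

       ```java
       private final EndpointByteMapImpl byteMap = new EndpointByteMapImpl();
       private final Multihash partialRoot;
       private DrillbitEndpoint onEndpoint = null;
       ```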

##########
File path: 
contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSGroupScan.java
##########
@@ -0,0 +1,462 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+
+import com.fasterxml.jackson.annotation.JacksonInject;
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import io.ipfs.api.MerkleNode;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.PlanStringBuilder;
+import org.apache.drill.common.exceptions.ExecutionSetupException;
+import org.apache.drill.common.expression.SchemaPath;
+import org.apache.drill.common.util.DrillVersionInfo;
+import org.apache.drill.exec.coord.ClusterCoordinator;
+import org.apache.drill.exec.physical.EndpointAffinity;
+import org.apache.drill.exec.physical.base.AbstractGroupScan;
+import org.apache.drill.exec.physical.base.PhysicalOperator;
+import org.apache.drill.exec.physical.base.ScanStats;
+import org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint;
+import org.apache.drill.exec.store.StoragePluginRegistry;
+import org.apache.drill.exec.store.schedule.AffinityCreator;
+import org.apache.drill.exec.store.schedule.AssignmentCreator;
+import org.apache.drill.exec.store.schedule.CompleteWork;
+import org.apache.drill.exec.store.schedule.EndpointByteMap;
+import org.apache.drill.exec.store.schedule.EndpointByteMapImpl;
+import org.apache.drill.shaded.guava.com.google.common.base.Preconditions;
+import org.apache.drill.shaded.guava.com.google.common.base.Stopwatch;
+import org.apache.drill.shaded.guava.com.google.common.cache.LoadingCache;
+import org.apache.drill.shaded.guava.com.google.common.collect.ArrayListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
+import org.apache.drill.shaded.guava.com.google.common.collect.ListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.Lists;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Random;
+import java.util.concurrent.ForkJoinPool;
+import java.util.concurrent.RecursiveTask;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+
+
+@JsonTypeName("ipfs-scan")
+public class IPFSGroupScan extends AbstractGroupScan {
+  private static final Logger logger = LoggerFactory.getLogger(IPFSGroupScan.class);
+  private IPFSContext ipfsContext;
+  private IPFSScanSpec ipfsScanSpec;
+  private IPFSStoragePluginConfig config;
+  private List<SchemaPath> columns;
+
+  private static long DEFAULT_NODE_SIZE = 1000l;
+
+  private ListMultimap<Integer, IPFSWork> assignments;
+  private List<IPFSWork> ipfsWorkList = Lists.newArrayList();
+  private Map<String, List<IPFSWork>> endpointWorksMap;
+  private List<EndpointAffinity> affinities;
+
+  @JsonCreator
+  public IPFSGroupScan(@JsonProperty("IPFSScanSpec") IPFSScanSpec ipfsScanSpec,
+                       @JsonProperty("IPFSStoragePluginConfig") IPFSStoragePluginConfig ipfsStoragePluginConfig,
+                       @JsonProperty("columns") List<SchemaPath> columns,
+                       @JacksonInject StoragePluginRegistry pluginRegistry) throws IOException, ExecutionSetupException {
+    this(
+        ((IPFSStoragePlugin) pluginRegistry.getPlugin(ipfsStoragePluginConfig)).getIPFSContext(),

Review comment:
       Also, this method does not throw an `IOException`.  You can remove that if you want.
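
       A sketch of the trimmed declaration, assuming `getPlugin` only declares `ExecutionSetupException`:

       ```java
       @JsonCreator
       public IPFSGroupScan(@JsonProperty("IPFSScanSpec") IPFSScanSpec ipfsScanSpec,
                            @JsonProperty("IPFSStoragePluginConfig") IPFSStoragePluginConfig ipfsStoragePluginConfig,
                            @JsonProperty("columns") List<SchemaPath> columns,
                            @JacksonInject StoragePluginRegistry pluginRegistry) throws ExecutionSetupException {
         // constructor body unchanged
       }
       ```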




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

