[GitHub] drill pull request #865: DRILL-5634 - Add Crypto and Hash Functions

2017-07-13 Thread cgivre
Github user cgivre commented on a diff in the pull request:

https://github.com/apache/drill/pull/865#discussion_r127383405
  
--- Diff: exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/CryptoFunctions.java ---
@@ -0,0 +1,389 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to you under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.drill.exec.expr.fn.impl;
+
+import io.netty.buffer.DrillBuf;
+import org.apache.drill.exec.expr.DrillSimpleFunc;
+import org.apache.drill.exec.expr.annotations.FunctionTemplate;
+import org.apache.drill.exec.expr.annotations.Output;
+import org.apache.drill.exec.expr.annotations.Param;
+import org.apache.drill.exec.expr.annotations.Workspace;
+import org.apache.drill.exec.expr.holders.VarCharHolder;
+
+import javax.crypto.Cipher;
+import javax.crypto.spec.SecretKeySpec;
+import javax.inject.Inject;
+
+public class CryptoFunctions {
+  static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(CryptoFunctions.class);
+
+  private CryptoFunctions() {
+  }
+
+  /**
+   * This class returns the md2 digest of a given input string.
+   * Usage is SELECT md2( <input> ) FROM ...
+   */
+
+  @FunctionTemplate(name = "md2", scope = FunctionTemplate.FunctionScope.SIMPLE, nulls = FunctionTemplate.NullHandling.NULL_IF_NULL)
+  public static class MD2Function implements DrillSimpleFunc {
+
+    @Param
+    VarCharHolder rawInput;
+
+    @Output
+    VarCharHolder out;
+
+    @Inject
+    DrillBuf buffer;
+
+    @Override
+    public void setup() {
+    }
+
+    @Override
+    public void eval() {
+
+      String input = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(rawInput.start, rawInput.end, rawInput.buffer);
+      String outputString = org.apache.commons.codec.digest.DigestUtils.md2Hex(input).toLowerCase();
+
+      out.buffer = buffer;
+      out.start = 0;
+      out.end = outputString.getBytes().length;
+      buffer.setBytes(0, outputString.getBytes());
+    }
+
+  }
+
+  /**
+   * This function returns the MD5 digest of a given input string.
+   * Usage is shown below:
+   * select md5( 'testing' ) from (VALUES(1));
+   */
+
+  @FunctionTemplate(name = "md5", scope = FunctionTemplate.FunctionScope.SIMPLE, nulls = FunctionTemplate.NullHandling.NULL_IF_NULL)
+  public static class MD5Function implements DrillSimpleFunc {
+
+    @Param
+    VarCharHolder rawInput;
+
+    @Output
+    VarCharHolder out;
+
+    @Inject
+    DrillBuf buffer;
+
+    @Override
+    public void setup() {
+    }
+
+    @Override
+    public void eval() {
+
+      String input = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(rawInput.start, rawInput.end, rawInput.buffer);
+      String outputString = org.apache.commons.codec.digest.DigestUtils.md5Hex(input).toLowerCase();
+
+      out.buffer = buffer;
+      out.start = 0;
+      out.end = outputString.getBytes().length;
+      buffer.setBytes(0, outputString.getBytes());
+    }
+
+  }
+
+  /**
+   * sha() / sha1(): Calculates an SHA-1 160-bit checksum for the string, as described in RFC 3174 (Secure Hash Algorithm).
+   * (https://en.wikipedia.org/wiki/SHA-1) The value is returned as a string of 40 hexadecimal digits, or NULL if the argument was NULL.
+   * Note that sha() and sha1() are aliases for the same function.
+   *
+   * > select sha1( 'testing' ) from (VALUES(1));
+   */
+
+  @FunctionTemplate(names = {"sha", "sha1"}, scope = FunctionTemplate.FunctionScope.SIMPLE, nulls = FunctionTemplate.NullHandling.NULL_IF_NULL)
+  public static class SHA1Function implements DrillSimpleFunc {
+
+    @Param
+    VarCharHolder rawInput;
+
+    @Output
+    VarCharHolder out;
+
+    @Inject
+    DrillBuf buffer;
+
+    @Override

[GitHub] drill pull request #865: DRILL-5634 - Add Crypto and Hash Functions

2017-07-13 Thread cgivre
Github user cgivre commented on a diff in the pull request:

https://github.com/apache/drill/pull/865#discussion_r127383370
  
--- Diff: exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/CryptoFunctions.java ---
@@ -0,0 +1,389 @@

[GitHub] drill pull request #865: DRILL-5634 - Add Crypto and Hash Functions

2017-07-13 Thread cgivre
Github user cgivre commented on a diff in the pull request:

https://github.com/apache/drill/pull/865#discussion_r127383363
  
--- Diff: exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/CryptoFunctions.java ---
@@ -0,0 +1,389 @@

[GitHub] drill pull request #875: DRILL-5671 Set secure ACLs (Access Control List) fo...

2017-07-13 Thread bitblender
Github user bitblender commented on a diff in the pull request:

https://github.com/apache/drill/pull/875#discussion_r127367344
  
--- Diff: exec/java-exec/src/main/java/org/apache/drill/exec/coord/zk/ZKSecureACLProvider.java ---
@@ -0,0 +1,71 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.drill.exec.coord.zk;
+
+import com.google.common.collect.ImmutableList;
+import org.apache.curator.framework.api.ACLProvider;
+import org.apache.zookeeper.ZooDefs.Ids;
+import org.apache.zookeeper.data.ACL;
+
+import java.util.List;
+
+/**
+ * ZKSecureACLProvider restricts access to znodes created by Drill.
+ * The cluster discovery znode, i.e. the znode containing the list of Drillbits, is
+ * readable by anyone.
+ * For all other znodes, only the creator of the znode, i.e. the Drillbit user, has full access.
+ */
+
+public class ZKSecureACLProvider implements ACLProvider {
+
+    static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(ZKSecureACLProvider.class);
+    // Creator has full access
+    static ImmutableList<ACL> DEFAULT_ACL = new ImmutableList.Builder<ACL>()
+                                                .addAll(Ids.CREATOR_ALL_ACL.iterator())
+                                                .build();
+    // Creator has full access
+    // Everyone else has only read access
+    static ImmutableList<ACL> DRILL_CLUSTER_ACL = new ImmutableList.Builder<ACL>()
--- End diff --

Done




[GitHub] drill pull request #875: DRILL-5671 Set secure ACLs (Access Control List) fo...

2017-07-13 Thread bitblender
Github user bitblender commented on a diff in the pull request:

https://github.com/apache/drill/pull/875#discussion_r127367354
  
--- Diff: exec/java-exec/src/main/resources/drill-module.conf ---
@@ -112,7 +112,8 @@ drill.exec: {
     retry: {
       count: 7200,
       delay: 500
-    }
+    },
+    use.secure_acl: false
--- End diff --

Opened https://maprdrill.atlassian.net/browse/MD-2278




[GitHub] drill pull request #877: DRILL-5660: Drill 1.10 queries fail due to Parquet ...

2017-07-13 Thread paul-rogers
Github user paul-rogers commented on a diff in the pull request:

https://github.com/apache/drill/pull/877#discussion_r127365973
  
--- Diff: exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetGroupScan.java ---
@@ -723,19 +723,20 @@ private void init(MetadataContext metaContext) throws IOException {
         // if querying a single file we can look up the metadata directly from the file
         metaPath = new Path(p, Metadata.METADATA_FILENAME);
       }
-      if (metaPath != null && fs.exists(metaPath)) {
+      if (metaPath != null && fs.exists(metaPath) && Metadata.MetadataVersion.isVersionSupported(fs, metaPath)) {
         usedMetadataCache = true;
-        parquetTableMetadata = Metadata.readBlockMeta(fs, metaPath.toString(), metaContext, formatConfig);
+        parquetTableMetadata = Metadata.readBlockMeta(fs, metaPath, metaContext, formatConfig);
       } else {
         parquetTableMetadata = Metadata.getParquetTableMetadata(fs, p.toString(), formatConfig);
       }
     } else {
       Path p = Path.getPathWithoutSchemeAndAuthority(new Path(selectionRoot));
       metaPath = new Path(p, Metadata.METADATA_FILENAME);
-      if (fs.isDirectory(new Path(selectionRoot)) && fs.exists(metaPath)) {
+      if (fs.isDirectory(new Path(selectionRoot)) && fs.exists(metaPath)
--- End diff --

Can we factor out this code so that we do the checks in only one place? 
Something like a `boolean loadMetadata(...)` method that does all the checks 
mentioned above?

In general, repeated code = bad. Somebody has to test this code multiple 
times if it is repeated. Do the unit tests do so?
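
For illustration, a sketch of how that factored-out check might look (the method name `tryReadCachedMetadata` is hypothetical; `fs`, `metaContext`, and `formatConfig` follow the diff above):

```
// Hypothetical sketch only: one place for all metadata-cache checks.
private boolean tryReadCachedMetadata(Path metaPath, MetadataContext metaContext) throws IOException {
  if (metaPath == null || !fs.exists(metaPath)
      || !Metadata.MetadataVersion.isVersionSupported(fs, metaPath)) {
    return false;
  }
  usedMetadataCache = true;
  parquetTableMetadata = Metadata.readBlockMeta(fs, metaPath, metaContext, formatConfig);
  return true;
}
```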




[GitHub] drill pull request #877: DRILL-5660: Drill 1.10 queries fail due to Parquet ...

2017-07-13 Thread paul-rogers
Github user paul-rogers commented on a diff in the pull request:

https://github.com/apache/drill/pull/877#discussion_r127364108
  
--- Diff: exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/Metadata.java ---
@@ -658,11 +657,21 @@ private boolean tableModified(List directories, Path metaFilePath,
     return false;
   }
 
+  /**
+   * Basic class for parquet metadata. Inheritors of this class are json serializable structures of
+   * different versions metadata cache files.
+   *
--- End diff --

Sorry, nit. This is Javadoc and is HTML formatted. Please insert <p> to separate paragraphs.




[GitHub] drill pull request #877: DRILL-5660: Drill 1.10 queries fail due to Parquet ...

2017-07-13 Thread paul-rogers
Github user paul-rogers commented on a diff in the pull request:

https://github.com/apache/drill/pull/877#discussion_r127363955
  
--- Diff: exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/Metadata.java ---
@@ -557,7 +556,7 @@ private void readBlockMeta(String path,
     mapper.registerModule(serialModule);
     mapper.registerModule(module);
     mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
-    FSDataInputStream is = fs.open(p);
+    FSDataInputStream is = fs.open(path);
--- End diff --

Common practice is to enclose this in a try-with-resources block, assuming 
that this stream derives from `AutoCloseable`. Else, we should add our own 
`try` block and close the file in `finally`.
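
A minimal sketch of the try-with-resources form, assuming the stream is consumed by the Jackson `mapper` shown in the diff (`FSDataInputStream` is `Closeable`, so it qualifies; the target type here is illustrative):

```
// Sketch: the stream is closed automatically, even if readValue() throws.
try (FSDataInputStream is = fs.open(path)) {
  parquetTableMetadata = mapper.readValue(is, Metadata.ParquetTableMetadataBase.class);
}
```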




[GitHub] drill pull request #877: DRILL-5660: Drill 1.10 queries fail due to Parquet ...

2017-07-13 Thread paul-rogers
Github user paul-rogers commented on a diff in the pull request:

https://github.com/apache/drill/pull/877#discussion_r127364840
  
--- Diff: exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/Metadata.java ---
@@ -1851,9 +1860,81 @@ private static String relativize(String baseDir, String childPath) {
           .relativize(fullPathWithoutSchemeAndAuthority.toUri()));
       if (relativeFilePath.isAbsolute()) {
         throw new IllegalStateException(String.format("Path %s is not a subpath of %s.",
-            basePathWithoutSchemeAndAuthority.toUri().toString(), fullPathWithoutSchemeAndAuthority.toUri().toString()));
+            basePathWithoutSchemeAndAuthority.toUri().getPath(), fullPathWithoutSchemeAndAuthority.toUri().getPath()));
+      }
+      return relativeFilePath.toUri().getPath();
+    }
+  }
+
+  /**
+   * Used to identify metadata version by deserializing the first property,
+   * "metadata_version", from the metadata cache file
+   */
+  public static class MetadataVersion {
+    @JsonProperty("metadata_version")
+    public String textVersion;
+
+    /**
+     * Supported metadata versions.
+     * Note: keep them synchronized with {@link ParquetTableMetadataBase} versions
+     */
+    enum Versions {
+      v1(Constants.V1),
+      v2(Constants.V2),
+      v3(Constants.V3),
+      v3_1(Constants.V3_1);
+
+      private final String version;
+
+      Versions(String version) {
+        this.version = version;
+      }
+
+      public String getVersion() {
+        return version;
+      }
+
+      public static Versions fromString(String version) {
+        for (Versions v : Versions.values()) {
+          if (v.version.equalsIgnoreCase(version)) {
+            return v;
+          }
+        }
+        return null;
+      }
+
+      public static class Constants {
+        public static final String V1 = "v1";
+        public static final String V2 = "v2";
+        public static final String V3 = "v3";
+        public static final String V3_1 = "v3_1";
+      }
+    }
+
+    /**
--- End diff --

One very handy thing to do for each version constant is to list what 
changed, possibly including the JIRA number for more information:

```
/**
 * Version 3.1: Changes the xyz property.
 * File names stored as relative paths.
 * See DRILL-1234.
 */
```




[GitHub] drill pull request #877: DRILL-5660: Drill 1.10 queries fail due to Parquet ...

2017-07-13 Thread paul-rogers
Github user paul-rogers commented on a diff in the pull request:

https://github.com/apache/drill/pull/877#discussion_r127365808
  
--- Diff: exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetGroupScan.java ---
@@ -723,19 +723,20 @@ private void init(MetadataContext metaContext) throws IOException {
         // if querying a single file we can look up the metadata directly from the file
         metaPath = new Path(p, Metadata.METADATA_FILENAME);
       }
-      if (metaPath != null && fs.exists(metaPath)) {
+      if (metaPath != null && fs.exists(metaPath) && Metadata.MetadataVersion.isVersionSupported(fs, metaPath)) {
--- End diff --

Nit. No need to check `fs.exists` here as we, in effect, check this when 
trying to open the file. Handle the file not found exception instead.

Note that Drill is a multi-user system. Even with the `exists` check, there 
is still a race condition where the file can exist during the check, but be 
deleted by the time we try to open the file. So, since we have to check on open 
anyway, no need to check here.
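
A sketch of the exception-based alternative (structure hypothetical; the fallback mirrors the `else` branch in the diff):

```
// Sketch: let open() detect a missing file; no racy pre-check.
try {
  parquetTableMetadata = Metadata.readBlockMeta(fs, metaPath, metaContext, formatConfig);
  usedMetadataCache = true;
} catch (java.io.FileNotFoundException e) {
  parquetTableMetadata = Metadata.getParquetTableMetadata(fs, p.toString(), formatConfig);
}
```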




[GitHub] drill pull request #877: DRILL-5660: Drill 1.10 queries fail due to Parquet ...

2017-07-13 Thread paul-rogers
Github user paul-rogers commented on a diff in the pull request:

https://github.com/apache/drill/pull/877#discussion_r127365148
  
--- Diff: exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/Metadata.java ---
@@ -1851,9 +1860,81 @@ private static String relativize(String baseDir, String childPath) {
+    /**
+     * @param fs current file system
+     * @param path of metadata cache file
+     * @return true if metadata version is supported, false otherwise
+     * @throws IOException if parquet metadata can't be deserialized from the json file
+     */
+    public static boolean isVersionSupported(FileSystem fs, Path path) throws IOException {
--- End diff --

Not now, but in the future, separate the Jackson-serialized persistence 
class from the code which works with that data. The persistence class just has 
the fields for the data and simple related operations. Code that works with the 
data should be in a separate class.
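
A sketch of the suggested split (class and method names hypothetical): the persistence class keeps only the Jackson-mapped fields, while the version logic lives elsewhere:

```
// Persistence class: data fields only, Jackson-serializable.
public static class MetadataVersion {
  @JsonProperty("metadata_version")
  public String textVersion;
}

// Separate class for the code that works with that data.
public class MetadataVersionUtil {
  public static boolean isSupported(String textVersion) {
    return Versions.fromString(textVersion) != null;  // Versions enum from the diff above
  }
}
```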




[GitHub] drill pull request #877: DRILL-5660: Drill 1.10 queries fail due to Parquet ...

2017-07-13 Thread paul-rogers
Github user paul-rogers commented on a diff in the pull request:

https://github.com/apache/drill/pull/877#discussion_r127366251
  
--- Diff: exec/java-exec/src/test/java/org/apache/drill/exec/store/parquet/TestParquetMetadataCache.java ---
@@ -446,10 +447,25 @@ public void testMetadataCacheAbsolutePaths() throws Exception {
 }
   }
 
+  @Test
--- End diff --

Tests for the version checks? A bit tricky because you have to create a 
file and "predict" the next version.

For this, you need a way to get the current version from your `Version` 
enum. Then do some simple logic to try:

* Add .1 to version number: 3.1 becomes 3.2.
* Move to next whole version number: 3.1 becomes 4.

Then, create a file with that version and check how this version of the 
code handles a future version of the file.

To be thorough, also create a file that contains a field that Jackson 
cannot deserialize. Be sure we properly handle a "hard" version incompatibility 
as discussed earlier.
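
A sketch of the version arithmetic described above, assuming version strings shaped like the `v3_1` constants in the diff:

```
// Hypothetical test helper: derive "future" versions from the current one.
String current = "v3_1";                          // e.g. taken from the Versions enum
String[] parts = current.substring(1).split("_"); // "v3_1" -> ["3", "1"]
int major = Integer.parseInt(parts[0]);
int minor = parts.length > 1 ? Integer.parseInt(parts[1]) : 0;
String nextMinor = "v" + major + "_" + (minor + 1); // v3_1 -> v3_2
String nextMajor = "v" + (major + 1);               // v3_1 -> v4
```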




[GitHub] drill pull request #877: DRILL-5660: Drill 1.10 queries fail due to Parquet ...

2017-07-13 Thread paul-rogers
Github user paul-rogers commented on a diff in the pull request:

https://github.com/apache/drill/pull/877#discussion_r127365605
  
--- Diff: exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/Metadata.java ---
@@ -1851,9 +1860,81 @@ private static String relativize(String baseDir, String childPath) {
+    /**
+     * @param fs current file system
+     * @param path of metadata cache file
+     * @return true if metadata version is supported, false otherwise
+     * @throws IOException if parquet metadata can't be deserialized from the json file
+     */
+    public static boolean isVersionSupported(FileSystem fs, Path path) throws IOException {
--- End diff --

Here, we can see the reason for the separation. We now open each file 
twice: once to check the version, another time to deserialize if the version is 
OK. Better to just deserialize the file. There would be two cases.

* Minor change: the current deserializer can read the file. (This is the 
case for file version <= code version.) Can also be the case, as here, when the 
file version bumps without adding new fields.
* Major change: the deserialization fails with a Jackson exception. This 
tells us we cannot read the file because we don't recognize the format. This 
should only be the case when file version > code version.

In either case, we can attempt to deserialize the file:

* Deserialize file.
* If error occurs, we don't support the file format.
* If OK, but file version is newer than code version, we don't support.
* If OK, and file version = code version, this is the normal path.
* If OK, but file version < code version, then some special fix-up may be 
needed to convert the deserialized data to current format.

Given this, we don't really need a zillion version classes. We have one 
class that handles the logic for the current version. We have a deserializer 
class that handles the above (including any needed data updates.) And, we have 
the serialized data class.
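
A sketch of the single-pass flow described above (the version accessor and fix-up helper are hypothetical):

```
// Deserialize once; decide afterwards based on the embedded version.
try {
  ParquetTableMetadataBase metadata = mapper.readValue(is, ParquetTableMetadataBase.class);
  MetadataVersion.Versions fileVersion =
      MetadataVersion.Versions.fromString(metadata.getMetadataVersion()); // hypothetical accessor
  if (fileVersion == null) {
    return null;                  // file version > code version: not supported
  }
  return upgradeIfNeeded(metadata, fileVersion); // hypothetical fix-up for older formats
} catch (com.fasterxml.jackson.core.JsonProcessingException e) {
  return null;                    // "hard" incompatibility: format not recognized
}
```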




[GitHub] drill pull request #860: DRILL-5601: Rollup of external sort fixes and impro...

2017-07-13 Thread Ben-Zvi
Github user Ben-Zvi commented on a diff in the pull request:

https://github.com/apache/drill/pull/860#discussion_r127360560
  
--- Diff: exec/java-exec/src/main/java/org/apache/drill/exec/physical/impl/spill/SpillSet.java ---
@@ -413,7 +419,12 @@ public SpillSet(DrillConfig config, FragmentHandle handle, PhysicalOperator popC
       fileManager = new HadoopFileManager(spillFs);
     }
 
-    spillDirName = String.format("%s_%s_%s-%s_minor%s", QueryIdHelper.getQueryId(handle.getQueryId()),
+    // If provided with a prefix to identify the Drillbit, prepend that to the
+    // spill directory.
+
+    spillDirName = String.format("%s%s_%s_%s-%s-%s",
+        ep == null ? "" : ep.getAddress() + "-" + ep.getUserPort() + "/",
--- End diff --

Any chance the address could be ""? Then the directory name would start with a "-", which could cause some trouble (probably only for access from the shell, so OK).
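
A sketch of a defensive guard for that case, using the names from the diff above:

```
// Sketch: skip the prefix entirely when no usable address is available.
String dirPrefix = (ep == null || ep.getAddress() == null || ep.getAddress().isEmpty())
    ? "" : ep.getAddress() + "-" + ep.getUserPort() + "/";
```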





[GitHub] drill pull request #860: DRILL-5601: Rollup of external sort fixes and impro...

2017-07-13 Thread Ben-Zvi
Github user Ben-Zvi commented on a diff in the pull request:

https://github.com/apache/drill/pull/860#discussion_r127358527
  
--- Diff: exec/vector/src/main/java/org/apache/drill/exec/vector/AllocationHelper.java ---
@@ -26,14 +26,16 @@ public static void allocate(ValueVector v, int valueCount, int bytesPerValue) {
     allocate(v, valueCount, bytesPerValue, 5);
   }
 
-  public static void allocatePrecomputedChildCount(ValueVector v, int valueCount, int bytesPerValue, int childValCount){
-    if(v instanceof FixedWidthVector) {
+  public static void allocatePrecomputedChildCount(ValueVector v, int valueCount, int bytesPerValue, int childValCount) {
--- End diff --

Discouraged: single letter parameter/field names. Even **"vv"** would be 
easier to search, if needed.





[GitHub] drill pull request #860: DRILL-5601: Rollup of external sort fixes and impro...

2017-07-13 Thread Ben-Zvi
Github user Ben-Zvi commented on a diff in the pull request:

https://github.com/apache/drill/pull/860#discussion_r127357111
  
--- Diff: exec/java-exec/src/test/java/org/apache/drill/test/ClientFixture.java ---
@@ -231,4 +236,120 @@ public void setControls(String controls) {
   public RowSetBuilder rowSetBuilder(BatchSchema schema) {
     return new RowSetBuilder(allocator(), schema);
   }
+
+  /**
+   * Very simple parser for semi-colon separated lists of SQL statements which
+   * handles quoted semicolons. Drill can execute only one statement at a time
+   * (without a trailing semi-colon.) This parser breaks up a statement list
+   * into single statements. Input:
+   * USE a.b;
+   * ALTER SESSION SET `foo` = ";";
+   * SELECT * FROM bar WHERE x = "\";";
+   * Output:
+   *
+   * USE a.b
+   * ALTER SESSION SET `foo` = ";"
+   * SELECT * FROM bar WHERE x = "\";"
+   */
+
+  public static class StatementParser {
+    private final Reader in;
+    private StringBuilder buf;
+
+    public StatementParser(Reader in) {
+      this.in = in;
+    }
+
+    public String parseNext() throws IOException {
+      boolean eof = false;
+      buf = new StringBuilder();
+      outer:
+      for (;;) {
+        int c = in.read();
+        if (c == -1) {
+          eof = true;
+          break;
+        }
+        if (c == ';') {
+          break;
+        }
+        buf.append((char) c);
+        if (c == '"' || c == '\'' || c == '`') {
+          int quote = c;
+          boolean escape = false;
+          for (;;) {
+            c = in.read();
+            if (c == -1) {
+              break outer;

Note: this code would return a truncated SQL statement (non-matched quote), as it appears in the input.
Option: throw some user exception here.
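
A sketch of that option, in the style of the parser above:

```
// Sketch: fail loudly instead of returning a truncated statement.
if (c == -1) {
  throw new IllegalArgumentException(
      "Unterminated quote in SQL statement: " + buf.toString());
}
```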




[GitHub] drill pull request #860: DRILL-5601: Rollup of external sort fixes and impro...

2017-07-13 Thread Ben-Zvi
Github user Ben-Zvi commented on a diff in the pull request:

https://github.com/apache/drill/pull/860#discussion_r127351398
  
--- Diff: exec/java-exec/src/main/resources/drill-module.conf ---
@@ -239,9 +239,7 @@ drill.exec: {
     external: {
       // Drill uses the managed External Sort Batch by default.
       // Set this to true to use the legacy, unmanaged version.
-      // Disabled in the initial commit, to be enabled after
-      // tests are committed.
-      disable_managed: true,
+      disable_managed: false,
--- End diff --

Back to "true" 





[jira] [Resolved] (DRILL-5445) Assertion Error in Managed External Sort when dealing with repeated maps

2017-07-13 Thread Paul Rogers (JIRA)

 [ https://issues.apache.org/jira/browse/DRILL-5445?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

Paul Rogers resolved DRILL-5445.

Resolution: Done

> Assertion Error in Managed External Sort when dealing with repeated maps
> 
>
> Key: DRILL-5445
> URL: https://issues.apache.org/jira/browse/DRILL-5445
> Project: Apache Drill
>  Issue Type: Bug
>Affects Versions: 1.10.0
>Reporter: Rahul Challapalli
>Assignee: Paul Rogers
> Attachments: 27004a3c-c53d-52d1-c7ed-4beb563447f9.sys.drill, 
> drillbit.log
>
>
> git.commit.id.abbrev=3e8b01d
> The below query fails with an Assertion Error (I am running with assertions 
> enabled)
> {code}
> ALTER SESSION SET `exec.sort.disable_managed` = false;
> alter session set `planner.width.max_per_node` = 1;
> alter session set `planner.disable_exchanges` = true;
> alter session set `planner.width.max_per_query` = 1;
> alter session set `planner.memory.max_query_memory_per_node` = 152428800;
> select count(*) from (
> select * from (
> select event_info.uid, transaction_info.trans_id, event_info.event.evnt_id
> from (
>  select userinfo.transaction.trans_id trans_id, 
> max(userinfo.event.event_time) max_event_time
>  from (
>  select uid, flatten(events) event, flatten(transactions) transaction 
> from dfs.`/drill/testdata/resource-manager/nested-large.json`
>  ) userinfo
>  where userinfo.transaction.trans_time >= userinfo.event.event_time
>  group by userinfo.transaction.trans_id
> ) transaction_info
> inner join
> (
>  select uid, flatten(events) event
>  from dfs.`/drill/testdata/resource-manager/nested-large.json`
> ) event_info
> on transaction_info.max_event_time = event_info.event.event_time) d order by 
> features[0].type) d1 where d1.uid < -1;
> {code}
> Below is the error from the logs
> {code}
> [Error Id: 26983344-dee3-4a33-8508-ad125f01fee6 on qa-node190.qa.lab:31010]
> at 
> org.apache.drill.common.exceptions.UserException$Builder.build(UserException.java:544)
>  ~[drill-common-1.11.0-SNAPSHOT.jar:1.11.0-SNAPSHOT]
> at 
> org.apache.drill.exec.work.fragment.FragmentExecutor.sendFinalState(FragmentExecutor.java:295)
>  [drill-java-exec-1.11.0-SNAPSHOT.jar:1.11.0-SNAPSHOT]
> at 
> org.apache.drill.exec.work.fragment.FragmentExecutor.cleanup(FragmentExecutor.java:160)
>  [drill-java-exec-1.11.0-SNAPSHOT.jar:1.11.0-SNAPSHOT]
> at 
> org.apache.drill.exec.work.fragment.FragmentExecutor.run(FragmentExecutor.java:264)
>  [drill-java-exec-1.11.0-SNAPSHOT.jar:1.11.0-SNAPSHOT]
> at 
> org.apache.drill.common.SelfCleaningRunnable.run(SelfCleaningRunnable.java:38)
>  [drill-common-1.11.0-SNAPSHOT.jar:1.11.0-SNAPSHOT]
> at 
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
>  [na:1.7.0_111]
> at 
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
>  [na:1.7.0_111]
> at java.lang.Thread.run(Thread.java:745) [na:1.7.0_111]
> Caused by: java.lang.RuntimeException: java.lang.AssertionError
> at 
> org.apache.drill.common.DeferredException.addThrowable(DeferredException.java:101)
>  ~[drill-common-1.11.0-SNAPSHOT.jar:1.11.0-SNAPSHOT]
> at 
> org.apache.drill.exec.work.fragment.FragmentExecutor.fail(FragmentExecutor.java:409)
>  [drill-java-exec-1.11.0-SNAPSHOT.jar:1.11.0-SNAPSHOT]
> at 
> org.apache.drill.exec.work.fragment.FragmentExecutor.run(FragmentExecutor.java:250)
>  [drill-java-exec-1.11.0-SNAPSHOT.jar:1.11.0-SNAPSHOT]
> ... 4 common frames omitted
> Caused by: java.lang.AssertionError: null
> at 
> org.apache.drill.exec.vector.complex.RepeatedMapVector.load(RepeatedMapVector.java:444)
>  ~[vector-1.11.0-SNAPSHOT.jar:1.11.0-SNAPSHOT]
> at 
> org.apache.drill.exec.cache.VectorAccessibleSerializable.readFromStream(VectorAccessibleSerializable.java:118)
>  ~[drill-java-exec-1.11.0-SNAPSHOT.jar:1.11.0-SNAPSHOT]
> at 
> org.apache.drill.exec.physical.impl.xsort.managed.BatchGroup$SpilledRun.getBatch(BatchGroup.java:222)
>  ~[drill-java-exec-1.11.0-SNAPSHOT.jar:1.11.0-SNAPSHOT]
> at 
> org.apache.drill.exec.physical.impl.xsort.managed.BatchGroup$SpilledRun.getNextIndex(BatchGroup.java:196)
>  ~[drill-java-exec-1.11.0-SNAPSHOT.jar:1.11.0-SNAPSHOT]
> at 
> org.apache.drill.exec.test.generated.PriorityQueueCopierGen23.setup(PriorityQueueCopierTemplate.java:60)
>  ~[na:na]
> at 
> org.apache.drill.exec.physical.impl.xsort.managed.CopierHolder.createCopier(CopierHolder.java:116)
>  ~[drill-java-exec-1.11.0-SNAPSHOT.jar:1.11.0-SNAPSHOT]
> at 
> org.apache.drill.exec.physical.impl.xsort.managed.CopierHolder.access$200(CopierHolder.java:45)
>  ~[drill-java-exec-1.11.0-SNAPSHOT.jar:1.11.0-SNAPSHOT]
> at 
> 

[GitHub] drill pull request #860: DRILL-5601: Rollup of external sort fixes and impro...

2017-07-13 Thread Ben-Zvi
Github user Ben-Zvi commented on a diff in the pull request:

https://github.com/apache/drill/pull/860#discussion_r127345445
  
--- Diff: exec/vector/src/main/java/org/apache/drill/exec/vector/complex/BaseRepeatedValueVector.java ---
@@ -210,13 +212,15 @@ protected void replaceDataVector(ValueVector v) {
   }
 
   @Override
-  public int getAllocatedByteCount() {
-    return offsets.getAllocatedByteCount() + vector.getAllocatedByteCount();
+  public void getLedgers(Set<BufferLedger> ledgers) {
--- End diff --

This method looks more like a "set" than a "get".





[GitHub] drill pull request #860: DRILL-5601: Rollup of external sort fixes and impro...

2017-07-13 Thread Ben-Zvi
Github user Ben-Zvi commented on a diff in the pull request:

https://github.com/apache/drill/pull/860#discussion_r127071648
  
--- Diff: exec/java-exec/src/main/java/org/apache/drill/exec/record/SmartAllocationHelper.java ---
@@ -0,0 +1,156 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.drill.exec.record;
+
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Map.Entry;
+
+import org.apache.drill.exec.vector.AllocationHelper;
+import org.apache.drill.exec.vector.ValueVector;
+import org.apache.drill.exec.vector.complex.AbstractMapVector;
+import org.apache.drill.exec.vector.complex.RepeatedMapVector;
+
+/**
+ * Prototype mechanism to allocate vectors based on expected
+ * data sizes. This version uses a name-based map of fields
+ * to sizes. Better to represent the batch structurally and
+ * simply iterate over the schema rather than doing a per-field
+ * lookup. But, the mechanisms needed to do the efficient solution
+ * don't exist yet.
+ */
+
+public class SmartAllocationHelper {
+
+  public static class AllocationHint {
+    public final int entryWidth;
+    public final int elementCount;
+
+    private AllocationHint(int width, int elements) {
+      entryWidth = width;
+      elementCount = elements;
+    }
+
+    @Override
+    public String toString() {
+      StringBuilder buf = new StringBuilder()
+          .append("{");
+      boolean comma = false;
+      if (entryWidth > 0) {
+        buf.append("width=")
+           .append(entryWidth);
+        comma = true;
+      }
+      if (elementCount > 0) {
+        if (comma) {
+          buf.append(", ");
+        }
+        buf.append("elements=")
+           .append(elementCount);
+      }
+      buf.append("}");
+      return buf.toString();
+    }
+  }
+
+  private Map<String, AllocationHint> hints = new HashMap<>();
+
+  public void variableWidth(String name, int width) {
+    hints.put(name, new AllocationHint(width, 1));
+  }
+
+  public void fixedWidthArray(String name, int elements) {
+    hints.put(name, new AllocationHint(0, elements));
+  }
+
+  public void variableWidthArray(String name, int width, int elements) {
+    hints.put(name, new AllocationHint(width, elements));
+  }
+
+  public void allocateBatch(VectorAccessible va, int recordCount) {
+    for (VectorWrapper<?> w : va) {
+      allocateVector(w.getValueVector(), "", recordCount);
+    }
+  }
+
+  private void allocateVector(ValueVector vector, String prefix, int recordCount) {
--- End diff --

Can this method be renamed to **allocateVectorOrMap()** ?




[GitHub] drill pull request #860: DRILL-5601: Rollup of external sort fixes and impro...

2017-07-13 Thread Ben-Zvi
Github user Ben-Zvi commented on a diff in the pull request:

https://github.com/apache/drill/pull/860#discussion_r124132387
  
--- Diff: exec/java-exec/src/main/java/org/apache/drill/exec/physical/impl/xsort/managed/PriorityQueueCopierWrapper.java ---
@@ -245,29 +250,35 @@ private BatchMerger(PriorityQueueCopierWrapper holder, BatchSchema schema, List<
 
     @Override
     public boolean next() {
-      Stopwatch w = Stopwatch.createStarted();
       long start = holder.getAllocator().getAllocatedMemory();
+
+      // Allocate an outgoing container the "dumb" way (based on static sizes)
+      // for testing, or the "smart" way (based on actual observed data sizes)
+      // for production code.
+
+      if (allocHelper == null) {
+        VectorAccessibleUtilities.allocateVectors(outputContainer, targetRecordCount);
+      } else {
+        allocHelper.allocateBatch(outputContainer, targetRecordCount);
+      }
+      logger.trace("Initial output batch allocation: {} bytes",
+          holder.getAllocator().getAllocatedMemory() - start);
+      Stopwatch w = Stopwatch.createStarted();
       int count = holder.copier.next(targetRecordCount);
-      copyCount += count;
       if (count > 0) {
         long t = w.elapsed(TimeUnit.MICROSECONDS);
         batchCount++;
-        logger.trace("Took {} us to merge {} records", t, count);
         long size = holder.getAllocator().getAllocatedMemory() - start;
+        logger.trace("Took {} us to merge {} records, consuming {} bytes of memory",
+            t, count, size);
         estBatchSize = Math.max(estBatchSize, size);
       } else {
         logger.trace("copier returned 0 records");
       }
 
-      // Identify the schema to be used in the output container. (Since
-      // all merged batches have the same schema, the schema we identify
-      // here should be the same as that which we already had.
+      // Initialize output container metadata.
--- End diff --

Why remove the original comments? They still look valid.





[GitHub] drill pull request #860: DRILL-5601: Rollup of external sort fixes and impro...

2017-07-13 Thread Ben-Zvi
Github user Ben-Zvi commented on a diff in the pull request:

https://github.com/apache/drill/pull/860#discussion_r124904394
  
--- Diff: exec/java-exec/src/main/java/org/apache/drill/exec/physical/impl/xsort/managed/SortMemoryManager.java ---
@@ -391,84 +601,93 @@ public MergeTask(MergeAction action, int count) {
 
   public MergeTask consolidateBatches(long allocMemory, int inMemCount, int spilledRunsCount) {
--- End diff --

Add JavaDoc for consolidateBatches() 
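
For illustration, one possible shape for that Javadoc (the wording is a suggestion inferred from the parameter names, not the author's):

```
/**
 * Decides whether batches must be consolidated before merging and, if so,
 * how: spill batches held in memory, or merge runs already spilled to disk.
 *
 * @param allocMemory      memory currently allocated by this operator
 * @param inMemCount       number of batches held in memory
 * @param spilledRunsCount number of runs already spilled to disk
 * @return the merge action to perform, with an associated count
 */
```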





[GitHub] drill pull request #860: DRILL-5601: Rollup of external sort fixes and impro...

2017-07-13 Thread Ben-Zvi
Github user Ben-Zvi commented on a diff in the pull request:

https://github.com/apache/drill/pull/860#discussion_r124430031
  
--- Diff: exec/java-exec/src/main/java/org/apache/drill/exec/physical/impl/xsort/managed/SortMemoryManager.java ---
@@ -19,7 +19,125 @@
 
 import com.google.common.annotations.VisibleForTesting;
 
+/**
+ * Computes the memory needs for input batches, spill batches and merge
+ * batches. The key challenges that this code tries to overcome are:
+ * <ul>
+ * <li>Drill is not designed for the small memory allocations,
+ * but the planner may provide such allocations because the memory per
+ * query is divided among slices (minor fragments) and among buffering
+ * operators, leaving very little per operator.</li>
+ * <li>Drill does not provide the detailed memory information needed to
+ * carefully manage memory in tight constraints.</li>
+ * <li>But, Drill has a death penalty for going over the memory limit.</li>
+ * </ul>
+ * As a result, this class is a bit of a hack: it attempts to consider a
+ * number of ill-defined factors in order to divide up memory use in a
+ * way that prevents OOM errors.
+ * <p>
+ * First, it is necessary to differentiate two concepts:
+ * <ul>
+ * <li>The data size of a batch: the amount of memory needed to hold
+ * the data itself. The data size is constant for any given batch.</li>
+ * <li>The buffer size of the buffers that hold the data. The buffer
+ * size varies wildly depending on how the batch was produced.</li>
+ * </ul>
+ * The three kinds of buffer layouts seen to date include:
+ * <ul>
+ * <li>One buffer per vector component (data, offsets, null flags, etc.)
+ * -- created by readers, project and other operators.</li>
+ * <li>One buffer for the entire batch, with each vector component using
+ * a slice of the overall buffer -- the case for batches deserialized from
+ * exchanges.</li>
+ * <li>One buffer for each top-level vector, with component vectors
+ * using slices of the overall vector buffer -- the result of reading
+ * spilled batches from disk.</li>
+ * </ul>
+ * In each case, buffer sizes are power-of-two rounded from the data size.
+ * But since the data is grouped differently in each case, the resulting buffer
+ * sizes vary considerably.
+ * <p>
+ * As a result, we can never be sure of the amount of memory needed for a
+ * batch. So, we have to estimate based on a number of factors:
+ * <ul>
+ * <li>Uses the {@link RecordBatchSizer} to estimate the data size and
+ * buffer size of each incoming batch.</li>
+ * <li>Estimates the internal fragmentation due to power-of-two rounding.</li>
+ * <li>Configured preferences for spill and output batches.</li>
+ * </ul>
+ * The code handles "normal" and "low" memory conditions.
+ * <ul>
+ * <li>In normal memory, we simply work out the number of preferred-size
+ * batches that fit in memory (based on the predicted buffer size.)</li>
+ * <li>In low memory, we divide up the available memory to produce the
+ * spill and merge batch sizes. The sizes will be less than the configured
+ * preference.</li>
+ * </ul>
+ */
+
 public class SortMemoryManager {
+  static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(ExternalSortBatch.class);
+
+  /**
+   * Estimate for typical internal fragmentation in a buffer due to power-of-two
+   * rounding on vectors.
+   * <p>
+   * <pre>
+   * [|__$__]</pre>
+   * In the above, the brackets represent the whole vector. The
+   * first half is always full. When the first half filled, the second
+   * half was allocated. On average, the second half will be half full.
+   * This means that, on average, 1/4 of the allocated space is
+   * unused (the definition of internal fragmentation.)
+   */
+
+  public static final double INTERNAL_FRAGMENTATION_ESTIMATE = 1.0/4.0;
+
+  /**
+   * Given a buffer, this is the assumed amount of space
+   * available for data. (Adding more will double the buffer
+   * size half the time.)
+   */
+
+  public static final double PAYLOAD_MULTIPLIER = 1 - INTERNAL_FRAGMENTATION_ESTIMATE;
+
+  /**
+   * Given a data size, this is the multiplier to create the buffer
+   * size estimate. (Note: since we work with aggregate batches, we
+   * cannot simply round up to the next power of two: rounding is done
+   * on a vector-by-vector basis. Here we need to estimate the aggregate
+   * effect of rounding.)
+   */
+
+  public static final double BUFFER_MULTIPLIER = 1/PAYLOAD_MULTIPLIER;
+  public static final double WORST_CASE_MULTIPLIER = 2.0;
+
+  public static final int MIN_ROWS_PER_SORT_BATCH = 100;
--- End diff --

Possibly make the min rows a configurable option, so could be tested with 
different values.
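
A sketch of what that might look like, assuming a hypothetical config key (the existing `drill.exec.sort.external` block suggests the naming; this is not an existing Drill setting):

```
// Hypothetical option key, falling back to the current constant when unset.
public static final String MIN_ROWS_KEY = "drill.exec.sort.external.min_rows_per_batch";

this.minRowsPerSortBatch = config.hasPath(MIN_ROWS_KEY)
    ? config.getInt(MIN_ROWS_KEY) : MIN_ROWS_PER_SORT_BATCH;
```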



[GitHub] drill pull request #860: DRILL-5601: Rollup of external sort fixes and impro...

2017-07-13 Thread Ben-Zvi
Github user Ben-Zvi commented on a diff in the pull request:

https://github.com/apache/drill/pull/860#discussion_r127067063
  
--- Diff: exec/java-exec/src/main/java/org/apache/drill/exec/record/SmartAllocationHelper.java ---
@@ -0,0 +1,156 @@
+    @Override
+    public String toString() {
+      StringBuilder buf = new StringBuilder()
--- End diff --

This whole method can be written in only 4 lines of code:

if ( entryWidth > 0 && elementCount > 0 ) { return String.format("{width=%d, elements=%d}", entryWidth, elementCount); }
if ( entryWidth > 0 ) { return String.format("{width=%d}", entryWidth); }
if ( elementCount > 0 ) { return String.format("{elements=%d}", elementCount); }
return "{}";

or

String ew = ( entryWidth > 0 ) ? String.format("width=%d", entryWidth) : "";
String comma = ( entryWidth > 0 && elementCount > 0 ) ? ", " : "";
String ec = ( elementCount > 0 ) ? String.format("elements=%d", elementCount) : "";
return "{" + ew + comma + ec + "}";

The above can even be pushed into a single line !!! A la APL :-) 


---


[GitHub] drill pull request #860: DRILL-5601: Rollup of external sort fixes and impro...

2017-07-13 Thread Ben-Zvi
Github user Ben-Zvi commented on a diff in the pull request:

https://github.com/apache/drill/pull/860#discussion_r127345692
  
--- Diff: 
exec/vector/src/main/java/org/apache/drill/exec/vector/complex/ListVector.java 
---
@@ -323,12 +325,15 @@ public void setValueCount(int valueCount) {
   }
 
   @Override
-  public int getAllocatedByteCount() {
-return offsets.getAllocatedByteCount() + bits.getAllocatedByteCount() 
+ super.getAllocatedByteCount();
+  public void getLedgers(Set ledgers) {
--- End diff --

Same comment - the name says "get", but the method actually populates the given set.
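
A rename along those lines, as a sketch: collectLedgers is a made-up name, and
the body is inferred from the removed getAllocatedByteCount() above, not shown
in the diff:

@Override
public void collectLedgers(Set ledgers) {
  // "Collect" says what happens: the method adds this vector's ledgers
  // to the caller-supplied set.
  offsets.collectLedgers(ledgers);
  bits.collectLedgers(ledgers);
  super.collectLedgers(ledgers);
}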



---


[GitHub] drill pull request #860: DRILL-5601: Rollup of external sort fixes and impro...

2017-07-13 Thread Ben-Zvi
Github user Ben-Zvi commented on a diff in the pull request:

https://github.com/apache/drill/pull/860#discussion_r124909795
  
--- Diff: 
exec/java-exec/src/main/java/org/apache/drill/exec/physical/impl/xsort/managed/SorterWrapper.java
 ---
@@ -83,10 +83,9 @@ private SingleBatchSorter newSorter(VectorAccessible 
batch) {
 ClassGenerator g = cg.getRoot();
 cg.plainJavaCapable(true);
 // Uncomment out this line to debug the generated code.
-  cg.saveCodeForDebugging(true);
+cg.saveCodeForDebugging(true);
--- End diff --

This line should be commented out again.
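
I.e., leave the line in the form its own comment describes:

// Uncomment out this line to debug the generated code.
// cg.saveCodeForDebugging(true);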


---


[GitHub] drill pull request #860: DRILL-5601: Rollup of external sort fixes and impro...

2017-07-13 Thread Ben-Zvi
Github user Ben-Zvi commented on a diff in the pull request:

https://github.com/apache/drill/pull/860#discussion_r127344297
  
--- Diff: exec/vector/src/main/codegen/templates/VariableLengthVectors.java 
---
@@ -247,27 +249,26 @@ public void copyEntry(int toIndex, ValueVector from, 
int fromIndex) {
   }
 
   @Override
-  public int getAllocatedByteCount() {
-return offsetVector.getAllocatedByteCount() + 
super.getAllocatedByteCount();
+  public void getLedgers(Set ledgers) {
+offsetVector.getLedgers(ledgers);
+super.getLedgers(ledgers);
   }
 
   @Override
-  public int getPayloadByteCount() {
-UInt${type.width}Vector.Accessor a = offsetVector.getAccessor();
-int count = a.getValueCount();
-if (count == 0) {
+  public int getPayloadByteCount(int valueCount) {
+if (valueCount == 0) {
   return 0;
-} else {
-  // If 1 or more values, then the last value is set to
-  // the offset of the next value, which is the same as
-  // the length of existing values.
-  // In addition to the actual data bytes, we must also
-  // include the "overhead" bytes: the offset vector entries
-  // that accompany each column value. Thus, total payload
-  // size is consumed text bytes + consumed offset vector
-  // bytes.
-  return a.get(count-1) + offsetVector.getPayloadByteCount();
 }
+// If 1 or more values, then the last value is set to
+// the offset of the next value, which is the same as
+// the length of existing values.
+// In addition to the actual data bytes, we must also
+// include the "overhead" bytes: the offset vector entries
+// that accompany each column value. Thus, total payload
+// size is consumed text bytes + consumed offset vector
+// bytes.
+return offsetVector.getAccessor().get(valueCount) +
--- End diff --

Should this be **(valueCount - 1)** ??



---


[GitHub] drill pull request #860: DRILL-5601: Rollup of external sort fixes and impro...

2017-07-13 Thread Ben-Zvi
Github user Ben-Zvi commented on a diff in the pull request:

https://github.com/apache/drill/pull/860#discussion_r127346284
  
--- Diff: 
exec/vector/src/main/java/org/apache/drill/exec/vector/complex/RepeatedMapVector.java
 ---
@@ -598,7 +604,8 @@ public void clear() {
   }
 
   @Override
-  public int getAllocatedByteCount() {
-return super.getAllocatedByteCount( ) + 
offsets.getAllocatedByteCount();
+  public void getLedgers(Set ledgers) {
--- End diff --

again 


---


[GitHub] drill pull request #860: DRILL-5601: Rollup of external sort fixes and impro...

2017-07-13 Thread Ben-Zvi
Github user Ben-Zvi commented on a diff in the pull request:

https://github.com/apache/drill/pull/860#discussion_r127077314
  
--- Diff: 
exec/java-exec/src/main/java/org/apache/drill/exec/physical/impl/xsort/managed/PriorityQueueCopierWrapper.java
 ---
@@ -100,10 +102,12 @@ private PriorityQueueCopier 
newCopier(VectorAccessible batch) {
* @param batchGroupList
* @param outputContainer
* @param targetRecordCount
+   * @param allocHelper
* @return
*/
-  public BatchMerger startMerge(BatchSchema schema, List batchGroupList, VectorContainer outputContainer, int 
targetRecordCount) {
-return new BatchMerger(this, schema, batchGroupList, outputContainer, 
targetRecordCount);
+  public BatchMerger startMerge(BatchSchema schema, List batchGroupList,
--- End diff --

How about creating another **startMerge(...)** without the last **allocHelper**
parameter, which would then call **BatchMerger(..., null)**; this new overload
would be used in places like TestCopier.java, instead of adding a last null
parameter there.
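
A minimal sketch of that overload (the list's generic parameter is elided in
the quoted diff, so it is elided here too):

public BatchMerger startMerge(BatchSchema schema, List batchGroupList,
    VectorContainer outputContainer, int targetRecordCount) {
  // Delegate to the full version with no allocation helper.
  return startMerge(schema, batchGroupList, outputContainer, targetRecordCount, null);
}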


---


[GitHub] drill pull request #860: DRILL-5601: Rollup of external sort fixes and impro...

2017-07-13 Thread Ben-Zvi
Github user Ben-Zvi commented on a diff in the pull request:

https://github.com/apache/drill/pull/860#discussion_r123834281
  
--- Diff: 
exec/java-exec/src/main/java/org/apache/drill/exec/physical/impl/spill/RecordBatchSizer.java
 ---
@@ -19,117 +19,162 @@
 
 import java.util.ArrayList;
 import java.util.List;
+import java.util.Set;
 
+import org.apache.drill.common.types.TypeProtos.DataMode;
 import org.apache.drill.common.types.TypeProtos.MinorType;
 import org.apache.drill.exec.expr.TypeHelper;
+import org.apache.drill.exec.memory.AllocationManager.BufferLedger;
 import org.apache.drill.exec.memory.BaseAllocator;
 import org.apache.drill.exec.record.BatchSchema;
 import org.apache.drill.exec.record.MaterializedField;
 import org.apache.drill.exec.record.RecordBatch;
+import org.apache.drill.exec.record.SmartAllocationHelper;
 import org.apache.drill.exec.record.VectorAccessible;
 import org.apache.drill.exec.record.VectorWrapper;
 import org.apache.drill.exec.record.selection.SelectionVector2;
+import org.apache.drill.exec.vector.UInt4Vector;
 import org.apache.drill.exec.vector.ValueVector;
 import org.apache.drill.exec.vector.complex.AbstractMapVector;
+import org.apache.drill.exec.vector.complex.RepeatedValueVector;
 import org.apache.drill.exec.vector.VariableWidthVector;
 
+import com.google.common.collect.Sets;
+
 /**
  * Given a record batch or vector container, determines the actual memory
  * consumed by each column, the average row, and the entire record batch.
  */
 
 public class RecordBatchSizer {
-//  private static final org.slf4j.Logger logger = 
org.slf4j.LoggerFactory.getLogger(RecordBatchSizer.class);
 
   /**
* Column size information.
*/
   public static class ColumnSize {
+public final String prefix;
 public final MaterializedField metadata;
 
 /**
- * Assumed size from Drill metadata.
+ * Assumed size from Drill metadata. Note that this information is
+ * 100% bogus. Do not use it.
  */
 
+@Deprecated
 public int stdSize;
 
 /**
- * Actual memory consumed by all the vectors associated with this 
column.
- */
-
-public int totalSize;
-
-/**
  * Actual average column width as determined from actual memory use. 
This
  * size is larger than the actual data size since this size includes 
per-
  * column overhead such as any unused vector space, etc.
  */
 
-public int estSize;
-public int capacity;
-public int density;
-public int dataSize;
-public boolean variableWidth;
-
-public ColumnSize(ValueVector vv) {
-  metadata = vv.getField();
-  stdSize = TypeHelper.getSize(metadata.getType());
-
-  // Can't get size estimates if this is an empty batch.
-  int rowCount = vv.getAccessor().getValueCount();
-  if (rowCount == 0) {
-estSize = stdSize;
-return;
-  }
+public final int estSize;
+public final int valueCount;
+public final int entryCount;
+public final int dataSize;
+public final int estElementCount;
+public final boolean isVariableWidth;
 
-  // Total size taken by all vectors (and underlying buffers)
-  // associated with this vector.
+public ColumnSize(ValueVector v, String prefix, int valueCount) {
+  this.prefix = prefix;
+  this.valueCount = valueCount;
+  metadata = v.getField();
+  boolean isMap = v.getField().getType().getMinorType() == 
MinorType.MAP;
 
-  totalSize = vv.getAllocatedByteCount();
+  // The amount of memory consumed by the payload: the actual
+  // data stored in the vectors.
 
-  // Capacity is the number of values that the vector could
-  // contain. This is useful only for fixed-length vectors.
+  int mapSize = 0;
+  if (v.getField().getDataMode() == DataMode.REPEATED) {
 
-  capacity = vv.getValueCapacity();
+// Repeated vectors are special: they have an associated offset 
vector
+// that changes the value count of the contained vectors.
 
-  // The amount of memory consumed by the payload: the actual
-  // data stored in the vectors.
+@SuppressWarnings("resource")
+UInt4Vector offsetVector = ((RepeatedValueVector) 
v).getOffsetVector();
+entryCount = offsetVector.getAccessor().get(valueCount);
+estElementCount = roundUp(entryCount, valueCount);
+if (isMap) {
 
-  dataSize = vv.getPayloadByteCount();
+  // For map, the only data associated with the map vector
+  // itself is 

[GitHub] drill pull request #860: DRILL-5601: Rollup of external sort fixes and impro...

2017-07-13 Thread Ben-Zvi
Github user Ben-Zvi commented on a diff in the pull request:

https://github.com/apache/drill/pull/860#discussion_r124435170
  
--- Diff: 
exec/java-exec/src/main/java/org/apache/drill/exec/physical/impl/xsort/managed/SortMemoryManager.java
 ---
@@ -312,52 +488,66 @@ private void adjustForLowMemory() {
* one spill batch to make progress.
*/
 
-  private void lowMemorySpillBatchSize() {
+  private void lowMemoryInternalBatchSizes() {
 
 // The "expected" size is with power-of-two rounding in some vectors.
 // We later work backwards to the row count assuming average internal
 // fragmentation.
 
-// Must hold two input batches. Use (most of) the rest for the spill 
batch.
+// Must hold two input batches. Use half of the rest for the spill 
batch.
+// In a really bad case, the number here may be negative. We'll fix
+// it below.
 
-expectedSpillBatchSize = (int) (memoryLimit - 2 * estimatedInputSize);
+int spillBufferSize = (int) (memoryLimit - 2 * inputBatchSize.maxBufferSize) / 2;
--- End diff --

(1) Spilled "buffer" size or spilled "batch" size?
(2) Where is the possible negative number fixed? Looks like
setFromWorstCase() would set all three fields to negative numbers.
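
A sketch of the kind of guard the question is after; illustrative only, and the
256 KiB floor is an assumption, not something from the PR:

// Clamp so a tight memory limit cannot yield a negative spill buffer size.
static int spillBufferSize(long memoryLimit, int maxInputBufferSize) {
  long available = memoryLimit - 2L * maxInputBufferSize;
  return (int) Math.max(available / 2, 256 * 1024);
}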


---


[GitHub] drill pull request #860: DRILL-5601: Rollup of external sort fixes and impro...

2017-07-13 Thread Ben-Zvi
Github user Ben-Zvi commented on a diff in the pull request:

https://github.com/apache/drill/pull/860#discussion_r123838906
  
--- Diff: 
exec/java-exec/src/main/java/org/apache/drill/exec/physical/impl/spill/RecordBatchSizer.java
 ---
@@ -19,117 +19,162 @@
 
 import java.util.ArrayList;
 import java.util.List;
+import java.util.Set;
 
+import org.apache.drill.common.types.TypeProtos.DataMode;
 import org.apache.drill.common.types.TypeProtos.MinorType;
 import org.apache.drill.exec.expr.TypeHelper;
+import org.apache.drill.exec.memory.AllocationManager.BufferLedger;
 import org.apache.drill.exec.memory.BaseAllocator;
 import org.apache.drill.exec.record.BatchSchema;
 import org.apache.drill.exec.record.MaterializedField;
 import org.apache.drill.exec.record.RecordBatch;
+import org.apache.drill.exec.record.SmartAllocationHelper;
 import org.apache.drill.exec.record.VectorAccessible;
 import org.apache.drill.exec.record.VectorWrapper;
 import org.apache.drill.exec.record.selection.SelectionVector2;
+import org.apache.drill.exec.vector.UInt4Vector;
 import org.apache.drill.exec.vector.ValueVector;
 import org.apache.drill.exec.vector.complex.AbstractMapVector;
+import org.apache.drill.exec.vector.complex.RepeatedValueVector;
 import org.apache.drill.exec.vector.VariableWidthVector;
 
+import com.google.common.collect.Sets;
+
 /**
  * Given a record batch or vector container, determines the actual memory
  * consumed by each column, the average row, and the entire record batch.
  */
 
 public class RecordBatchSizer {
-//  private static final org.slf4j.Logger logger = 
org.slf4j.LoggerFactory.getLogger(RecordBatchSizer.class);
 
   /**
* Column size information.
*/
   public static class ColumnSize {
+public final String prefix;
 public final MaterializedField metadata;
 
 /**
- * Assumed size from Drill metadata.
+ * Assumed size from Drill metadata. Note that this information is
+ * 100% bogus. Do not use it.
  */
 
+@Deprecated
 public int stdSize;
 
 /**
- * Actual memory consumed by all the vectors associated with this 
column.
- */
-
-public int totalSize;
-
-/**
  * Actual average column width as determined from actual memory use. 
This
  * size is larger than the actual data size since this size includes 
per-
  * column overhead such as any unused vector space, etc.
  */
 
-public int estSize;
-public int capacity;
-public int density;
-public int dataSize;
-public boolean variableWidth;
-
-public ColumnSize(ValueVector vv) {
-  metadata = vv.getField();
-  stdSize = TypeHelper.getSize(metadata.getType());
-
-  // Can't get size estimates if this is an empty batch.
-  int rowCount = vv.getAccessor().getValueCount();
-  if (rowCount == 0) {
-estSize = stdSize;
-return;
-  }
+public final int estSize;
+public final int valueCount;
+public final int entryCount;
+public final int dataSize;
+public final int estElementCount;
+public final boolean isVariableWidth;
 
-  // Total size taken by all vectors (and underlying buffers)
-  // associated with this vector.
+public ColumnSize(ValueVector v, String prefix, int valueCount) {
+  this.prefix = prefix;
+  this.valueCount = valueCount;
+  metadata = v.getField();
+  boolean isMap = v.getField().getType().getMinorType() == 
MinorType.MAP;
 
-  totalSize = vv.getAllocatedByteCount();
+  // The amount of memory consumed by the payload: the actual
+  // data stored in the vectors.
 
-  // Capacity is the number of values that the vector could
-  // contain. This is useful only for fixed-length vectors.
+  int mapSize = 0;
+  if (v.getField().getDataMode() == DataMode.REPEATED) {
 
-  capacity = vv.getValueCapacity();
+// Repeated vectors are special: they have an associated offset 
vector
+// that changes the value count of the contained vectors.
 
-  // The amount of memory consumed by the payload: the actual
-  // data stored in the vectors.
+@SuppressWarnings("resource")
+UInt4Vector offsetVector = ((RepeatedValueVector) 
v).getOffsetVector();
+entryCount = offsetVector.getAccessor().get(valueCount);
+estElementCount = roundUp(entryCount, valueCount);
+if (isMap) {
 
-  dataSize = vv.getPayloadByteCount();
+  // For map, the only data associated with the map vector
+  // itself is 

[GitHub] drill pull request #860: DRILL-5601: Rollup of external sort fixes and impro...

2017-07-13 Thread Ben-Zvi
Github user Ben-Zvi commented on a diff in the pull request:

https://github.com/apache/drill/pull/860#discussion_r127345763
  
--- Diff: 
exec/vector/src/main/java/org/apache/drill/exec/vector/complex/RepeatedListVector.java
 ---
@@ -435,20 +438,18 @@ public void copyEntry(int toIndex, ValueVector from, 
int fromIndex) {
 copyFromSafe(fromIndex, toIndex, (RepeatedListVector) from);
   }
 
-  @Override
-  public int getAllocatedByteCount() {
-return delegate.getAllocatedByteCount();
+  public void getLedgers(Set ledgers) {
--- End diff --

again 


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastruct...@apache.org or file a JIRA ticket
with INFRA.
---


[GitHub] drill pull request #860: DRILL-5601: Rollup of external sort fixes and impro...

2017-07-13 Thread Ben-Zvi
Github user Ben-Zvi commented on a diff in the pull request:

https://github.com/apache/drill/pull/860#discussion_r124437209
  
--- Diff: 
exec/java-exec/src/main/java/org/apache/drill/exec/physical/impl/xsort/managed/SortMemoryManager.java
 ---
@@ -97,24 +215,43 @@
 
   private SortConfig config;
 
-  private int estimatedInputSize;
-
   private boolean potentialOverflow;
 
-  public SortMemoryManager(SortConfig config, long memoryLimit) {
+  private boolean isLowMemory;
+
+  private boolean performanceWarning;
+
+  public SortMemoryManager(SortConfig config, long opMemoryLimit) {
 this.config = config;
 
 // The maximum memory this operator can use as set by the
 // operator definition (propagated to the allocator.)
 
 if (config.maxMemory() > 0) {
--- End diff --

Since opMemoryLimit > 0, it can be put in the if() condition instead of 0, hence
no need for the Math.min(). (Also, the whole statement can be written in a
single code line using ?: .)
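
One possible reading of that suggestion, as a sketch (the memoryLimit field
name is an assumption; the diff does not show the statement being assigned):

// Use the configured cap only when it is positive and below the operator
// limit; otherwise fall back to the operator limit.
memoryLimit = (config.maxMemory() > 0 && config.maxMemory() < opMemoryLimit)
    ? config.maxMemory() : opMemoryLimit;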



---


[GitHub] drill pull request #860: DRILL-5601: Rollup of external sort fixes and impro...

2017-07-13 Thread Ben-Zvi
Github user Ben-Zvi commented on a diff in the pull request:

https://github.com/apache/drill/pull/860#discussion_r124106839
  
--- Diff: 
exec/java-exec/src/main/java/org/apache/drill/exec/physical/impl/spill/RecordBatchSizer.java
 ---
@@ -189,30 +238,29 @@ public RecordBatchSizer(VectorAccessible va) {
   public RecordBatchSizer(VectorAccessible va, SelectionVector2 sv2) {
 rowCount = va.getRecordCount();
 for (VectorWrapper vw : va) {
-  int size = measureColumn(vw.getValueVector());
-  if ( size > maxSize ) { maxSize = size; }
-  if ( vw.getField().isNullable() ) { numNullables++; }
+  measureColumn(vw.getValueVector(), "", rowCount);
+}
+
+for (BufferLedger ledger : ledgers) {
+  accountedMemorySize += ledger.getAccountedSize();
 }
 
 if (rowCount > 0) {
-  grossRowWidth = roundUp(totalBatchSize, rowCount);
+  grossRowWidth = roundUp(accountedMemorySize, rowCount);
 }
 
 if (sv2 != null) {
   sv2Size = sv2.getBuffer(false).capacity();
-  grossRowWidth += roundUp(sv2Size, rowCount);
-  netRowWidth += 2;
+  accountedMemorySize += sv2Size;
 }
 
-int totalDensity = 0;
-int usableCount = 0;
-for (ColumnSize colSize : columnSizes) {
-  if ( colSize.density > 0 ) {
-usableCount++;
-  }
-  totalDensity += colSize.density;
-}
-avgDensity = roundUp(totalDensity, usableCount);
+computeEstimates();
+  }
+
+  private void computeEstimates() {
+grossRowWidth = roundUp(accountedMemorySize, rowCount);
+netRowWidth = roundUp(netBatchSize, rowCount);
+avgDensity = roundUp(netBatchSize * 100, accountedMemorySize);
   }
 
   public void applySv2() {
--- End diff --

hasSv2 is never set, hence always `false`. So when this method is called (from
analyzeIncomingBatch()), it increases the accountedMemorySize even though there
is no SV2!!
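
A minimal sketch of the fix this implies, setting the flag where the SV2 is
sized (field and method names come from the quoted diff; the exact placement
is an assumption):

if (sv2 != null) {
  hasSv2 = true;  // record that this batch carries a selection vector
  sv2Size = sv2.getBuffer(false).capacity();
  accountedMemorySize += sv2Size;
}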



---


[GitHub] drill pull request #860: DRILL-5601: Rollup of external sort fixes and impro...

2017-07-13 Thread Ben-Zvi
Github user Ben-Zvi commented on a diff in the pull request:

https://github.com/apache/drill/pull/860#discussion_r124121514
  
--- Diff: 
exec/java-exec/src/main/java/org/apache/drill/exec/physical/impl/xsort/MSortTemplate.java
 ---
@@ -92,8 +93,9 @@ public void setup(final FragmentContext context, final 
BufferAllocator allocator
* @return
*/
   public static long memoryNeeded(final int recordCount) {
-// We need 4 bytes (SV4) for each record.
-return recordCount * 4;
+// We need 4 bytes (SV4) for each record, power of 2 rounded.
--- End diff --

Does the implementation of __memoryNeeded()__ in
SortRecordBatchBuilder.java also need to be similarly modified?
  And a minor comment: would the name __memoryNeededForSV4()__ be more
descriptive?
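
For reference, the rounded version presumably looks something like this; the
diff is truncated before the return statement, and BaseAllocator.nextPowerOfTwo
is an assumption about the helper used:

public static long memoryNeeded(final int recordCount) {
  // We need 4 bytes (SV4) for each record, power of 2 rounded.
  return BaseAllocator.nextPowerOfTwo(recordCount * 4);
}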


---


[GitHub] drill pull request #860: DRILL-5601: Rollup of external sort fixes and impro...

2017-07-13 Thread Ben-Zvi
Github user Ben-Zvi commented on a diff in the pull request:

https://github.com/apache/drill/pull/860#discussion_r123844716
  
--- Diff: 
exec/java-exec/src/main/java/org/apache/drill/exec/physical/impl/spill/RecordBatchSizer.java
 ---
@@ -19,117 +19,162 @@
 
 import java.util.ArrayList;
 import java.util.List;
+import java.util.Set;
 
+import org.apache.drill.common.types.TypeProtos.DataMode;
 import org.apache.drill.common.types.TypeProtos.MinorType;
 import org.apache.drill.exec.expr.TypeHelper;
+import org.apache.drill.exec.memory.AllocationManager.BufferLedger;
 import org.apache.drill.exec.memory.BaseAllocator;
 import org.apache.drill.exec.record.BatchSchema;
 import org.apache.drill.exec.record.MaterializedField;
 import org.apache.drill.exec.record.RecordBatch;
+import org.apache.drill.exec.record.SmartAllocationHelper;
 import org.apache.drill.exec.record.VectorAccessible;
 import org.apache.drill.exec.record.VectorWrapper;
 import org.apache.drill.exec.record.selection.SelectionVector2;
+import org.apache.drill.exec.vector.UInt4Vector;
 import org.apache.drill.exec.vector.ValueVector;
 import org.apache.drill.exec.vector.complex.AbstractMapVector;
+import org.apache.drill.exec.vector.complex.RepeatedValueVector;
 import org.apache.drill.exec.vector.VariableWidthVector;
 
+import com.google.common.collect.Sets;
+
 /**
  * Given a record batch or vector container, determines the actual memory
  * consumed by each column, the average row, and the entire record batch.
  */
 
 public class RecordBatchSizer {
-//  private static final org.slf4j.Logger logger = 
org.slf4j.LoggerFactory.getLogger(RecordBatchSizer.class);
 
   /**
* Column size information.
*/
   public static class ColumnSize {
+public final String prefix;
 public final MaterializedField metadata;
 
 /**
- * Assumed size from Drill metadata.
+ * Assumed size from Drill metadata. Note that this information is
+ * 100% bogus. Do not use it.
  */
 
+@Deprecated
 public int stdSize;
 
 /**
- * Actual memory consumed by all the vectors associated with this 
column.
- */
-
-public int totalSize;
-
-/**
  * Actual average column width as determined from actual memory use. 
This
  * size is larger than the actual data size since this size includes 
per-
  * column overhead such as any unused vector space, etc.
  */
 
-public int estSize;
-public int capacity;
-public int density;
-public int dataSize;
-public boolean variableWidth;
-
-public ColumnSize(ValueVector vv) {
-  metadata = vv.getField();
-  stdSize = TypeHelper.getSize(metadata.getType());
-
-  // Can't get size estimates if this is an empty batch.
-  int rowCount = vv.getAccessor().getValueCount();
-  if (rowCount == 0) {
-estSize = stdSize;
-return;
-  }
+public final int estSize;
+public final int valueCount;
+public final int entryCount;
+public final int dataSize;
+public final int estElementCount;
+public final boolean isVariableWidth;
 
-  // Total size taken by all vectors (and underlying buffers)
-  // associated with this vector.
+public ColumnSize(ValueVector v, String prefix, int valueCount) {
+  this.prefix = prefix;
+  this.valueCount = valueCount;
+  metadata = v.getField();
+  boolean isMap = v.getField().getType().getMinorType() == 
MinorType.MAP;
 
-  totalSize = vv.getAllocatedByteCount();
+  // The amount of memory consumed by the payload: the actual
+  // data stored in the vectors.
 
-  // Capacity is the number of values that the vector could
-  // contain. This is useful only for fixed-length vectors.
+  int mapSize = 0;
+  if (v.getField().getDataMode() == DataMode.REPEATED) {
 
-  capacity = vv.getValueCapacity();
+// Repeated vectors are special: they have an associated offset 
vector
+// that changes the value count of the contained vectors.
 
-  // The amount of memory consumed by the payload: the actual
-  // data stored in the vectors.
+@SuppressWarnings("resource")
+UInt4Vector offsetVector = ((RepeatedValueVector) 
v).getOffsetVector();
+entryCount = offsetVector.getAccessor().get(valueCount);
+estElementCount = roundUp(entryCount, valueCount);
+if (isMap) {
 
-  dataSize = vv.getPayloadByteCount();
+  // For map, the only data associated with the map vector
+  // itself is 

[GitHub] drill pull request #860: DRILL-5601: Rollup of external sort fixes and impro...

2017-07-13 Thread Ben-Zvi
Github user Ben-Zvi commented on a diff in the pull request:

https://github.com/apache/drill/pull/860#discussion_r124428100
  
--- Diff: 
exec/java-exec/src/main/java/org/apache/drill/exec/physical/impl/xsort/managed/SortMemoryManager.java
 ---
@@ -19,7 +19,125 @@
 
 import com.google.common.annotations.VisibleForTesting;
 
+/**
+ * Computes the memory needs for input batches, spill batches and merge
+ * batches. The key challenges that this code tries to overcome are:
+ * <ul>
+ * <li>Drill is not designed for small memory allocations,
+ * but the planner may provide such allocations because the memory per
+ * query is divided among slices (minor fragments) and among buffering
+ * operators, leaving very little per operator.</li>
+ * <li>Drill does not provide the detailed memory information needed to
+ * carefully manage memory in tight constraints.</li>
+ * <li>But, Drill has a death penalty for going over the memory limit.</li>
+ * </ul>
+ * As a result, this class is a bit of a hack: it attempts to consider a
+ * number of ill-defined factors in order to divide up memory use in a
+ * way that prevents OOM errors.
+ * <p>
+ * First, it is necessary to differentiate two concepts:
+ * <ul>
+ * <li>The data size of a batch: the amount of memory needed to hold
+ * the data itself. The data size is constant for any given batch.</li>
+ * <li>The buffer size of the buffers that hold the data. The buffer
+ * size varies wildly depending on how the batch was produced.</li>
+ * </ul>
+ * The three kinds of buffer layouts seen to date include:
+ * <ul>
+ * <li>One buffer per vector component (data, offsets, null flags, etc.),
+ * created by readers, project and other operators.</li>
+ * <li>One buffer for the entire batch, with each vector component using
+ * a slice of the overall buffer. The case for batches deserialized from
+ * exchanges.</li>
+ * <li>One buffer for each top-level vector, with component vectors
+ * using slices of the overall vector buffer. The result of reading
+ * spilled batches from disk.</li>
+ * </ul>
+ * In each case, buffer sizes are power-of-two rounded from the data size.
+ * But since the data is grouped differently in each case, the resulting buffer
+ * sizes vary considerably.
+ * <p>
+ * As a result, we can never be sure of the amount of memory needed for a
+ * batch. So, we have to estimate based on a number of factors:
+ * <ul>
+ * <li>Uses the {@link RecordBatchSizer} to estimate the data size and
+ * buffer size of each incoming batch.</li>
+ * <li>Estimates the internal fragmentation due to power-of-two rounding.</li>
+ * <li>Configured preferences for spill and output batches.</li>
+ * </ul>
+ * The code handles "normal" and "low" memory conditions.
+ * <ul>
+ * <li>In normal memory, we simply work out the number of preferred-size
+ * batches that fit in memory (based on the predicted buffer size.)</li>
+ * <li>In low memory, we divide up the available memory to produce the
+ * spill and merge batch sizes. The sizes will be less than the configured
--- End diff --

Nice explanation up to here, but the "low memory" case needs more
reasoning/explanation: what's different there? What's the purpose of the
configured limits? Why are the sizes smaller (won't that yield a higher number
of batches, and hence possibly OOM)?


---


[GitHub] drill pull request #860: DRILL-5601: Rollup of external sort fixes and impro...

2017-07-13 Thread Ben-Zvi
Github user Ben-Zvi commented on a diff in the pull request:

https://github.com/apache/drill/pull/860#discussion_r124436676
  
--- Diff: 
exec/java-exec/src/main/java/org/apache/drill/exec/physical/impl/xsort/managed/SortMemoryManager.java
 ---
@@ -19,7 +19,125 @@
 
 import com.google.common.annotations.VisibleForTesting;
 
+/**
+ * Computes the memory needs for input batches, spill batches and merge
+ * batches. The key challenges that this code tries to overcome are:
+ * <ul>
+ * <li>Drill is not designed for small memory allocations,
+ * but the planner may provide such allocations because the memory per
+ * query is divided among slices (minor fragments) and among buffering
+ * operators, leaving very little per operator.</li>
+ * <li>Drill does not provide the detailed memory information needed to
+ * carefully manage memory in tight constraints.</li>
+ * <li>But, Drill has a death penalty for going over the memory limit.</li>
+ * </ul>
+ * As a result, this class is a bit of a hack: it attempts to consider a
+ * number of ill-defined factors in order to divide up memory use in a
+ * way that prevents OOM errors.
+ * <p>
+ * First, it is necessary to differentiate two concepts:
+ * <ul>
+ * <li>The data size of a batch: the amount of memory needed to hold
+ * the data itself. The data size is constant for any given batch.</li>
+ * <li>The buffer size of the buffers that hold the data. The buffer
+ * size varies wildly depending on how the batch was produced.</li>
+ * </ul>
+ * The three kinds of buffer layouts seen to date include:
+ * <ul>
+ * <li>One buffer per vector component (data, offsets, null flags, etc.),
+ * created by readers, project and other operators.</li>
+ * <li>One buffer for the entire batch, with each vector component using
+ * a slice of the overall buffer. The case for batches deserialized from
+ * exchanges.</li>
+ * <li>One buffer for each top-level vector, with component vectors
+ * using slices of the overall vector buffer. The result of reading
+ * spilled batches from disk.</li>
+ * </ul>
+ * In each case, buffer sizes are power-of-two rounded from the data size.
+ * But since the data is grouped differently in each case, the resulting buffer
+ * sizes vary considerably.
+ * <p>
+ * As a result, we can never be sure of the amount of memory needed for a
+ * batch. So, we have to estimate based on a number of factors:
+ * <ul>
+ * <li>Uses the {@link RecordBatchSizer} to estimate the data size and
+ * buffer size of each incoming batch.</li>
+ * <li>Estimates the internal fragmentation due to power-of-two rounding.</li>
+ * <li>Configured preferences for spill and output batches.</li>
+ * </ul>
+ * The code handles "normal" and "low" memory conditions.
+ * <ul>
+ * <li>In normal memory, we simply work out the number of preferred-size
+ * batches that fit in memory (based on the predicted buffer size.)</li>
+ * <li>In low memory, we divide up the available memory to produce the
+ * spill and merge batch sizes. The sizes will be less than the configured
+ * preference.</li>
+ * </ul>
+ */
+
 public class SortMemoryManager {
+  static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(ExternalSortBatch.class);
+
+  /**
+   * Estimate for typical internal fragmentation in a buffer due to power-of-two
+   * rounding on vectors.
+   * <p>
+   * <pre>[|__$__]</pre>
+   * In the above, the brackets represent the whole vector. The
+   * first half is always full. When the first half filled, the second
+   * half was allocated. On average, the second half will be half full.
+   * This means that, on average, 1/4 of the allocated space is
+   * unused (the definition of internal fragmentation.)
+   */
+
+  public static final double INTERNAL_FRAGMENTATION_ESTIMATE = 1.0/4.0;
+
+  /**
+   * Given a buffer, this is the assumed amount of space
+   * available for data. (Adding more will double the buffer
+   * size half the time.)
+   */
+
+  public static final double PAYLOAD_MULTIPLIER = 1 - INTERNAL_FRAGMENTATION_ESTIMATE;
+
+  /**
+   * Given a data size, this is the multiplier to create the buffer
+   * size estimate. (Note: since we work with aggregate batches, we
+   * cannot simply round up to the next power of two: rounding is done
+   * on a vector-by-vector basis. Here we need to estimate the aggregate
+   * effect of rounding.)
+   */
+
+  public static final double BUFFER_MULTIPLIER = 1/PAYLOAD_MULTIPLIER;
+  public static final double WORST_CASE_MULTIPLIER = 2.0;
+
+  public static final int MIN_ROWS_PER_SORT_BATCH = 100;
+  public static final double LOW_MEMORY_MERGE_BATCH_RATIO = 0.25;
+
+  public static class BatchSizeEstimate {
+int dataSize;
+int expectedBufferSize;
+int maxBufferSize;
+

[GitHub] drill pull request #860: DRILL-5601: Rollup of external sort fixes and impro...

2017-07-13 Thread Ben-Zvi
Github user Ben-Zvi commented on a diff in the pull request:

https://github.com/apache/drill/pull/860#discussion_r124122190
  
--- Diff: 
exec/java-exec/src/main/java/org/apache/drill/exec/physical/impl/xsort/managed/MergeSortWrapper.java
 ---
@@ -147,7 +147,7 @@ private MSorter createNewMSorter(List 
orderings, MappingSet mainMappin
 cg.plainJavaCapable(true);
 
 // Uncomment out this line to debug the generated code.
-//  cg.saveCodeForDebugging(true);
+cg.saveCodeForDebugging(true);
--- End diff --

Need to restore the previous comment; i.e. no change for this file 



---


[GitHub] drill pull request #860: DRILL-5601: Rollup of external sort fixes and impro...

2017-07-13 Thread Ben-Zvi
Github user Ben-Zvi commented on a diff in the pull request:

https://github.com/apache/drill/pull/860#discussion_r127061135
  
--- Diff: 
exec/java-exec/src/main/java/org/apache/drill/exec/record/SmartAllocationHelper.java
 ---
@@ -0,0 +1,156 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.drill.exec.record;
+
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Map.Entry;
+
+import org.apache.drill.exec.vector.AllocationHelper;
+import org.apache.drill.exec.vector.ValueVector;
+import org.apache.drill.exec.vector.complex.AbstractMapVector;
+import org.apache.drill.exec.vector.complex.RepeatedMapVector;
+
+/**
+ * Prototype mechanism to allocate vectors based on expected
+ * data sizes. This version uses a name-based map of fields
+ * to sizes. Better to represent the batch structurally and
+ * simply iterate over the schema rather than doing a per-field
+ * lookup. But, the mechanisms needed to do the efficient solution
+ * don't exist yet.
+ */
+
+public class SmartAllocationHelper {
+
+  public static class AllocationHint {
--- End diff --

Should the class AllocationHint be made private ? It is only used 
internally by SmartAllocationHelper . 


---


[GitHub] drill pull request #860: DRILL-5601: Rollup of external sort fixes and impro...

2017-07-13 Thread Ben-Zvi
Github user Ben-Zvi commented on a diff in the pull request:

https://github.com/apache/drill/pull/860#discussion_r124122920
  
--- Diff: 
exec/java-exec/src/main/java/org/apache/drill/exec/physical/impl/xsort/managed/PriorityQueueCopierWrapper.java
 ---
@@ -82,7 +84,7 @@ private PriorityQueueCopier newCopier(VectorAccessible 
batch) {
 ClassGenerator g = cg.getRoot();
 cg.plainJavaCapable(true);
 // Uncomment out this line to debug the generated code.
-//cg.saveCodeForDebugging(true);
+cg.saveCodeForDebugging(true);
--- End diff --

Ditto: re-comment this line.


---


[GitHub] drill pull request #860: DRILL-5601: Rollup of external sort fixes and impro...

2017-07-13 Thread Ben-Zvi
Github user Ben-Zvi commented on a diff in the pull request:

https://github.com/apache/drill/pull/860#discussion_r123814368
  
--- Diff: 
exec/java-exec/src/main/java/org/apache/drill/exec/physical/impl/spill/RecordBatchSizer.java
 ---
@@ -19,117 +19,162 @@
 
 import java.util.ArrayList;
 import java.util.List;
+import java.util.Set;
 
+import org.apache.drill.common.types.TypeProtos.DataMode;
 import org.apache.drill.common.types.TypeProtos.MinorType;
 import org.apache.drill.exec.expr.TypeHelper;
+import org.apache.drill.exec.memory.AllocationManager.BufferLedger;
 import org.apache.drill.exec.memory.BaseAllocator;
 import org.apache.drill.exec.record.BatchSchema;
 import org.apache.drill.exec.record.MaterializedField;
 import org.apache.drill.exec.record.RecordBatch;
+import org.apache.drill.exec.record.SmartAllocationHelper;
 import org.apache.drill.exec.record.VectorAccessible;
 import org.apache.drill.exec.record.VectorWrapper;
 import org.apache.drill.exec.record.selection.SelectionVector2;
+import org.apache.drill.exec.vector.UInt4Vector;
 import org.apache.drill.exec.vector.ValueVector;
 import org.apache.drill.exec.vector.complex.AbstractMapVector;
+import org.apache.drill.exec.vector.complex.RepeatedValueVector;
 import org.apache.drill.exec.vector.VariableWidthVector;
 
+import com.google.common.collect.Sets;
+
 /**
  * Given a record batch or vector container, determines the actual memory
  * consumed by each column, the average row, and the entire record batch.
  */
 
 public class RecordBatchSizer {
-//  private static final org.slf4j.Logger logger = 
org.slf4j.LoggerFactory.getLogger(RecordBatchSizer.class);
 
   /**
* Column size information.
*/
   public static class ColumnSize {
+public final String prefix;
 public final MaterializedField metadata;
 
 /**
- * Assumed size from Drill metadata.
+ * Assumed size from Drill metadata. Note that this information is
+ * 100% bogus. Do not use it.
  */
 
+@Deprecated
 public int stdSize;
 
 /**
- * Actual memory consumed by all the vectors associated with this 
column.
- */
-
-public int totalSize;
-
-/**
  * Actual average column width as determined from actual memory use. 
This
  * size is larger than the actual data size since this size includes 
per-
  * column overhead such as any unused vector space, etc.
  */
 
-public int estSize;
-public int capacity;
-public int density;
-public int dataSize;
-public boolean variableWidth;
-
-public ColumnSize(ValueVector vv) {
-  metadata = vv.getField();
-  stdSize = TypeHelper.getSize(metadata.getType());
-
-  // Can't get size estimates if this is an empty batch.
-  int rowCount = vv.getAccessor().getValueCount();
-  if (rowCount == 0) {
-estSize = stdSize;
-return;
-  }
+public final int estSize;
+public final int valueCount;
+public final int entryCount;
+public final int dataSize;
+public final int estElementCount;
+public final boolean isVariableWidth;
 
-  // Total size taken by all vectors (and underlying buffers)
-  // associated with this vector.
+public ColumnSize(ValueVector v, String prefix, int valueCount) {
--- End diff --

Programming 101: "v" - the use of a single-letter variable name should be
discouraged ...


---


[GitHub] drill pull request #860: DRILL-5601: Rollup of external sort fixes and impro...

2017-07-13 Thread Ben-Zvi
Github user Ben-Zvi commented on a diff in the pull request:

https://github.com/apache/drill/pull/860#discussion_r123845357
  
--- Diff: 
exec/java-exec/src/main/java/org/apache/drill/exec/physical/impl/spill/RecordBatchSizer.java
 ---
@@ -177,7 +226,7 @@ public RecordBatchSizer(RecordBatch batch) {
   /**
*  Count the nullable columns; used for memory estimation
*/
-  public int numNullables;
+  public int nullableCount;
   /**
*
* @param va
--- End diff --

Update this javadoc 


---


[GitHub] drill pull request #860: DRILL-5601: Rollup of external sort fixes and impro...

2017-07-13 Thread Ben-Zvi
Github user Ben-Zvi commented on a diff in the pull request:

https://github.com/apache/drill/pull/860#discussion_r123839714
  
--- Diff: 
exec/java-exec/src/main/java/org/apache/drill/exec/physical/impl/spill/RecordBatchSizer.java
 ---
@@ -139,14 +184,16 @@ public String toString() {
*/
   private int rowCount;
   /**
-   * Standard row width using Drill meta-data.
+   * Standard row width using Drill meta-data. Note: this information is
+   * 100% bogus. Do not use it.
*/
+  @Deprecated
   private int stdRowWidth;
--- End diff --

Can just eliminate this field (and the method stdRowWidth()). It is only
used by the HashAgg, but that code is dead anyway, as it only applies when
rowCount() == 0, which is never the case since the method
updateEstMaxBatchSize() is only called for a non-empty batch.



---


[GitHub] drill pull request #877: DRILL-5660: Drill 1.10 queries fail due to Parquet ...

2017-07-13 Thread arina-ielchiieva
Github user arina-ielchiieva commented on a diff in the pull request:

https://github.com/apache/drill/pull/877#discussion_r127324650
  
--- Diff: 
exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/Metadata.java 
---
@@ -1851,9 +1860,81 @@ private static String relativize(String baseDir, 
String childPath) {
   .relativize(fullPathWithoutSchemeAndAuthority.toUri()));
   if (relativeFilePath.isAbsolute()) {
 throw new IllegalStateException(String.format("Path %s is not a 
subpath of %s.",
-basePathWithoutSchemeAndAuthority.toUri().toString(), 
fullPathWithoutSchemeAndAuthority.toUri().toString()));
+basePathWithoutSchemeAndAuthority.toUri().getPath(), 
fullPathWithoutSchemeAndAuthority.toUri().getPath()));
+  }
+  return relativeFilePath.toUri().getPath();
+}
+  }
+
+  /**
+   * Used to identify the metadata version by deserializing the first property,
+   * "metadata_version", from the metadata cache file.
+   */
+  public static class MetadataVersion {
+@JsonProperty("metadata_version")
+public String textVersion;
+
+/**
+ * Supported metadata versions.
+ * Note: keep them synchronized with {@link ParquetTableMetadataBase} 
versions
+ */
+enum Versions {
+  v1(Constants.V1),
+  v2(Constants.V2),
+  v3(Constants.V3),
+  v3_1(Constants.V3_1);
+
+  private final String version;
+
+  Versions(String version) {
+this.version = version;
+  }
+
+  public String getVersion() {
+return version;
+  }
+
+  public static Versions fromString(String version) {
+for (Versions v : Versions.values()) {
+  if (v.version.equalsIgnoreCase(version)) {
--- End diff --

Can be replaced with `v.getName().equalsIgnoreCase(version)`?


---


[GitHub] drill pull request #877: DRILL-5660: Drill 1.10 queries fail due to Parquet ...

2017-07-13 Thread arina-ielchiieva
Github user arina-ielchiieva commented on a diff in the pull request:

https://github.com/apache/drill/pull/877#discussion_r127324951
  
--- Diff: 
exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/Metadata.java 
---
@@ -139,13 +139,13 @@ public static ParquetTableMetadata_v3 
getParquetTableMetadata(FileSystem fs,
* @return
* @throws IOException
*/
-  public static ParquetTableMetadataBase readBlockMeta(FileSystem fs, 
String path, MetadataContext metaContext, ParquetFormatConfig formatConfig) 
throws IOException {
+  public static ParquetTableMetadataBase readBlockMeta(FileSystem fs, Path 
path, MetadataContext metaContext, ParquetFormatConfig formatConfig) throws 
IOException {
--- End diff --

Please update java doc.


---


[GitHub] drill pull request #877: DRILL-5660: Drill 1.10 queries fail due to Parquet ...

2017-07-13 Thread arina-ielchiieva
Github user arina-ielchiieva commented on a diff in the pull request:

https://github.com/apache/drill/pull/877#discussion_r127324232
  
--- Diff: 
exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/Metadata.java 
---
@@ -557,7 +556,7 @@ private void readBlockMeta(String path,
 mapper.registerModule(serialModule);
 mapper.registerModule(module);
 mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, 
false);
-FSDataInputStream is = fs.open(p);
+FSDataInputStream is = fs.open(path);
--- End diff --

How will the stream be closed?


---


[GitHub] drill pull request #877: DRILL-5660: Drill 1.10 queries fail due to Parquet ...

2017-07-13 Thread arina-ielchiieva
Github user arina-ielchiieva commented on a diff in the pull request:

https://github.com/apache/drill/pull/877#discussion_r127324972
  
--- Diff: 
exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/Metadata.java 
---
@@ -537,13 +537,12 @@ private void writeFile(ParquetTableMetadataDirs 
parquetTableMetadataDirs, Path p
* @return
* @throws IOException
*/
-  private void readBlockMeta(String path,
+  private void readBlockMeta(Path path,
--- End diff --

Please update java doc.


---


[GitHub] drill pull request #877: DRILL-5660: Drill 1.10 queries fail due to Parquet ...

2017-07-13 Thread arina-ielchiieva
Github user arina-ielchiieva commented on a diff in the pull request:

https://github.com/apache/drill/pull/877#discussion_r127320876
  
--- Diff: 
exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/Metadata.java 
---
@@ -1851,9 +1860,81 @@ private static String relativize(String baseDir, 
String childPath) {
   .relativize(fullPathWithoutSchemeAndAuthority.toUri()));
   if (relativeFilePath.isAbsolute()) {
 throw new IllegalStateException(String.format("Path %s is not a 
subpath of %s.",
-basePathWithoutSchemeAndAuthority.toUri().toString(), 
fullPathWithoutSchemeAndAuthority.toUri().toString()));
+basePathWithoutSchemeAndAuthority.toUri().getPath(), 
fullPathWithoutSchemeAndAuthority.toUri().getPath()));
+  }
+  return relativeFilePath.toUri().getPath();
+}
+  }
+
+  /**
+   * Used to identify the metadata version by deserializing the first property,
+   * "metadata_version", from the metadata cache file.
+   */
+  public static class MetadataVersion {
+@JsonProperty("metadata_version")
+public String textVersion;
+
+/**
+ * Supported metadata versions.
+ * Note: keep them synchronized with {@link ParquetTableMetadataBase} 
versions
+ */
+enum Versions {
+  v1(Constants.V1),
+  v2(Constants.V2),
+  v3(Constants.V3),
+  v3_1(Constants.V3_1);
+
+  private final String version;
+
+  Versions(String version) {
+this.version = version;
+  }
+
+  public String getVersion() {
+return version;
+  }
+
+  public static Versions fromString(String version) {
+for (Versions v : Versions.values()) {
+  if (v.version.equalsIgnoreCase(version)) {
+return v;
+  }
+}
+return null;
+  }
+
+  public static class Constants {
+public static final String V1 = "v1";
+public static final String V2 = "v2";
+public static final String V3 = "v3";
+public static final String V3_1 = "v3_1";
+  }
+}
+
+/**
+ * @param fs current file system
+ * @param path of metadata cache file
+ * @return true if metadata version is supported, false otherwise
+ * @throws IOException if parquet metadata can't be deserialized from 
the json file
+ */
+public static boolean isVersionSupported(FileSystem fs, Path path) 
throws IOException {
+  ObjectMapper mapper = new ObjectMapper();
+  mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, 
false);
+  FSDataInputStream is = fs.open(path);
+
+  MetadataVersion metadataVersion = mapper.readValue(is, 
MetadataVersion.class);
+  Versions version = Versions.fromString(metadataVersion.textVersion);
+  if (!(version == null)) {
--- End diff --

`version != null`?


---


[GitHub] drill pull request #877: DRILL-5660: Drill 1.10 queries fail due to Parquet ...

2017-07-13 Thread arina-ielchiieva
Github user arina-ielchiieva commented on a diff in the pull request:

https://github.com/apache/drill/pull/877#discussion_r127321261
  
--- Diff: 
exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/Metadata.java 
---
@@ -1851,9 +1860,81 @@ private static String relativize(String baseDir, 
String childPath) {
   .relativize(fullPathWithoutSchemeAndAuthority.toUri()));
   if (relativeFilePath.isAbsolute()) {
 throw new IllegalStateException(String.format("Path %s is not a 
subpath of %s.",
-basePathWithoutSchemeAndAuthority.toUri().toString(), 
fullPathWithoutSchemeAndAuthority.toUri().toString()));
+basePathWithoutSchemeAndAuthority.toUri().getPath(), 
fullPathWithoutSchemeAndAuthority.toUri().getPath()));
+  }
+  return relativeFilePath.toUri().getPath();
+}
+  }
+
+  /**
+   * Used to identify the metadata version by deserializing the first property,
+   * "metadata_version", from the metadata cache file.
+   */
+  public static class MetadataVersion {
+@JsonProperty("metadata_version")
+public String textVersion;
+
+/**
+ * Supported metadata versions.
+ * Note: keep them synchronized with {@link ParquetTableMetadataBase} 
versions
+ */
+enum Versions {
+  v1(Constants.V1),
+  v2(Constants.V2),
+  v3(Constants.V3),
+  v3_1(Constants.V3_1);
+
+  private final String version;
+
+  Versions(String version) {
+this.version = version;
+  }
+
+  public String getVersion() {
+return version;
+  }
+
+  public static Versions fromString(String version) {
+for (Versions v : Versions.values()) {
+  if (v.version.equalsIgnoreCase(version)) {
+return v;
+  }
+}
+return null;
+  }
+
+  public static class Constants {
+public static final String V1 = "v1";
+public static final String V2 = "v2";
+public static final String V3 = "v3";
+public static final String V3_1 = "v3_1";
+  }
+}
+
+/**
+ * @param fs current file system
+ * @param path of metadata cache file
+ * @return true if metadata version is supported, false otherwise
+ * @throws IOException if parquet metadata can't be deserialized from 
the json file
+ */
+public static boolean isVersionSupported(FileSystem fs, Path path) 
throws IOException {
+  ObjectMapper mapper = new ObjectMapper();
+  mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, 
false);
+  FSDataInputStream is = fs.open(path);
--- End diff --

How will the stream be closed?
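
One standard way to guarantee closure is try-with-resources. A minimal
sketch against the types shown in the diff (illustrative only, not
necessarily the author's eventual fix):

try (FSDataInputStream is = fs.open(path)) {
  // The stream is closed automatically when the block exits,
  // whether readValue() succeeds or throws.
  MetadataVersion metadataVersion = mapper.readValue(is, MetadataVersion.class);
  return Versions.fromString(metadataVersion.textVersion) != null;
}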




[GitHub] drill pull request #877: DRILL-5660: Drill 1.10 queries fail due to Parquet ...

2017-07-13 Thread arina-ielchiieva
Github user arina-ielchiieva commented on a diff in the pull request:

https://github.com/apache/drill/pull/877#discussion_r127322815
  
--- Diff: 
exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/Metadata.java 
---
@@ -658,11 +657,21 @@ private boolean tableModified(List<String> directories, Path metaFilePath,
 return false;
   }
 
+  /**
+   * Base class for parquet metadata. Inheritors of this class are
+   * JSON-serializable structures for the different metadata cache file versions.
+   *
+   * Bump up metadata major version if metadata structure is changed.
+   * Bump up metadata minor version if only metadata content is changed, 
but metadata structure is the same.
+   *
+   * Note: keep metadata versions synchronized with {@link 
MetadataVersion.Versions}
+   */
   @JsonTypeInfo(use = JsonTypeInfo.Id.NAME, include = 
JsonTypeInfo.As.PROPERTY, property = "metadata_version")
   @JsonSubTypes({
-  @JsonSubTypes.Type(value = ParquetTableMetadata_v1.class, name="v1"),
-  @JsonSubTypes.Type(value = ParquetTableMetadata_v2.class, name="v2"),
-  @JsonSubTypes.Type(value = ParquetTableMetadata_v3.class, name="v3")
+  @JsonSubTypes.Type(value = ParquetTableMetadata_v1.class, name = 
MetadataVersion.Versions.Constants.V1),
--- End diff --

Can be replaced with `Version.V1.getName()`?
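
One wrinkle worth noting here: Java annotation attribute values must be
compile-time constant expressions, so an enum accessor such as
`Versions.V1.getVersion()` cannot appear directly inside a `@JsonSubTypes.Type`
annotation. A tiny illustration (the `Demo` class is hypothetical):

import com.fasterxml.jackson.annotation.JsonTypeName;

class Demo {
  static final String V1 = "v1";    // constant expression: legal in an annotation

  @JsonTypeName(V1)                 // compiles
  static class MetadataV1 {}

  // @JsonTypeName(Versions.V1.getVersion())  // would not compile: not a constant
}

This is presumably why the diff keeps a `Constants` holder next to the enum.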




[GitHub] drill pull request #877: DRILL-5660: Drill 1.10 queries fail due to Parquet ...

2017-07-13 Thread arina-ielchiieva
Github user arina-ielchiieva commented on a diff in the pull request:

https://github.com/apache/drill/pull/877#discussion_r127323200
  
--- Diff: 
exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/Metadata.java 
---
@@ -1851,9 +1860,81 @@ private static String relativize(String baseDir, 
String childPath) {
   .relativize(fullPathWithoutSchemeAndAuthority.toUri()));
   if (relativeFilePath.isAbsolute()) {
 throw new IllegalStateException(String.format("Path %s is not a 
subpath of %s.",
-basePathWithoutSchemeAndAuthority.toUri().toString(), 
fullPathWithoutSchemeAndAuthority.toUri().toString()));
+basePathWithoutSchemeAndAuthority.toUri().getPath(), 
fullPathWithoutSchemeAndAuthority.toUri().getPath()));
+  }
+  return relativeFilePath.toUri().getPath();
+}
+  }
+
+  /**
+   * Used to identify the metadata version by deserializing the first
+   * property, "metadata_version", from the metadata cache file
+   */
+  public static class MetadataVersion {
+@JsonProperty("metadata_version")
+public String textVersion;
+
+/**
+ * Supported metadata versions.
+ * Note: keep them synchronized with {@link ParquetTableMetadataBase} 
versions
+ */
+enum Versions {
+  v1(Constants.V1),
+  v2(Constants.V2),
+  v3(Constants.V3),
+  v3_1(Constants.V3_1);
+
+  private final String version;
+
+  Versions(String version) {
+this.version = version;
+  }
+
+  public String getVersion() {
+return version;
+  }
+
+  public static Versions fromString(String version) {
+for (Versions v : Versions.values()) {
+  if (v.version.equalsIgnoreCase(version)) {
+return v;
+  }
+}
+return null;
+  }
+
+  public static class Constants {
+public static final String V1 = "v1";
+public static final String V2 = "v2";
+public static final String V3 = "v3";
+public static final String V3_1 = "v3_1";
+  }
+}
+
+/**
--- End diff --

Please add a brief method description.
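
For instance, the description sentence might read along these lines
(illustrative wording only):

/**
 * Checks whether the "metadata_version" property of the given metadata
 * cache file is one of the versions this build of Drill can read.
 */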




[GitHub] drill pull request #877: DRILL-5660: Drill 1.10 queries fail due to Parquet ...

2017-07-13 Thread arina-ielchiieva
Github user arina-ielchiieva commented on a diff in the pull request:

https://github.com/apache/drill/pull/877#discussion_r127321352
  
--- Diff: 
exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/Metadata.java 
---
@@ -1851,9 +1860,81 @@ private static String relativize(String baseDir, 
String childPath) {
   .relativize(fullPathWithoutSchemeAndAuthority.toUri()));
   if (relativeFilePath.isAbsolute()) {
 throw new IllegalStateException(String.format("Path %s is not a 
subpath of %s.",
-basePathWithoutSchemeAndAuthority.toUri().toString(), 
fullPathWithoutSchemeAndAuthority.toUri().toString()));
+basePathWithoutSchemeAndAuthority.toUri().getPath(), 
fullPathWithoutSchemeAndAuthority.toUri().getPath()));
+  }
+  return relativeFilePath.toUri().getPath();
+}
+  }
+
+  /**
+   * Used to identify the metadata version by deserializing the first
+   * property, "metadata_version", from the metadata cache file
+   */
+  public static class MetadataVersion {
+@JsonProperty("metadata_version")
+public String textVersion;
+
+/**
+ * Supported metadata versions.
+ * Note: keep them synchronized with {@link ParquetTableMetadataBase} 
versions
+ */
+enum Versions {
+  v1(Constants.V1),
+  v2(Constants.V2),
+  v3(Constants.V3),
+  v3_1(Constants.V3_1);
+
+  private final String version;
+
+  Versions(String version) {
+this.version = version;
+  }
+
+  public String getVersion() {
+return version;
+  }
+
+  public static Versions fromString(String version) {
--- End diff --

Please add Java doc.
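
A sketch of what that javadoc could say (illustrative wording, not the
author's final text):

/**
 * Finds the enum constant matching the given textual version, ignoring case.
 *
 * @param version version string taken from the "metadata_version" property,
 *                for example "v3_1"
 * @return the matching constant, or null if the version is unknown
 *         (for example, one written by a future version of Drill)
 */
public static Versions fromString(String version) {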




[GitHub] drill pull request #877: DRILL-5660: Drill 1.10 queries fail due to Parquet ...

2017-07-13 Thread arina-ielchiieva
Github user arina-ielchiieva commented on a diff in the pull request:

https://github.com/apache/drill/pull/877#discussion_r127322298
  
--- Diff: 
exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/Metadata.java 
---
@@ -1851,9 +1860,81 @@ private static String relativize(String baseDir, 
String childPath) {
   .relativize(fullPathWithoutSchemeAndAuthority.toUri()));
   if (relativeFilePath.isAbsolute()) {
 throw new IllegalStateException(String.format("Path %s is not a 
subpath of %s.",
-basePathWithoutSchemeAndAuthority.toUri().toString(), 
fullPathWithoutSchemeAndAuthority.toUri().toString()));
+basePathWithoutSchemeAndAuthority.toUri().getPath(), 
fullPathWithoutSchemeAndAuthority.toUri().getPath()));
+  }
+  return relativeFilePath.toUri().getPath();
+}
+  }
+
+  /**
+   * Used to identify the metadata version by deserializing the first
+   * property, "metadata_version", from the metadata cache file
+   */
+  public static class MetadataVersion {
+@JsonProperty("metadata_version")
+public String textVersion;
+
+/**
+ * Supported metadata versions.
+ * Note: keep them synchronized with {@link ParquetTableMetadataBase} 
versions
+ */
+enum Versions {
--- End diff --

It would be good to follow the Enum naming conventions [1]:
1. Enum naming is usually singular.
2. Enum types are usually in upper case.
3. There is no need for a Constants class. Enum types usually represent a
fixed set of constants. You can use `getName()` to get the String value.

[1] http://docs.oracle.com/javase/tutorial/java/javaOO/enum.html
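
Put together, the suggested shape would be roughly the following (an
illustrative sketch, not the final patch):

public enum MetadataVersion {
  V1("v1"),
  V2("v2"),
  V3("v3"),
  V3_1("v3_1");

  private final String name;

  MetadataVersion(String name) {
    this.name = name;
  }

  public String getName() {
    return name;
  }
}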




[GitHub] drill pull request #877: DRILL-5660: Drill 1.10 queries fail due to Parquet ...

2017-07-13 Thread arina-ielchiieva
Github user arina-ielchiieva commented on a diff in the pull request:

https://github.com/apache/drill/pull/877#discussion_r127322605
  
--- Diff: 
exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/Metadata.java 
---
@@ -1377,7 +1386,7 @@ public void serialize(ColumnMetadata_v2 value, 
JsonGenerator jgen, SerializerPro
*
* Difference between v3 and v2 : min/max, type_length, precision, 
scale, repetitionLevel, definitionLevel
*/
-  @JsonTypeName("v3") public static class ParquetTableMetadata_v3 extends 
ParquetTableMetadataBase {
+  @JsonTypeName("v3_1") public static class ParquetTableMetadata_v3 
extends ParquetTableMetadataBase {
--- End diff --

Can this be replaced with the `Version` enum (e.g. `Version.V1.getName()`)?
The same applies in the `ParquetTableMetadataBase` class.




[GitHub] drill pull request #877: DRILL-5660: Drill 1.10 queries fail due to Parquet ...

2017-07-13 Thread vdiravka
GitHub user vdiravka opened a pull request:

https://github.com/apache/drill/pull/877

DRILL-5660: Drill 1.10 queries fail due to Parquet Metadata "corruption" from DRILL-3867

- bump up the metadata file version to v3_1;
- ignore unknown metadata versions (for example, metadata generated by future
versions of Drill);
- remove "%20" symbols from paths.

You can merge this pull request into a Git repository by running:

$ git pull https://github.com/vdiravka/drill DRILL-5660

Alternatively you can review and apply these changes as the patch at:

https://github.com/apache/drill/pull/877.patch

To close this pull request, make a commit to your master/trunk branch
with (at least) the following in the commit message:

This closes #877


commit 0d121022e16b5d316c357207706cc1454b77a5a0
Author: Vitalii Diravka 
Date:   2017-07-12T22:12:00Z

DRILL-5660: Drill 1.10 queries fail due to Parquet Metadata "corruption" 
from DRILL-3867

- bump up the metadata file version to v3_1;
- ignore unknown metadata versions (for example, metadata generated by future
versions of Drill);
- remove "%20" symbols from paths.






[GitHub] drill issue #876: DRILL-5659: Fix error code checking in reading from socket

2017-07-13 Thread laurentgo
Github user laurentgo commented on the issue:

https://github.com/apache/drill/pull/876
  
I tested the non-SASL case and can confirm it fixes the issue. Unfortunately I
don't have a SASL-ready setup handy right now.

Thanks @parthchandra 




[GitHub] drill pull request #875: DRILL-5671 Set secure ACLs (Access Control List) fo...

2017-07-13 Thread adityakishore
Github user adityakishore commented on a diff in the pull request:

https://github.com/apache/drill/pull/875#discussion_r127295205
  
--- Diff: 
exec/java-exec/src/main/java/org/apache/drill/exec/coord/zk/ZKSecureACLProvider.java
 ---
@@ -0,0 +1,71 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.drill.exec.coord.zk;
+
+import com.google.common.collect.ImmutableList;
+import org.apache.curator.framework.api.ACLProvider;
+import org.apache.zookeeper.ZooDefs.Ids;
+import org.apache.zookeeper.data.ACL;
+
+import java.util.List;
+
+/**
+ * ZKSecureACLProvider restricts access to znodes created by Drill.
+ * The cluster discovery znode, i.e. the znode containing the list of
+ * Drillbits, is readable by anyone.
+ * For all other znodes, only the creator of the znode, i.e. the Drillbit
+ * user, has full access.
+ */
+
+public class ZKSecureACLProvider implements ACLProvider {
+
+static final org.slf4j.Logger logger = 
org.slf4j.LoggerFactory.getLogger(ZKSecureACLProvider.class);
+// Creator has full access
+static ImmutableList<ACL> DEFAULT_ACL = new ImmutableList.Builder<ACL>()
+  .addAll(Ids.CREATOR_ALL_ACL.iterator())
+  .build();
+// Creator has full access
+// Everyone else has only read access
+static ImmutableList<ACL> DRILL_CLUSTER_ACL = new ImmutableList.Builder<ACL>()
--- End diff --

Please use a java-doc style comment block and mention why and where this ACL
is used.
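
A hypothetical java-doc style rewrite of that comment (wording illustrative;
the builder body is one plausible construction, since the diff is truncated
here):

/**
 * ACL for the Drill cluster-discovery znode: the creator (the Drillbit
 * user) keeps full access, while everyone else may only read the list
 * of Drillbits.
 */
static ImmutableList<ACL> DRILL_CLUSTER_ACL = new ImmutableList.Builder<ACL>()
    .addAll(Ids.READ_ACL_UNSAFE.iterator())     // world: read-only
    .addAll(Ids.CREATOR_ALL_ACL.iterator())     // creator: full access
    .build();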




[GitHub] drill pull request #875: DRILL-5671 Set secure ACLs (Access Control List) fo...

2017-07-13 Thread adityakishore
Github user adityakishore commented on a diff in the pull request:

https://github.com/apache/drill/pull/875#discussion_r127295586
  
--- Diff: exec/java-exec/src/main/resources/drill-module.conf ---
@@ -112,7 +112,8 @@ drill.exec: {
 retry: {
   count: 7200,
   delay: 500
-}
+},
+use.secure_acl: false
--- End diff --

This needs to be set to true in 
`/opt/mapr/drill/drill-1.11.0/conf/drill-distrib.conf` by `configure.sh` when 
security is enabled on a MapR cluster. Do we have a tracking work item for this?
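
Assuming the new key sits under the same `drill.exec.zk` block shown in the
diff (an assumption; the hunk context is truncated), the distribution config
entry would presumably be:

drill.exec.zk.use.secure_acl: true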




Re: Debugging Drill in Eclipse

2017-07-13 Thread Paul Rogers
Hi Muhammad,

There are several issues here.

First, the problem you describe is not related to Eclipse. Second, several of 
us do use Eclipse and it works fine. Third, there are far easier ways to debug 
Drill in Eclipse than building Drill and doing remote debugging.

First the error. The problem is that some bit of code is referring to a config 
parameter that does not appear in the config files. This kind of key would 
normally appear in some drill-module.conf file. (The file drill-override.conf 
is only for the very few config settings that you must modify for your site.)

My source is a bit different from yours, but the line in question seems to be 
this one:

this.loop = TransportCheck.createEventLoopGroup(
    config.getInt(ExecConstants.BIT_SERVER_RPC_THREADS), "BitServer-");

The config property here is defined in java-exec/src/main/resources/drill-module.conf:

drill.exec: {
  ...
  rpc: {
    user: {
      ...
      server: {
        ...
        threads: 1,

The only way you will get the error in your stack is if, somehow, your build 
omits the drill-module.conf file. You can inspect your jar files to see if this 
file is present.

Second, Eclipse works fine. The only trick is if you try to run certain unit 
tests that use Mockito. “Modern” Eclipse Neon is based on JDK 8. But the 
Mockito tests only run on JDK 7. There is no way to get the test runner in 
Eclipse to use JDK 7. So, I end up building Drill for JDK 8 when using Eclipse 
(just change the Java version in the root pom.xml file from 1.7 to 1.8 — in two 
places.) Then run the Mockito-based tests outside of Eclipse, after rebuilding 
back on JDK 7. Yes, a hassle, but this is just the way that Drill works today.

Further, what works for me is:

1. Check out Drill
2. Change the pom.xml Java version number as noted above
3. Build all of Drill without tests: “mvn clean install -DskipTests”
4. Open Drill or refresh Drill in Eclipse. Eclipse does its own build.
5. Run Drill directly from Eclipse.

Item 5 above is my third point: for many purposes, it is far more convenient to
run Drill directly from Eclipse. I use unit tests based on a newer test
framework; see ExampleTest.java. If a particular query fails, and I can copy
the data locally, then I just use something like “fourthTest” to set up a
storage plugin, set the required session, system and config options, run the
query, and either display the results or a summary. You can set breakpoints on
the lines in question and debug. The entire edit/compile/debug cycle is maybe
30 seconds, versus five minutes or so for a full external build. A sketch of
what such a test can look like follows below.
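
For readers unfamiliar with that framework, a minimal sketch of such a test
might look like this (class and method names are from memory and may not match
the current sources exactly; treat them as illustrative):

@Test
public void adHocQuery() throws Exception {
  // Spin up an embedded Drillbit plus a client, run a query, print results.
  FixtureBuilder builder = ClusterFixture.builder();
  try (ClusterFixture cluster = builder.build();
       ClientFixture client = cluster.clientFixture()) {
    client.queryBuilder()
        .sql("SELECT * FROM cp.`employee.json` LIMIT 5")
        .printCsv();
  }
}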

I hope some of this helps you resolve your issue.

The most practical solution:

1. Rebuild Drill
2. Retry the run without the debugger.

If that works, review your Eclipse settings that might affect class path.

Thanks,

- Paul


> On Jul 13, 2017, at 8:20 AM, Muhammad Gelbana  wrote:
> 
> To debug Drill in Eclipse, I ran *./drillbit.sh debug* and copied the VM
> args and environment variables into a launcher. This worked fine for months.
> 
> Obviously I have now messed something up and I can't debug Drill in
> Eclipse anymore. I'm facing the following error:
> 
> Exception in thread "main"
> org.apache.drill.exec.exception.DrillbitStartupException: Failure while
> initializing values in Drillbit.
> at org.apache.drill.exec.server.Drillbit.start(Drillbit.java:288)
> at org.apache.drill.exec.server.Drillbit.start(Drillbit.java:272)
> at org.apache.drill.exec.server.Drillbit.main(Drillbit.java:268)
> Caused by: com.typesafe.config.ConfigException$Missing: *No configuration
> setting found for key 'drill.exec.rpc'*
> at com.typesafe.config.impl.SimpleConfig.findKey(SimpleConfig.java:115)
> at com.typesafe.config.impl.SimpleConfig.find(SimpleConfig.java:138)
> at com.typesafe.config.impl.SimpleConfig.find(SimpleConfig.java:142)
> at com.typesafe.config.impl.SimpleConfig.find(SimpleConfig.java:142)
> at com.typesafe.config.impl.SimpleConfig.find(SimpleConfig.java:150)
> at com.typesafe.config.impl.SimpleConfig.find(SimpleConfig.java:155)
> at
> com.typesafe.config.impl.SimpleConfig.getConfigNumber(SimpleConfig.java:170)
> at com.typesafe.config.impl.SimpleConfig.getInt(SimpleConfig.java:181)
> at org.apache.drill.common.config.NestedConfig.getInt(NestedConfig.java:98)
> at org.apache.drill.common.config.DrillConfig.getInt(DrillConfig.java:1)
> at
> org.apache.drill.exec.server.BootStrapContext.(BootStrapContext.java:55)
> at org.apache.drill.exec.server.Drillbit.(Drillbit.java:94)
> at org.apache.drill.exec.server.Drillbit.start(Drillbit.java:286)
> ... 2 more
> 
> My *drill-override.conf* file has the following content (aside from the
> comments)
> 
> drill.exec: {
>  cluster-id: "drillbits1",
>  zk.connect: "localhost:2181"
> }
> 
> I never needed to change this file to debug Drill in Eclipse!
> 
> Is there a standard way that Drill developers use to debug Drill?
> 
> I appreciate any help for this because it's totally blocking me!

[GitHub] drill issue #876: DRILL-5659: Fix error code checking in reading from socket

2017-07-13 Thread superbstreak
Github user superbstreak commented on the issue:

https://github.com/apache/drill/pull/876
  
Thanks, @parthchandra!




[GitHub] drill issue #876: DRILL-5659: Fix error code checking in reading from socket

2017-07-13 Thread sohami
Github user sohami commented on the issue:

https://github.com/apache/drill/pull/876
  
+1 LGTM
Thanks a lot for catching and fixing this! I will test it with the SASL
encryption case.




Debugging Drill in Eclipse

2017-07-13 Thread Muhammad Gelbana
To debug Drill in Eclipse, I ran *./drillbit.sh debug* and copied the VM
args and environment variables into a launcher. This worked fine for months.

Obviously I have now messed something up and I can't debug Drill in
Eclipse anymore. I'm facing the following error:

Exception in thread "main"
org.apache.drill.exec.exception.DrillbitStartupException: Failure while
initializing values in Drillbit.
at org.apache.drill.exec.server.Drillbit.start(Drillbit.java:288)
at org.apache.drill.exec.server.Drillbit.start(Drillbit.java:272)
at org.apache.drill.exec.server.Drillbit.main(Drillbit.java:268)
Caused by: com.typesafe.config.ConfigException$Missing: *No configuration
setting found for key 'drill.exec.rpc'*
at com.typesafe.config.impl.SimpleConfig.findKey(SimpleConfig.java:115)
at com.typesafe.config.impl.SimpleConfig.find(SimpleConfig.java:138)
at com.typesafe.config.impl.SimpleConfig.find(SimpleConfig.java:142)
at com.typesafe.config.impl.SimpleConfig.find(SimpleConfig.java:142)
at com.typesafe.config.impl.SimpleConfig.find(SimpleConfig.java:150)
at com.typesafe.config.impl.SimpleConfig.find(SimpleConfig.java:155)
at
com.typesafe.config.impl.SimpleConfig.getConfigNumber(SimpleConfig.java:170)
at com.typesafe.config.impl.SimpleConfig.getInt(SimpleConfig.java:181)
at org.apache.drill.common.config.NestedConfig.getInt(NestedConfig.java:98)
at org.apache.drill.common.config.DrillConfig.getInt(DrillConfig.java:1)
at
org.apache.drill.exec.server.BootStrapContext.(BootStrapContext.java:55)
at org.apache.drill.exec.server.Drillbit.(Drillbit.java:94)
at org.apache.drill.exec.server.Drillbit.start(Drillbit.java:286)
... 2 more

My *drill-override.conf* file has the following content (aside from the
comments)

drill.exec: {
  cluster-id: "drillbits1",
  zk.connect: "localhost:2181"
}

I never needed to change this file to debug Drill in Eclipse!

Is there a standard way that Drill developers use to debug Drill?

I appreciate any help for this because it's totally blocking me!


[GitHub] drill pull request #876: DRILL-5659: Fix error code checking in reading from...

2017-07-13 Thread parthchandra
GitHub user parthchandra opened a pull request:

https://github.com/apache/drill/pull/876

DRILL-5659: Fix error code checking in reading from socket

Check if the error code is set before checking whether it is EINTR. Also,
check for errors immediately after making the read_some or write_some calls.
This fixes the case where we read (or write) only part of the data in a
call to read from or write to the socket.
Tested for the issue on Mac/CentOS. Not tested on Windows. Not tested for
the case with SASL encryption.


You can merge this pull request into a Git repository by running:

$ git pull https://github.com/parthchandra/drill DRILL-5659

Alternatively you can review and apply these changes as the patch at:

https://github.com/apache/drill/pull/876.patch

To close this pull request, make a commit to your master/trunk branch
with (at least) the following in the commit message:

This closes #876


commit 0a29ced81dc935b926e734ea169eb884a2cbe40f
Author: Parth Chandra 
Date:   2017-07-12T21:39:17Z

DRILL-5659: Fix error code checking in reading from socket






[GitHub] drill pull request #875: DRILL-5671 Set secure ACLs (Access Control List) fo...

2017-07-13 Thread bitblender
GitHub user bitblender opened a pull request:

https://github.com/apache/drill/pull/875

DRILL-5671   Set secure ACLs (Access Control List) for Drill ZK nodes in a 
secure cluster



You can merge this pull request into a Git repository by running:

$ git pull https://github.com/bitblender/drill DRILL-5671-PR-2

Alternatively you can review and apply these changes as the patch at:

https://github.com/apache/drill/pull/875.patch

To close this pull request, make a commit to your master/trunk branch
with (at least) the following in the commit message:

This closes #875


commit 0b43d451df019026cc6d50128d229340580bb82e
Author: karthik 
Date:   2017-06-20T17:45:27Z

DRILL 5671 - Set secure ACLs (Access Control List) for Drill ZK nodes in a 
secure cluster






Re: [DISCUSS] Drill 1.11.0 release

2017-07-13 Thread Arina Yelchiyeva
*Blockers*
DRILL-5659 - problem is identified, Parth is working on the fix.

*Not blockers*
DRILL-5660 - Vitalii
DRILL-5468 - Jinfeng

*Possible blockers*
DRILL-5669 - Boaz Ben-Zvi / Paul
NPE on CSV - Paul

Kind regards
Arina


On Thu, Jul 13, 2017 at 1:28 AM, Paul Rogers  wrote:

> I withdraw my suggestion to fix DRILL-5660. It was a big hassle for us to
> track down the incompatibility internally, but production users
> probably don’t attempt to use one build of Drill with metadata files
> created from another. We can worry about the issue later only if a user
> actually runs into it.
>
> As suggested earlier, let’s just document this obscure case in a release
> note.
>
> - Paul
>
> > On Jul 12, 2017, at 3:04 PM, Jinfeng Ni  wrote:
> >
> > Upgrade from 1.10 to 1.11 is irreversible only when a parquet metadata
> > cache file is involved.
> >
> > I might be wrong, but I thought that if someone is running on 1.10.0, and
> > he wants to go back to 1.0.0, where the first version of the parquet
> > metadata cache was introduced, can we guarantee that Drillbit would work
> > correctly? I'm trying to understand whether what DRILL-5660 asks for is a
> > new requirement, or a requirement that has been there since Drill 1.0.0.
> >
> > Regression failures are normally regarded as release blocker.
> >
> >
> >
> > On Wed, Jul 12, 2017 at 2:54 PM, Paul Rogers  wrote:
> >
> >> If we don’t do DRILL-5660 for 1.11 then we don’t need to do it at all,
> so
> >> that is a time savings. We could handle this via a release note item
> that
> >> simply states that an upgrade from 1.10 to 1.11 is irreversible: we do
> not
> >> support the ability to fall back to the old version. Everyone OK with
> that?
> >>
> >> We are seeing some stress test regression failures that, I’m told, will
> >> show up as JIRA entries shortly.
> >>
> >> - Paul
> >>
> >>> On Jul 12, 2017, at 11:05 AM, Jinfeng Ni  wrote:
> >>>
> >>> I put some analysis in DRILL-5468.
> >>>
> >>> Also, IMHO, DRILL-5660 is not a release blocker.
> >>>
> >>>
> >>>
> >>> On Tue, Jul 11, 2017 at 11:27 AM, Parth Chandra 
> >> wrote:
> >>>
>  Re DRILL-5634:
>  From the Apache commons web page [1] -
> 
>  Please note that Apache Commons Crypto doesn't implement the
> >> cryptographic
>  algorithm such as AES directly.* It wraps to Openssl or JCE which
> >> implement
>  the algorithms.*
> 
> 
>  Since we are cleared for using Openssl and JCE, we are OK to include
> the
>  AES code in the UDFs.
> 
> 
>  Parth
> 
>  [1] https://commons.apache.org/proper/commons-crypto/
> 
>  On Tue, Jul 11, 2017 at 10:56 AM, Arina Yelchiyeva <
>  arina.yelchiy...@gmail.com> wrote:
> 
> > During today hangouts we agreed on approximate cut-off date - 14
> July,
>  2017
> > (date may be shifted due to the blockers).
> > I'll start preparing test RC on Thursday to solve any release
> >> preparation
> > issues beforehand.
> >
> > *Blockers*
> > DRILL-5660 - Vitalii
> >
> > *Possible blockers*
> > DRILL-5468 - Jinfeng
> > NPE on CSV source - Paul
> > DRILL-5659 - Parth
> >
> > *Will be included if done before the cut-off date*
> > DRILL-5616 / DRILL-5665 - Boaz
> > DRILL-5634 - Parth will take a look at Apache commons implementation
> >> and
> > update.
> >
> > Kind regards
> > Arina
> >
> > On Tue, Jul 11, 2017 at 3:14 PM, Charles Givre 
> >> wrote:
> >
> >> Hi Arina,
> >> I can’t make the hangout, but we have a few options.  Basically,
> since
> > all
> >> the hashing algorithms have no export restrictions, we could:
> >> 1.  Remove the AES encrypt/decrypt until I can figure out what we
> have
>  to
> >> do (and do it)
> >> 2.  I can try to research it today or tomorrow to figure out exactly
>  what
> >> we have to do to include AES.
> >> 3.  Remove the entire package
> >>
> >> I’d like to see the hashing functions at least included because
> we’ve
>  had
> >> a lot of requests for that functionality.   I also don’t want this
> to
> > hold
> >> things up, so please let me know your preference.  Regardless, I’m
>  going
> > to
> >> start trying to figure out the restrictions so we’ll know for the
>  future.
> >>
> >> Best,
> >> — C
> >>
> >>
> >>
> >>> On Jul 11, 2017, at 05:58, Arina Yelchiyeva <
> > arina.yelchiy...@gmail.com>
> >> wrote:
> >>>
> >>> @Jinfeng,
> >>> Let's discuss today on Drill hangout final cut-off date.
> >>>
> >>> @Charles
> >>> Since there were questions about AES in DRILL-5634, should we
> exclude
> > it
> >>> from the candidates for 1.11.0 release?
> >>>
> >>> @Boaz
> >>> Do we have any ETA when DRILL-5616 and DRILL-5665 will pass code
> 

[GitHub] drill pull request #865: DRILL-5634 - Add Crypto and Hash Functions

2017-07-13 Thread arina-ielchiieva
Github user arina-ielchiieva commented on a diff in the pull request:

https://github.com/apache/drill/pull/865#discussion_r127193228
  
--- Diff: 
exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/CryptoFunctions.java
 ---
@@ -0,0 +1,389 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to you under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.drill.exec.expr.fn.impl;
+
+import io.netty.buffer.DrillBuf;
+import org.apache.drill.exec.expr.DrillSimpleFunc;
+import org.apache.drill.exec.expr.annotations.FunctionTemplate;
+import org.apache.drill.exec.expr.annotations.Output;
+import org.apache.drill.exec.expr.annotations.Param;
+import org.apache.drill.exec.expr.annotations.Workspace;
+import org.apache.drill.exec.expr.holders.VarCharHolder;
+
+import javax.crypto.Cipher;
+import javax.crypto.spec.SecretKeySpec;
+import javax.inject.Inject;
+
+public class CryptoFunctions {
+  static final org.slf4j.Logger logger = 
org.slf4j.LoggerFactory.getLogger(CryptoFunctions.class);
+
+  private CryptoFunctions() {
+  }
+
+  /**
+   * This class returns the md2 digest of a given input string.
+   *  Usage is SELECT md2(  ) FROM ...
+   */
+
+  @FunctionTemplate(name = "md2", scope = 
FunctionTemplate.FunctionScope.SIMPLE, nulls = 
FunctionTemplate.NullHandling.NULL_IF_NULL)
+  public static class MD2Function implements DrillSimpleFunc {
+
+@Param
+VarCharHolder rawInput;
+
+@Output
+VarCharHolder out;
+
+@Inject
+DrillBuf buffer;
+
+@Override
+public void setup() {
+}
+
+@Override
+public void eval() {
+
+  String input = 
org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(rawInput.start,
 rawInput.end, rawInput.buffer);
+  String outputString = 
org.apache.commons.codec.digest.DigestUtils.md2Hex(input).toLowerCase();
+
+  out.buffer = buffer;
+  out.start = 0;
+  out.end = outputString.getBytes().length;
+  buffer.setBytes(0, outputString.getBytes());
+}
+
+  }
+
+  /**
+   * This function returns the MD5 digest of a given input string.
+   *  Usage is shown below:
+   *  select md5( 'testing' ) from (VALUES(1));
+   */
+
+  @FunctionTemplate(name = "md5", scope = 
FunctionTemplate.FunctionScope.SIMPLE, nulls = 
FunctionTemplate.NullHandling.NULL_IF_NULL)
+  public static class MD5Function implements DrillSimpleFunc {
+
+@Param
+VarCharHolder rawInput;
+
+@Output
+VarCharHolder out;
+
+@Inject
+DrillBuf buffer;
+
+@Override
+public void setup() {
+}
+
+@Override
+public void eval() {
+
+  String input = 
org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(rawInput.start,
 rawInput.end, rawInput.buffer);
+  String outputString = 
org.apache.commons.codec.digest.DigestUtils.md5Hex(input).toLowerCase();
+
+  out.buffer = buffer;
+  out.start = 0;
+  out.end = outputString.getBytes().length;
+  buffer.setBytes(0, outputString.getBytes());
+}
+
+  }
+
+  /**
+   * sha() / sha1(): Calculates an SHA-1 160-bit checksum for 
the string, as described in RFC 3174 (Secure Hash Algorithm).
+   * (https://en.wikipedia.org/wiki/SHA-1) The value is returned as a 
string of 40 hexadecimal digits, or NULL if the argument was NULL.
+   * Note that sha() and sha1() are aliases for the same function.
+   *
+   * > select sha1( 'testing' ) from (VALUES(1));
+   */
+
+  @FunctionTemplate(names = {"sha", "sha1"}, scope = 
FunctionTemplate.FunctionScope.SIMPLE, nulls = 
FunctionTemplate.NullHandling.NULL_IF_NULL)
+  public static class SHA1Function implements DrillSimpleFunc {
+
+@Param
+VarCharHolder rawInput;
+
+@Output
+VarCharHolder out;
+
+@Inject
+DrillBuf buffer;
+
+

[GitHub] drill pull request #865: DRILL-5634 - Add Crypto and Hash Functions

2017-07-13 Thread arina-ielchiieva
Github user arina-ielchiieva commented on a diff in the pull request:

https://github.com/apache/drill/pull/865#discussion_r127193320
  
--- Diff: 
exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/CryptoFunctions.java
 ---

[GitHub] drill pull request #865: DRILL-5634 - Add Crypto and Hash Functions

2017-07-13 Thread arina-ielchiieva
Github user arina-ielchiieva commented on a diff in the pull request:

https://github.com/apache/drill/pull/865#discussion_r127193028
  
--- Diff: 
exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/CryptoFunctions.java
 ---

[GitHub] drill pull request #865: DRILL-5634 - Add Crypto and Hash Functions

2017-07-13 Thread arina-ielchiieva
Github user arina-ielchiieva commented on a diff in the pull request:

https://github.com/apache/drill/pull/865#discussion_r127193358
  
--- Diff: 
exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/CryptoFunctions.java
 ---

[GitHub] drill pull request #865: DRILL-5634 - Add Crypto and Hash Functions

2017-07-13 Thread arina-ielchiieva
Github user arina-ielchiieva commented on a diff in the pull request:

https://github.com/apache/drill/pull/865#discussion_r127192939
  
--- Diff: 
exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/CryptoFunctions.java
 ---

[GitHub] drill pull request #865: DRILL-5634 - Add Crypto and Hash Functions

2017-07-13 Thread arina-ielchiieva
Github user arina-ielchiieva commented on a diff in the pull request:

https://github.com/apache/drill/pull/865#discussion_r127193061
  
--- Diff: 
exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/CryptoFunctions.java
 ---
@@ -0,0 +1,389 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to you under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.drill.exec.expr.fn.impl;
+
+import io.netty.buffer.DrillBuf;
+import org.apache.drill.exec.expr.DrillSimpleFunc;
+import org.apache.drill.exec.expr.annotations.FunctionTemplate;
+import org.apache.drill.exec.expr.annotations.Output;
+import org.apache.drill.exec.expr.annotations.Param;
+import org.apache.drill.exec.expr.annotations.Workspace;
+import org.apache.drill.exec.expr.holders.VarCharHolder;
+
+import javax.crypto.Cipher;
+import javax.crypto.spec.SecretKeySpec;
+import javax.inject.Inject;
+
+public class CryptoFunctions {
+  static final org.slf4j.Logger logger = 
org.slf4j.LoggerFactory.getLogger(CryptoFunctions.class);
+
+  private CryptoFunctions() {
+  }
+
+  /**
+   * This class returns the md2 digest of a given input string.
+   *  Usage is SELECT md2(  ) FROM ...
+   */
+
+  @FunctionTemplate(name = "md2", scope = 
FunctionTemplate.FunctionScope.SIMPLE, nulls = 
FunctionTemplate.NullHandling.NULL_IF_NULL)
+  public static class MD2Function implements DrillSimpleFunc {
+
+@Param
+VarCharHolder rawInput;
+
+@Output
+VarCharHolder out;
+
+@Inject
+DrillBuf buffer;
+
+@Override
+public void setup() {
+}
+
+@Override
+public void eval() {
+
+  String input = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(rawInput.start, rawInput.end, rawInput.buffer);
+  String outputString = org.apache.commons.codec.digest.DigestUtils.md2Hex(input).toLowerCase();
+
+  out.buffer = buffer;
+  out.start = 0;
+  out.end = outputString.getBytes().length;
+  buffer.setBytes(0, outputString.getBytes());
+}
+
+  }
+
+  /**
+   * This function returns the MD5 digest of a given input string.
+   *  Usage is shown below:
+   *  select md5( 'testing' ) from (VALUES(1));
+   */
+
+  @FunctionTemplate(name = "md5", scope = 
FunctionTemplate.FunctionScope.SIMPLE, nulls = 
FunctionTemplate.NullHandling.NULL_IF_NULL)
+  public static class MD5Function implements DrillSimpleFunc {
+
+@Param
+VarCharHolder rawInput;
+
+@Output
+VarCharHolder out;
+
+@Inject
+DrillBuf buffer;
+
+@Override
+public void setup() {
+}
+
+@Override
+public void eval() {
+
+  String input = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(rawInput.start, rawInput.end, rawInput.buffer);
+  String outputString = org.apache.commons.codec.digest.DigestUtils.md5Hex(input).toLowerCase();
+
+  out.buffer = buffer;
+  out.start = 0;
+  out.end = outputString.getBytes().length;
+  buffer.setBytes(0, outputString.getBytes());
+}
+
+  }
+
+  /**
+   * sha() / sha1(): Calculates an SHA-1 160-bit checksum for the string, as described in RFC 3174 (Secure Hash Algorithm).
+   * (https://en.wikipedia.org/wiki/SHA-1) The value is returned as a string of 40 hexadecimal digits, or NULL if the argument was NULL.
+   * Note that sha() and sha1() are aliases for the same function.
+   *
+   * > select sha1( 'testing' ) from (VALUES(1));
+   */
+
+  @FunctionTemplate(names = {"sha", "sha1"}, scope = FunctionTemplate.FunctionScope.SIMPLE, nulls = FunctionTemplate.NullHandling.NULL_IF_NULL)
+  public static class SHA1Function implements DrillSimpleFunc {
+
+@Param
+VarCharHolder rawInput;
+
+@Output
+VarCharHolder out;
+
+@Inject
+DrillBuf buffer;
+
+
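
Every eval() in the quoted diff follows the same two-step shape: decode the
VarChar input to a Java String, hash it with commons-codec, then write the hex
result back through the injected DrillBuf. Below is a minimal sketch of that
flow with the Drill holder plumbing stripped out; the variable names mirror the
diff, while the Md5Demo wrapper and the sample input are assumptions for
illustration:

    import org.apache.commons.codec.digest.DigestUtils;

    public class Md5Demo {
      public static void main(String[] args) {
        String input = "testing";                        // stands in for toStringFromUTF8(...)
        String outputString = DigestUtils.md5Hex(input); // md5Hex already returns lowercase hex
        byte[] outBytes = outputString.getBytes();       // hex digits are ASCII: 32 bytes for MD5
        // In the UDF, out.start = 0 and out.end = outBytes.length bracket
        // exactly these bytes once buffer.setBytes(0, outBytes) has copied
        // them into the value buffer.
        System.out.println(outputString + " (" + outBytes.length + " bytes)");
      }
    }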

[GitHub] drill pull request #865: DRILL-5634 - Add Crypto and Hash Functions

2017-07-13 Thread arina-ielchiieva
Github user arina-ielchiieva commented on a diff in the pull request:

https://github.com/apache/drill/pull/865#discussion_r127193243
  
--- Diff: exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/CryptoFunctions.java ---

[GitHub] drill pull request #865: DRILL-5634 - Add Crypto and Hash Functions

2017-07-13 Thread arina-ielchiieva
Github user arina-ielchiieva commented on a diff in the pull request:

https://github.com/apache/drill/pull/865#discussion_r127193132
  
--- Diff: exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/CryptoFunctions.java ---
@@ -0,0 +1,389 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to you under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.drill.exec.expr.fn.impl;
+
+import io.netty.buffer.DrillBuf;
+import org.apache.drill.exec.expr.DrillSimpleFunc;
+import org.apache.drill.exec.expr.annotations.FunctionTemplate;
+import org.apache.drill.exec.expr.annotations.Output;
+import org.apache.drill.exec.expr.annotations.Param;
+import org.apache.drill.exec.expr.annotations.Workspace;
+import org.apache.drill.exec.expr.holders.VarCharHolder;
+
+import javax.crypto.Cipher;
+import javax.crypto.spec.SecretKeySpec;
+import javax.inject.Inject;
+
+public class CryptoFunctions {
+  static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(CryptoFunctions.class);
--- End diff ---

Can be removed since we don't need it anymore. See explanation below.
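
In concrete terms the suggested cleanup is simply deleting that field; the
class opening would then read (a sketch only, with everything else unchanged):

    public class CryptoFunctions {

      private CryptoFunctions() {
      }

      // ... function classes as above ...
    }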

