http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringExpr.java ---------------------------------------------------------------------- diff --git a/java/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringExpr.java b/java/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringExpr.java new file mode 100644 index 0000000..90817a5 --- /dev/null +++ b/java/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringExpr.java @@ -0,0 +1,354 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.expressions; + +import java.util.Arrays; + +import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; + +/** + * String expression evaluation helper functions. + */ +public class StringExpr { + + /* Compare two strings from two byte arrays each + * with their own start position and length. + * Use lexicographic unsigned byte value order. + * This is what's used for UTF-8 sort order. + * Return negative value if arg1 < arg2, 0 if arg1 = arg2, + * positive if arg1 > arg2. + */ + public static int compare(byte[] arg1, int start1, int len1, byte[] arg2, int start2, int len2) { + for (int i = 0; i < len1 && i < len2; i++) { + // Note the "& 0xff" is just a way to convert unsigned bytes to signed integer. + int b1 = arg1[i + start1] & 0xff; + int b2 = arg2[i + start2] & 0xff; + if (b1 != b2) { + return b1 - b2; + } + } + return len1 - len2; + } + + /* Determine if two strings are equal from two byte arrays each + * with their own start position and length. + * Use lexicographic unsigned byte value order. + * This is what's used for UTF-8 sort order. 
+ */ + public static boolean equal(byte[] arg1, final int start1, final int len1, + byte[] arg2, final int start2, final int len2) { + if (len1 != len2) { + return false; + } + if (len1 == 0) { + return true; + } + + // do bounds check for OOB exception + if (arg1[start1] != arg2[start2] + || arg1[start1 + len1 - 1] != arg2[start2 + len2 - 1]) { + return false; + } + + if (len1 == len2) { + // prove invariant to the compiler: len1 = len2 + // all array access between (start1, start1+len1) + // and (start2, start2+len2) are valid + // no more OOB exceptions are possible + final int step = 8; + final int remainder = len1 % step; + final int wlen = len1 - remainder; + // suffix first + for (int i = wlen; i < len1; i++) { + if (arg1[start1 + i] != arg2[start2 + i]) { + return false; + } + } + // SIMD loop + for (int i = 0; i < wlen; i += step) { + final int s1 = start1 + i; + final int s2 = start2 + i; + boolean neq = false; + for (int j = 0; j < step; j++) { + neq = (arg1[s1 + j] != arg2[s2 + j]) || neq; + } + if (neq) { + return false; + } + } + } + + return true; + } + + public static int characterCount(byte[] bytes) { + int end = bytes.length; + + // count characters + int j = 0; + int charCount = 0; + while(j < end) { + // UTF-8 continuation bytes have 2 high bits equal to 0x80. + if ((bytes[j] & 0xc0) != 0x80) { + ++charCount; + } + j++; + } + return charCount; + } + + public static int characterCount(byte[] bytes, int start, int length) { + int end = start + length; + + // count characters + int j = start; + int charCount = 0; + while(j < end) { + // UTF-8 continuation bytes have 2 high bits equal to 0x80. + if ((bytes[j] & 0xc0) != 0x80) { + ++charCount; + } + j++; + } + return charCount; + } + + // A setVal with the same function signature as rightTrim, leftTrim, truncate, etc, below. + // Useful for class generation via templates. + public static void assign(BytesColumnVector outV, int i, byte[] bytes, int start, int length) { + // set output vector + outV.setVal(i, bytes, start, length); + } + + /* + * Right trim a slice of a byte array and return the new byte length. + */ + public static int rightTrim(byte[] bytes, int start, int length) { + // skip trailing blank characters + int j = start + length - 1; + while(j >= start && bytes[j] == 0x20) { + j--; + } + + return (j - start) + 1; + } + + /* + * Right trim a slice of a byte array and place the result into element i of a vector. + */ + public static void rightTrim(BytesColumnVector outV, int i, byte[] bytes, int start, int length) { + // skip trailing blank characters + int j = start + length - 1; + while(j >= start && bytes[j] == 0x20) { + j--; + } + + // set output vector + outV.setVal(i, bytes, start, (j - start) + 1); + } + + /* + * Truncate a slice of a byte array to a maximum number of characters and + * return the new byte length. + */ + public static int truncate(byte[] bytes, int start, int length, int maxLength) { + int end = start + length; + + // count characters forward + int j = start; + int charCount = 0; + while(j < end) { + // UTF-8 continuation bytes have 2 high bits equal to 0x80. + if ((bytes[j] & 0xc0) != 0x80) { + if (charCount == maxLength) { + break; + } + ++charCount; + } + j++; + } + return (j - start); + } + + /* + * Truncate a slice of a byte array to a maximum number of characters and + * place the result into element i of a vector. 
+ */ + public static void truncate(BytesColumnVector outV, int i, byte[] bytes, int start, int length, int maxLength) { + int end = start + length; + + // count characters forward + int j = start; + int charCount = 0; + while(j < end) { + // UTF-8 continuation bytes have 2 high bits equal to 0x80. + if ((bytes[j] & 0xc0) != 0x80) { + if (charCount == maxLength) { + break; + } + ++charCount; + } + j++; + } + + // set output vector + outV.setVal(i, bytes, start, (j - start)); + } + + /* + * Truncate a byte array to a maximum number of characters and + * return a byte array with only truncated bytes. + */ + public static byte[] truncateScalar(byte[] bytes, int maxLength) { + int end = bytes.length; + + // count characters forward + int j = 0; + int charCount = 0; + while(j < end) { + // UTF-8 continuation bytes have 2 high bits equal to 0x80. + if ((bytes[j] & 0xc0) != 0x80) { + if (charCount == maxLength) { + break; + } + ++charCount; + } + j++; + } + if (j == end) { + return bytes; + } else { + return Arrays.copyOf(bytes, j); + } + } + + /* + * Right trim and truncate a slice of a byte array to a maximum number of characters and + * return the new byte length. + */ + public static int rightTrimAndTruncate(byte[] bytes, int start, int length, int maxLength) { + int end = start + length; + + // count characters forward and watch for final run of pads + int j = start; + int charCount = 0; + int padRunStart = -1; + while(j < end) { + // UTF-8 continuation bytes have 2 high bits equal to 0x80. + if ((bytes[j] & 0xc0) != 0x80) { + if (charCount == maxLength) { + break; + } + if (bytes[j] == 0x20) { + if (padRunStart == -1) { + padRunStart = j; + } + } else { + padRunStart = -1; + } + ++charCount; + } else { + padRunStart = -1; + } + j++; + } + if (padRunStart != -1) { + return (padRunStart - start); + } else { + return (j - start); + } + } + + /* + * Right trim and truncate a slice of a byte array to a maximum number of characters and + * place the result into element i of a vector. + */ + public static void rightTrimAndTruncate(BytesColumnVector outV, int i, byte[] bytes, int start, int length, int maxLength) { + int end = start + length; + + // count characters forward and watch for final run of pads + int j = start; + int charCount = 0; + int padRunStart = -1; + while(j < end) { + // UTF-8 continuation bytes have 2 high bits equal to 0x80. + if ((bytes[j] & 0xc0) != 0x80) { + if (charCount == maxLength) { + break; + } + if (bytes[j] == 0x20) { + if (padRunStart == -1) { + padRunStart = j; + } + } else { + padRunStart = -1; + } + ++charCount; + } else { + padRunStart = -1; + } + j++; + } + // set output vector + if (padRunStart != -1) { + outV.setVal(i, bytes, start, (padRunStart - start)); + } else { + outV.setVal(i, bytes, start, (j - start) ); + } + } + + /* + * Right trim and truncate a byte array to a maximum number of characters and + * return a byte array with only the trimmed and truncated bytes. + */ + public static byte[] rightTrimAndTruncateScalar(byte[] bytes, int maxLength) { + int end = bytes.length; + + // count characters forward and watch for final run of pads + int j = 0; + int charCount = 0; + int padRunStart = -1; + while(j < end) { + // UTF-8 continuation bytes have 2 high bits equal to 0x80. 
+ if ((bytes[j] & 0xc0) != 0x80) { + if (charCount == maxLength) { + break; + } + if (bytes[j] == 0x20) { + if (padRunStart == -1) { + padRunStart = j; + } + } else { + padRunStart = -1; + } + ++charCount; + } else { + padRunStart = -1; + } + j++; + } + if (padRunStart != -1) { + return Arrays.copyOf(bytes, padRunStart); + } else if (j == end) { + return bytes; + } else { + return Arrays.copyOf(bytes, j); + } + } +}
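As a quick orientation to the StringExpr helper class added above, here is a minimal usage sketch (not part of the commit). The class and method names come from the diff; the driver class and sample strings are hypothetical, and the expected outputs noted in comments follow from the semantics documented in the javadoc.

    import java.nio.charset.StandardCharsets;
    import org.apache.hadoop.hive.ql.exec.vector.expressions.StringExpr;

    public class StringExprSketch {
      public static void main(String[] args) {
        byte[] a = "naive".getBytes(StandardCharsets.UTF_8);
        byte[] b = "naïve".getBytes(StandardCharsets.UTF_8);   // 'ï' encodes as two bytes

        // Unsigned lexicographic comparison over the full byte slices.
        int cmp = StringExpr.compare(a, 0, a.length, b, 0, b.length);
        System.out.println("compare = " + cmp);                 // negative: 0x69 ('i') < 0xC3

        // equal() short-circuits on length, then checks first/last bytes before the main loop.
        System.out.println("equal = " +
            StringExpr.equal(a, 0, a.length, b, 0, b.length));  // false (5 vs 6 bytes)

        // characterCount() counts code points by skipping UTF-8 continuation bytes.
        System.out.println("chars = " + StringExpr.characterCount(b));  // 5

        // truncate() limits by character count, so the multi-byte 'ï' stays intact.
        int newLen = StringExpr.truncate(b, 0, b.length, 3);
        System.out.println(new String(b, 0, newLen, StandardCharsets.UTF_8));  // "naï" (4 bytes)

        // rightTrim() drops trailing ASCII spaces (0x20) and returns the new byte length.
        byte[] padded = "abc   ".getBytes(StandardCharsets.UTF_8);
        System.out.println(StringExpr.rightTrim(padded, 0, padded.length));    // 3
      }
    }
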
http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/storage-api/src/java/org/apache/hadoop/hive/ql/io/sarg/ExpressionTree.java ---------------------------------------------------------------------- diff --git a/java/storage-api/src/java/org/apache/hadoop/hive/ql/io/sarg/ExpressionTree.java b/java/storage-api/src/java/org/apache/hadoop/hive/ql/io/sarg/ExpressionTree.java new file mode 100644 index 0000000..443083d --- /dev/null +++ b/java/storage-api/src/java/org/apache/hadoop/hive/ql/io/sarg/ExpressionTree.java @@ -0,0 +1,160 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.io.sarg; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +/** + * The inner representation of the SearchArgument. Most users should not + * need this interface, it is only for file formats that need to translate + * the SearchArgument into an internal form. + */ +public class ExpressionTree { + public enum Operator {OR, AND, NOT, LEAF, CONSTANT} + private final Operator operator; + private final List<ExpressionTree> children; + private int leaf; + private final SearchArgument.TruthValue constant; + + ExpressionTree() { + operator = null; + children = null; + leaf = 0; + constant = null; + } + + ExpressionTree(Operator op, ExpressionTree... 
kids) { + operator = op; + children = new ArrayList<ExpressionTree>(); + leaf = -1; + this.constant = null; + Collections.addAll(children, kids); + } + + ExpressionTree(int leaf) { + operator = Operator.LEAF; + children = null; + this.leaf = leaf; + this.constant = null; + } + + ExpressionTree(SearchArgument.TruthValue constant) { + operator = Operator.CONSTANT; + children = null; + this.leaf = -1; + this.constant = constant; + } + + ExpressionTree(ExpressionTree other) { + this.operator = other.operator; + if (other.children == null) { + this.children = null; + } else { + this.children = new ArrayList<ExpressionTree>(); + for(ExpressionTree child: other.children) { + children.add(new ExpressionTree(child)); + } + } + this.leaf = other.leaf; + this.constant = other.constant; + } + + public SearchArgument.TruthValue evaluate(SearchArgument.TruthValue[] leaves + ) { + SearchArgument.TruthValue result = null; + switch (operator) { + case OR: + for(ExpressionTree child: children) { + result = child.evaluate(leaves).or(result); + } + return result; + case AND: + for(ExpressionTree child: children) { + result = child.evaluate(leaves).and(result); + } + return result; + case NOT: + return children.get(0).evaluate(leaves).not(); + case LEAF: + return leaves[leaf]; + case CONSTANT: + return constant; + default: + throw new IllegalStateException("Unknown operator: " + operator); + } + } + + @Override + public String toString() { + StringBuilder buffer = new StringBuilder(); + switch (operator) { + case OR: + buffer.append("(or"); + for(ExpressionTree child: children) { + buffer.append(' '); + buffer.append(child.toString()); + } + buffer.append(')'); + break; + case AND: + buffer.append("(and"); + for(ExpressionTree child: children) { + buffer.append(' '); + buffer.append(child.toString()); + } + buffer.append(')'); + break; + case NOT: + buffer.append("(not "); + buffer.append(children.get(0)); + buffer.append(')'); + break; + case LEAF: + buffer.append("leaf-"); + buffer.append(leaf); + break; + case CONSTANT: + buffer.append(constant); + break; + } + return buffer.toString(); + } + + public Operator getOperator() { + return operator; + } + + public List<ExpressionTree> getChildren() { + return children; + } + + public SearchArgument.TruthValue getConstant() { + return constant; + } + + public int getLeaf() { + return leaf; + } + + public void setLeaf(int leaf) { + this.leaf = leaf; + } +} http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/storage-api/src/java/org/apache/hadoop/hive/ql/io/sarg/PredicateLeaf.java ---------------------------------------------------------------------- diff --git a/java/storage-api/src/java/org/apache/hadoop/hive/ql/io/sarg/PredicateLeaf.java b/java/storage-api/src/java/org/apache/hadoop/hive/ql/io/sarg/PredicateLeaf.java new file mode 100644 index 0000000..469a3da --- /dev/null +++ b/java/storage-api/src/java/org/apache/hadoop/hive/ql/io/sarg/PredicateLeaf.java @@ -0,0 +1,102 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.io.sarg; + +import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable; + +import java.sql.Date; +import java.sql.Timestamp; +import java.util.List; + +/** + * The primitive predicates that form a SearchArgument. + */ +public interface PredicateLeaf { + + /** + * The possible operators for predicates. To get the opposites, construct + * an expression with a not operator. + */ + public static enum Operator { + EQUALS, + NULL_SAFE_EQUALS, + LESS_THAN, + LESS_THAN_EQUALS, + IN, + BETWEEN, + IS_NULL + } + + /** + * The possible types for sargs. + */ + public static enum Type { + LONG(Long.class), // all of the integer types + FLOAT(Double.class), // float and double + STRING(String.class), // string, char, varchar + DATE(Date.class), + DECIMAL(HiveDecimalWritable.class), + TIMESTAMP(Timestamp.class), + BOOLEAN(Boolean.class); + + private final Class cls; + Type(Class cls) { + this.cls = cls; + } + + /** + * For all SARG leaves, the values must be the matching class. + * @return the value class + */ + public Class getValueClass() { + return cls; + } + } + + /** + * Get the operator for the leaf. + */ + public Operator getOperator(); + + /** + * Get the type of the column and literal by the file format. + */ + public Type getType(); + + /** + * Get the simple column name. + * @return the column name + */ + public String getColumnName(); + + /** + * Get the literal half of the predicate leaf. Adapt the original type for what orc needs + * + * @return an Integer, Long, Double, or String + */ + public Object getLiteral(); + + /** + * For operators with multiple literals (IN and BETWEEN), get the literals. + * + * @return the list of literals (Integer, Longs, Doubles, or Strings) + * + */ + public List<Object> getLiteralList(); +} http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/storage-api/src/java/org/apache/hadoop/hive/ql/io/sarg/SearchArgument.java ---------------------------------------------------------------------- diff --git a/java/storage-api/src/java/org/apache/hadoop/hive/ql/io/sarg/SearchArgument.java b/java/storage-api/src/java/org/apache/hadoop/hive/ql/io/sarg/SearchArgument.java new file mode 100644 index 0000000..d70b3b0 --- /dev/null +++ b/java/storage-api/src/java/org/apache/hadoop/hive/ql/io/sarg/SearchArgument.java @@ -0,0 +1,287 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.io.sarg; + +import java.util.List; + +/** + * Primary interface for <a href="http://en.wikipedia.org/wiki/Sargable"> + * SearchArgument</a>, which are the subset of predicates + * that can be pushed down to the RecordReader. Each SearchArgument consists + * of a series of SearchClauses that must each be true for the row to be + * accepted by the filter. + * + * This requires that the filter be normalized into conjunctive normal form + * (<a href="http://en.wikipedia.org/wiki/Conjunctive_normal_form">CNF</a>). + */ +public interface SearchArgument { + + /** + * The potential result sets of logical operations. + */ + public static enum TruthValue { + YES, NO, NULL, YES_NULL, NO_NULL, YES_NO, YES_NO_NULL; + + /** + * Compute logical or between the two values. + * @param right the other argument or null + * @return the result + */ + public TruthValue or(TruthValue right) { + if (right == null || right == this) { + return this; + } + if (right == YES || this == YES) { + return YES; + } + if (right == YES_NULL || this == YES_NULL) { + return YES_NULL; + } + if (right == NO) { + return this; + } + if (this == NO) { + return right; + } + if (this == NULL) { + if (right == NO_NULL) { + return NULL; + } else { + return YES_NULL; + } + } + if (right == NULL) { + if (this == NO_NULL) { + return NULL; + } else { + return YES_NULL; + } + } + return YES_NO_NULL; + } + + /** + * Compute logical AND between the two values. + * @param right the other argument or null + * @return the result + */ + public TruthValue and(TruthValue right) { + if (right == null || right == this) { + return this; + } + if (right == NO || this == NO) { + return NO; + } + if (right == NO_NULL || this == NO_NULL) { + return NO_NULL; + } + if (right == YES) { + return this; + } + if (this == YES) { + return right; + } + if (this == NULL) { + if (right == YES_NULL) { + return NULL; + } else { + return NO_NULL; + } + } + if (right == NULL) { + if (this == YES_NULL) { + return NULL; + } else { + return NO_NULL; + } + } + return YES_NO_NULL; + } + + public TruthValue not() { + switch (this) { + case NO: + return YES; + case YES: + return NO; + case NULL: + case YES_NO: + case YES_NO_NULL: + return this; + case NO_NULL: + return YES_NULL; + case YES_NULL: + return NO_NULL; + default: + throw new IllegalArgumentException("Unknown value: " + this); + } + } + + /** + * Does the RecordReader need to include this set of records? + * @return true unless none of the rows qualify + */ + public boolean isNeeded() { + switch (this) { + case NO: + case NULL: + case NO_NULL: + return false; + default: + return true; + } + } + } + + /** + * Get the leaf predicates that are required to evaluate the predicate. The + * list will have the duplicates removed. + * @return the list of leaf predicates + */ + public List<PredicateLeaf> getLeaves(); + + /** + * Get the expression tree. This should only needed for file formats that + * need to translate the expression to an internal form. + */ + public ExpressionTree getExpression(); + + /** + * Evaluate the entire predicate based on the values for the leaf predicates. + * @param leaves the value of each leaf predicate + * @return the value of hte entire predicate + */ + public TruthValue evaluate(TruthValue[] leaves); + + /** + * A builder object for contexts outside of Hive where it isn't easy to + * get a ExprNodeDesc. 
The user must call startOr, startAnd, or startNot + * before adding any leaves. + */ + public interface Builder { + + /** + * Start building an or operation and push it on the stack. + * @return this + */ + public Builder startOr(); + + /** + * Start building an and operation and push it on the stack. + * @return this + */ + public Builder startAnd(); + + /** + * Start building a not operation and push it on the stack. + * @return this + */ + public Builder startNot(); + + /** + * Finish the current operation and pop it off of the stack. Each start + * call must have a matching end. + * @return this + */ + public Builder end(); + + /** + * Add a less than leaf to the current item on the stack. + * @param column the name of the column + * @param type the type of the expression + * @param literal the literal + * @return this + */ + public Builder lessThan(String column, PredicateLeaf.Type type, + Object literal); + + /** + * Add a less than equals leaf to the current item on the stack. + * @param column the name of the column + * @param type the type of the expression + * @param literal the literal + * @return this + */ + public Builder lessThanEquals(String column, PredicateLeaf.Type type, + Object literal); + + /** + * Add an equals leaf to the current item on the stack. + * @param column the name of the column + * @param type the type of the expression + * @param literal the literal + * @return this + */ + public Builder equals(String column, PredicateLeaf.Type type, + Object literal); + + /** + * Add a null safe equals leaf to the current item on the stack. + * @param column the name of the column + * @param type the type of the expression + * @param literal the literal + * @return this + */ + public Builder nullSafeEquals(String column, PredicateLeaf.Type type, + Object literal); + + /** + * Add an in leaf to the current item on the stack. + * @param column the name of the column + * @param type the type of the expression + * @param literal the literal + * @return this + */ + public Builder in(String column, PredicateLeaf.Type type, + Object... literal); + + /** + * Add an is null leaf to the current item on the stack. + * @param column the name of the column + * @param type the type of the expression + * @return this + */ + public Builder isNull(String column, PredicateLeaf.Type type); + + /** + * Add a between leaf to the current item on the stack. + * @param column the name of the column + * @param type the type of the expression + * @param lower the literal + * @param upper the literal + * @return this + */ + public Builder between(String column, PredicateLeaf.Type type, + Object lower, Object upper); + + /** + * Add a truth value to the expression. + * @param truth + * @return this + */ + public Builder literal(TruthValue truth); + + /** + * Build and return the SearchArgument that has been defined. All of the + * starts must have been ended before this call. 
+ * @return the new SearchArgument + */ + public SearchArgument build(); + } +} http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/storage-api/src/java/org/apache/hadoop/hive/ql/io/sarg/SearchArgumentFactory.java ---------------------------------------------------------------------- diff --git a/java/storage-api/src/java/org/apache/hadoop/hive/ql/io/sarg/SearchArgumentFactory.java b/java/storage-api/src/java/org/apache/hadoop/hive/ql/io/sarg/SearchArgumentFactory.java new file mode 100644 index 0000000..8fda95c --- /dev/null +++ b/java/storage-api/src/java/org/apache/hadoop/hive/ql/io/sarg/SearchArgumentFactory.java @@ -0,0 +1,31 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.io.sarg; + +/** + * A factory for creating SearchArguments, as well as modifying those created by this factory. + */ +public class SearchArgumentFactory { + public static SearchArgument.Builder newBuilder() { + return new SearchArgumentImpl.BuilderImpl(); + } + public static void setPredicateLeafColumn(PredicateLeaf leaf, String newName) { + SearchArgumentImpl.PredicateLeafImpl.setColumnName(leaf, newName); + } +} http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/storage-api/src/java/org/apache/hadoop/hive/ql/io/sarg/SearchArgumentImpl.java ---------------------------------------------------------------------- diff --git a/java/storage-api/src/java/org/apache/hadoop/hive/ql/io/sarg/SearchArgumentImpl.java b/java/storage-api/src/java/org/apache/hadoop/hive/ql/io/sarg/SearchArgumentImpl.java new file mode 100644 index 0000000..10d8c51 --- /dev/null +++ b/java/storage-api/src/java/org/apache/hadoop/hive/ql/io/sarg/SearchArgumentImpl.java @@ -0,0 +1,703 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.hive.ql.io.sarg; + +import java.sql.Timestamp; +import java.util.ArrayDeque; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Deque; +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Queue; +import java.util.Set; + +/** + * The implementation of SearchArguments. Visible for testing only. + */ +public final class SearchArgumentImpl implements SearchArgument { + + public static final class PredicateLeafImpl implements PredicateLeaf { + private final Operator operator; + private final Type type; + private String columnName; + private final Object literal; + private final List<Object> literalList; + + // Used by kryo + @SuppressWarnings("unused") + PredicateLeafImpl() { + operator = null; + type = null; + columnName = null; + literal = null; + literalList = null; + } + + public PredicateLeafImpl(Operator operator, + Type type, + String columnName, + Object literal, + List<Object> literalList) { + this.operator = operator; + this.type = type; + this.columnName = columnName; + this.literal = literal; + if (literal != null) { + if (literal.getClass() != type.getValueClass()) { + throw new IllegalArgumentException("Wrong value class " + + literal.getClass().getName() + " for " + type + "." + operator + + " leaf"); + } + } + this.literalList = literalList; + if (literalList != null) { + Class valueCls = type.getValueClass(); + for(Object lit: literalList) { + if (lit != null && lit.getClass() != valueCls) { + throw new IllegalArgumentException("Wrong value class item " + + lit.getClass().getName() + " for " + type + "." + operator + + " leaf"); + } + } + } + } + + @Override + public Operator getOperator() { + return operator; + } + + @Override + public Type getType(){ + return type; + } + + @Override + public String getColumnName() { + return columnName; + } + + @Override + public Object getLiteral() { + // To get around a kryo 2.22 bug while deserialize a Timestamp into Date + // (https://github.com/EsotericSoftware/kryo/issues/88) + // When we see a Date, convert back into Timestamp + if (literal instanceof java.util.Date) { + return new Timestamp(((java.util.Date)literal).getTime()); + } + return literal; + } + + @Override + public List<Object> getLiteralList() { + return literalList; + } + + @Override + public String toString() { + StringBuilder buffer = new StringBuilder(); + buffer.append('('); + buffer.append(operator); + buffer.append(' '); + buffer.append(columnName); + if (literal != null) { + buffer.append(' '); + buffer.append(literal); + } else if (literalList != null) { + for(Object lit: literalList) { + buffer.append(' '); + buffer.append(lit == null ? 
"null" : lit.toString()); + } + } + buffer.append(')'); + return buffer.toString(); + } + + private static boolean isEqual(Object left, Object right) { + + return left == right || + (left != null && right != null && left.equals(right)); + } + + @Override + public boolean equals(Object other) { + if (other == null || other.getClass() != getClass()) { + return false; + } else if (other == this) { + return true; + } else { + PredicateLeafImpl o = (PredicateLeafImpl) other; + return operator == o.operator && + type == o.type && + columnName.equals(o.columnName) && + isEqual(literal, o.literal) && + isEqual(literalList, o.literalList); + } + } + + @Override + public int hashCode() { + return operator.hashCode() + + type.hashCode() * 17 + + columnName.hashCode() * 3 * 17+ + (literal == null ? 0 : literal.hashCode()) * 101 * 3 * 17 + + (literalList == null ? 0 : literalList.hashCode()) * + 103 * 101 * 3 * 17; + } + + public static void setColumnName(PredicateLeaf leaf, String newName) { + assert leaf instanceof PredicateLeafImpl; + ((PredicateLeafImpl)leaf).columnName = newName; + } + } + + private final List<PredicateLeaf> leaves; + private final ExpressionTree expression; + + SearchArgumentImpl(ExpressionTree expression, List<PredicateLeaf> leaves) { + this.expression = expression; + this.leaves = leaves; + } + + // Used by kyro + @SuppressWarnings("unused") + SearchArgumentImpl() { + leaves = null; + expression = null; + } + + @Override + public List<PredicateLeaf> getLeaves() { + return leaves; + } + + @Override + public TruthValue evaluate(TruthValue[] leaves) { + return expression == null ? TruthValue.YES : expression.evaluate(leaves); + } + + @Override + public ExpressionTree getExpression() { + return expression; + } + + @Override + public String toString() { + StringBuilder buffer = new StringBuilder(); + for(int i=0; i < leaves.size(); ++i) { + buffer.append("leaf-"); + buffer.append(i); + buffer.append(" = "); + buffer.append(leaves.get(i).toString()); + buffer.append(", "); + } + buffer.append("expr = "); + buffer.append(expression); + return buffer.toString(); + } + + static class BuilderImpl implements Builder { + + // max threshold for CNF conversion. 
having >8 elements in andList will be + // converted to maybe + private static final int CNF_COMBINATIONS_THRESHOLD = 256; + + private final Deque<ExpressionTree> currentTree = + new ArrayDeque<ExpressionTree>(); + private final Map<PredicateLeaf, Integer> leaves = + new HashMap<PredicateLeaf, Integer>(); + private final ExpressionTree root = + new ExpressionTree(ExpressionTree.Operator.AND); + { + currentTree.add(root); + } + + @Override + public Builder startOr() { + ExpressionTree node = new ExpressionTree(ExpressionTree.Operator.OR); + currentTree.getFirst().getChildren().add(node); + currentTree.addFirst(node); + return this; + } + + @Override + public Builder startAnd() { + ExpressionTree node = new ExpressionTree(ExpressionTree.Operator.AND); + currentTree.getFirst().getChildren().add(node); + currentTree.addFirst(node); + return this; + } + + @Override + public Builder startNot() { + ExpressionTree node = new ExpressionTree(ExpressionTree.Operator.NOT); + currentTree.getFirst().getChildren().add(node); + currentTree.addFirst(node); + return this; + } + + @Override + public Builder end() { + ExpressionTree current = currentTree.removeFirst(); + if (current.getChildren().size() == 0) { + throw new IllegalArgumentException("Can't create expression " + root + + " with no children."); + } + if (current.getOperator() == ExpressionTree.Operator.NOT && + current.getChildren().size() != 1) { + throw new IllegalArgumentException("Can't create not expression " + + current + " with more than 1 child."); + } + return this; + } + + private int addLeaf(PredicateLeaf leaf) { + Integer result = leaves.get(leaf); + if (result == null) { + int id = leaves.size(); + leaves.put(leaf, id); + return id; + } else { + return result; + } + } + + @Override + public Builder lessThan(String column, PredicateLeaf.Type type, + Object literal) { + ExpressionTree parent = currentTree.getFirst(); + if (column == null || literal == null) { + parent.getChildren().add(new ExpressionTree(TruthValue.YES_NO_NULL)); + } else { + PredicateLeaf leaf = + new PredicateLeafImpl(PredicateLeaf.Operator.LESS_THAN, + type, column, literal, null); + parent.getChildren().add(new ExpressionTree(addLeaf(leaf))); + } + return this; + } + + @Override + public Builder lessThanEquals(String column, PredicateLeaf.Type type, + Object literal) { + ExpressionTree parent = currentTree.getFirst(); + if (column == null || literal == null) { + parent.getChildren().add(new ExpressionTree(TruthValue.YES_NO_NULL)); + } else { + PredicateLeaf leaf = + new PredicateLeafImpl(PredicateLeaf.Operator.LESS_THAN_EQUALS, + type, column, literal, null); + parent.getChildren().add(new ExpressionTree(addLeaf(leaf))); + } + return this; + } + + @Override + public Builder equals(String column, PredicateLeaf.Type type, + Object literal) { + ExpressionTree parent = currentTree.getFirst(); + if (column == null || literal == null) { + parent.getChildren().add(new ExpressionTree(TruthValue.YES_NO_NULL)); + } else { + PredicateLeaf leaf = + new PredicateLeafImpl(PredicateLeaf.Operator.EQUALS, + type, column, literal, null); + parent.getChildren().add(new ExpressionTree(addLeaf(leaf))); + } + return this; + } + + @Override + public Builder nullSafeEquals(String column, PredicateLeaf.Type type, + Object literal) { + ExpressionTree parent = currentTree.getFirst(); + if (column == null || literal == null) { + parent.getChildren().add(new ExpressionTree(TruthValue.YES_NO_NULL)); + } else { + PredicateLeaf leaf = + new 
PredicateLeafImpl(PredicateLeaf.Operator.NULL_SAFE_EQUALS, + type, column, literal, null); + parent.getChildren().add(new ExpressionTree(addLeaf(leaf))); + } + return this; + } + + @Override + public Builder in(String column, PredicateLeaf.Type type, + Object... literal) { + ExpressionTree parent = currentTree.getFirst(); + if (column == null || literal == null) { + parent.getChildren().add(new ExpressionTree(TruthValue.YES_NO_NULL)); + } else { + if (literal.length == 0) { + throw new IllegalArgumentException("Can't create in expression with " + + "no arguments"); + } + List<Object> argList = new ArrayList<Object>(); + argList.addAll(Arrays.asList(literal)); + + PredicateLeaf leaf = + new PredicateLeafImpl(PredicateLeaf.Operator.IN, + type, column, null, argList); + parent.getChildren().add(new ExpressionTree(addLeaf(leaf))); + } + return this; + } + + @Override + public Builder isNull(String column, PredicateLeaf.Type type) { + ExpressionTree parent = currentTree.getFirst(); + if (column == null) { + parent.getChildren().add(new ExpressionTree(TruthValue.YES_NO_NULL)); + } else { + PredicateLeaf leaf = + new PredicateLeafImpl(PredicateLeaf.Operator.IS_NULL, + type, column, null, null); + parent.getChildren().add(new ExpressionTree(addLeaf(leaf))); + } + return this; + } + + @Override + public Builder between(String column, PredicateLeaf.Type type, Object lower, + Object upper) { + ExpressionTree parent = currentTree.getFirst(); + if (column == null || lower == null || upper == null) { + parent.getChildren().add(new ExpressionTree(TruthValue.YES_NO_NULL)); + } else { + List<Object> argList = new ArrayList<Object>(); + argList.add(lower); + argList.add(upper); + PredicateLeaf leaf = + new PredicateLeafImpl(PredicateLeaf.Operator.BETWEEN, + type, column, null, argList); + parent.getChildren().add(new ExpressionTree(addLeaf(leaf))); + } + return this; + } + + @Override + public Builder literal(TruthValue truth) { + ExpressionTree parent = currentTree.getFirst(); + parent.getChildren().add(new ExpressionTree(truth)); + return this; + } + + /** + * Recursively explore the tree to find the leaves that are still reachable + * after optimizations. + * @param tree the node to check next + * @param next the next available leaf id + * @param leafReorder + * @return the next available leaf id + */ + static int compactLeaves(ExpressionTree tree, int next, int[] leafReorder) { + if (tree.getOperator() == ExpressionTree.Operator.LEAF) { + int oldLeaf = tree.getLeaf(); + if (leafReorder[oldLeaf] == -1) { + leafReorder[oldLeaf] = next++; + } + } else if (tree.getChildren() != null){ + for(ExpressionTree child: tree.getChildren()) { + next = compactLeaves(child, next, leafReorder); + } + } + return next; + } + + /** + * Rewrite expression tree to update the leaves. + * @param root the root of the tree to fix + * @param leafReorder a map from old leaf ids to new leaf ids + * @return the fixed root + */ + static ExpressionTree rewriteLeaves(ExpressionTree root, + int[] leafReorder) { + // The leaves could be shared in the tree. Use Set to remove the duplicates. 
+ Set<ExpressionTree> leaves = new HashSet<ExpressionTree>(); + Queue<ExpressionTree> nodes = new LinkedList<ExpressionTree>(); + nodes.add(root); + + while(!nodes.isEmpty()) { + ExpressionTree node = nodes.remove(); + if (node.getOperator() == ExpressionTree.Operator.LEAF) { + leaves.add(node); + } else { + if (node.getChildren() != null){ + nodes.addAll(node.getChildren()); + } + } + } + + // Update the leaf in place + for(ExpressionTree leaf : leaves) { + leaf.setLeaf(leafReorder[leaf.getLeaf()]); + } + + return root; + } + + @Override + public SearchArgument build() { + if (currentTree.size() != 1) { + throw new IllegalArgumentException("Failed to end " + + currentTree.size() + " operations."); + } + ExpressionTree optimized = pushDownNot(root); + optimized = foldMaybe(optimized); + optimized = flatten(optimized); + optimized = convertToCNF(optimized); + optimized = flatten(optimized); + int leafReorder[] = new int[leaves.size()]; + Arrays.fill(leafReorder, -1); + int newLeafCount = compactLeaves(optimized, 0, leafReorder); + optimized = rewriteLeaves(optimized, leafReorder); + ArrayList<PredicateLeaf> leafList = new ArrayList<>(newLeafCount); + // expand list to correct size + for(int i=0; i < newLeafCount; ++i) { + leafList.add(null); + } + // build the new list + for(Map.Entry<PredicateLeaf, Integer> elem: leaves.entrySet()) { + int newLoc = leafReorder[elem.getValue()]; + if (newLoc != -1) { + leafList.set(newLoc, elem.getKey()); + } + } + return new SearchArgumentImpl(optimized, leafList); + } + + /** + * Push the negations all the way to just before the leaves. Also remove + * double negatives. + * @param root the expression to normalize + * @return the normalized expression, which may share some or all of the + * nodes of the original expression. + */ + static ExpressionTree pushDownNot(ExpressionTree root) { + if (root.getOperator() == ExpressionTree.Operator.NOT) { + ExpressionTree child = root.getChildren().get(0); + switch (child.getOperator()) { + case NOT: + return pushDownNot(child.getChildren().get(0)); + case CONSTANT: + return new ExpressionTree(child.getConstant().not()); + case AND: + root = new ExpressionTree(ExpressionTree.Operator.OR); + for(ExpressionTree kid: child.getChildren()) { + root.getChildren().add(pushDownNot(new + ExpressionTree(ExpressionTree.Operator.NOT, kid))); + } + break; + case OR: + root = new ExpressionTree(ExpressionTree.Operator.AND); + for(ExpressionTree kid: child.getChildren()) { + root.getChildren().add(pushDownNot(new ExpressionTree + (ExpressionTree.Operator.NOT, kid))); + } + break; + // for leaf, we don't do anything + default: + break; + } + } else if (root.getChildren() != null) { + // iterate through children and push down not for each one + for(int i=0; i < root.getChildren().size(); ++i) { + root.getChildren().set(i, pushDownNot(root.getChildren().get(i))); + } + } + return root; + } + + /** + * Remove MAYBE values from the expression. If they are in an AND operator, + * they are dropped. If they are in an OR operator, they kill their parent. + * This assumes that pushDownNot has already been called. 
+ * @param expr The expression to clean up + * @return The cleaned up expression + */ + static ExpressionTree foldMaybe(ExpressionTree expr) { + if (expr.getChildren() != null) { + for(int i=0; i < expr.getChildren().size(); ++i) { + ExpressionTree child = foldMaybe(expr.getChildren().get(i)); + if (child.getConstant() == TruthValue.YES_NO_NULL) { + switch (expr.getOperator()) { + case AND: + expr.getChildren().remove(i); + i -= 1; + break; + case OR: + // a maybe will kill the or condition + return child; + default: + throw new IllegalStateException("Got a maybe as child of " + + expr); + } + } else { + expr.getChildren().set(i, child); + } + } + if (expr.getChildren().isEmpty()) { + return new ExpressionTree(TruthValue.YES_NO_NULL); + } + } + return expr; + } + + /** + * Converts multi-level ands and ors into single level ones. + * @param root the expression to flatten + * @return the flattened expression, which will always be root with + * potentially modified children. + */ + static ExpressionTree flatten(ExpressionTree root) { + if (root.getChildren() != null) { + // iterate through the index, so that if we add more children, + // they don't get re-visited + for(int i=0; i < root.getChildren().size(); ++i) { + ExpressionTree child = flatten(root.getChildren().get(i)); + // do we need to flatten? + if (child.getOperator() == root.getOperator() && + child.getOperator() != ExpressionTree.Operator.NOT) { + boolean first = true; + for(ExpressionTree grandkid: child.getChildren()) { + // for the first grandkid replace the original parent + if (first) { + first = false; + root.getChildren().set(i, grandkid); + } else { + root.getChildren().add(++i, grandkid); + } + } + } else { + root.getChildren().set(i, child); + } + } + // if we have a singleton AND or OR, just return the child + if ((root.getOperator() == ExpressionTree.Operator.OR || + root.getOperator() == ExpressionTree.Operator.AND) && + root.getChildren().size() == 1) { + return root.getChildren().get(0); + } + } + return root; + } + + /** + * Generate all combinations of items on the andList. For each item on the + * andList, it generates all combinations of one child from each and + * expression. Thus, (and a b) (and c d) will be expanded to: (or a c) + * (or a d) (or b c) (or b d). If there are items on the nonAndList, they + * are added to each or expression. + * @param result a list to put the results onto + * @param andList a list of and expressions + * @param nonAndList a list of non-and expressions + */ + private static void generateAllCombinations(List<ExpressionTree> result, + List<ExpressionTree> andList, + List<ExpressionTree> nonAndList + ) { + List<ExpressionTree> kids = andList.get(0).getChildren(); + if (result.isEmpty()) { + for(ExpressionTree kid: kids) { + ExpressionTree or = new ExpressionTree(ExpressionTree.Operator.OR); + result.add(or); + for(ExpressionTree node: nonAndList) { + or.getChildren().add(new ExpressionTree(node)); + } + or.getChildren().add(kid); + } + } else { + List<ExpressionTree> work = new ArrayList<ExpressionTree>(result); + result.clear(); + for(ExpressionTree kid: kids) { + for(ExpressionTree or: work) { + ExpressionTree copy = new ExpressionTree(or); + copy.getChildren().add(kid); + result.add(copy); + } + } + } + if (andList.size() > 1) { + generateAllCombinations(result, andList.subList(1, andList.size()), + nonAndList); + } + } + + /** + * Convert an expression so that the top level operator is AND with OR + * operators under it. 
This routine assumes that all of the NOT operators + * have been pushed to the leaves via pushdDownNot. + * @param root the expression + * @return the normalized expression + */ + static ExpressionTree convertToCNF(ExpressionTree root) { + if (root.getChildren() != null) { + // convert all of the children to CNF + int size = root.getChildren().size(); + for(int i=0; i < size; ++i) { + root.getChildren().set(i, convertToCNF(root.getChildren().get(i))); + } + if (root.getOperator() == ExpressionTree.Operator.OR) { + // a list of leaves that weren't under AND expressions + List<ExpressionTree> nonAndList = new ArrayList<ExpressionTree>(); + // a list of AND expressions that we need to distribute + List<ExpressionTree> andList = new ArrayList<ExpressionTree>(); + for(ExpressionTree child: root.getChildren()) { + if (child.getOperator() == ExpressionTree.Operator.AND) { + andList.add(child); + } else if (child.getOperator() == ExpressionTree.Operator.OR) { + // pull apart the kids of the OR expression + for(ExpressionTree grandkid: child.getChildren()) { + nonAndList.add(grandkid); + } + } else { + nonAndList.add(child); + } + } + if (!andList.isEmpty()) { + if (checkCombinationsThreshold(andList)) { + root = new ExpressionTree(ExpressionTree.Operator.AND); + generateAllCombinations(root.getChildren(), andList, nonAndList); + } else { + root = new ExpressionTree(TruthValue.YES_NO_NULL); + } + } + } + } + return root; + } + + private static boolean checkCombinationsThreshold(List<ExpressionTree> andList) { + int numComb = 1; + for (ExpressionTree tree : andList) { + numComb *= tree.getChildren().size(); + if (numComb > CNF_COMBINATIONS_THRESHOLD) { + return false; + } + } + return true; + } + + } +} http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/storage-api/src/java/org/apache/hadoop/hive/ql/util/JavaDataModel.java ---------------------------------------------------------------------- diff --git a/java/storage-api/src/java/org/apache/hadoop/hive/ql/util/JavaDataModel.java b/java/storage-api/src/java/org/apache/hadoop/hive/ql/util/JavaDataModel.java new file mode 100644 index 0000000..151f30d --- /dev/null +++ b/java/storage-api/src/java/org/apache/hadoop/hive/ql/util/JavaDataModel.java @@ -0,0 +1,335 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.hive.ql.util; + +/** + * Estimation of memory footprint of object + */ +public enum JavaDataModel { + + JAVA32 { + @Override + public int object() { + return JAVA32_OBJECT; + } + + @Override + public int array() { + return JAVA32_ARRAY; + } + + @Override + public int ref() { + return JAVA32_REF; + } + + @Override + public int hashMap(int entry) { + // base = JAVA32_OBJECT + PRIMITIVES1 * 4 + JAVA32_FIELDREF * 3 + JAVA32_ARRAY; + // entry = JAVA32_OBJECT + JAVA32_FIELDREF + PRIMITIVES1 + return hashMapBase() + hashMapEntry() * entry; + } + + @Override + public int hashMapBase() { + return 64; + } + + @Override + public int hashMapEntry() { + return 24; + } + + @Override + public int hashSet(int entry) { + // hashMap += JAVA32_OBJECT + return hashSetBase() + hashSetEntry() * entry; + } + + @Override + public int hashSetBase() { + return 80; + } + + @Override + public int hashSetEntry() { + return 24; + } + + @Override + public int linkedHashMap(int entry) { + // hashMap += JAVA32_FIELDREF + PRIMITIVES1 + // hashMap.entry += JAVA32_FIELDREF * 2 + return 72 + 32 * entry; + } + + @Override + public int linkedList(int entry) { + // base = JAVA32_OBJECT + PRIMITIVES1 * 2 + JAVA32_FIELDREF; + // entry = JAVA32_OBJECT + JAVA32_FIELDREF * 2 + return linkedListBase() + linkedListEntry() * entry; + } + + @Override + public int linkedListBase() { + return 28; + } + + @Override + public int linkedListEntry() { + return 24; + } + + @Override + public int arrayList() { + // JAVA32_OBJECT + PRIMITIVES1 * 2 + JAVA32_ARRAY; + return 44; + } + + @Override + public int memoryAlign() { + return 8; + } + }, JAVA64 { + @Override + public int object() { + return JAVA64_OBJECT; + } + + @Override + public int array() { + return JAVA64_ARRAY; + } + + @Override + public int ref() { + return JAVA64_REF; + } + + @Override + public int hashMap(int entry) { + // base = JAVA64_OBJECT + PRIMITIVES1 * 4 + JAVA64_FIELDREF * 3 + JAVA64_ARRAY; + // entry = JAVA64_OBJECT + JAVA64_FIELDREF + PRIMITIVES1 + return hashMapBase() + hashMapEntry() * entry; + } + + @Override + public int hashMapBase() { + return 112; + } + + + @Override + public int hashMapEntry() { + return 44; + } + + @Override + public int hashSet(int entry) { + // hashMap += JAVA64_OBJECT + return hashSetBase() + hashSetEntry() * entry; + } + + @Override + public int hashSetBase() { + return 144; + } + + @Override + public int hashSetEntry() { + return 44; + } + + @Override + public int linkedHashMap(int entry) { + // hashMap += JAVA64_FIELDREF + PRIMITIVES1 + // hashMap.entry += JAVA64_FIELDREF * 2 + return 128 + 60 * entry; + } + + @Override + public int linkedList(int entry) { + // base = JAVA64_OBJECT + PRIMITIVES1 * 2 + JAVA64_FIELDREF; + // entry = JAVA64_OBJECT + JAVA64_FIELDREF * 2 + return linkedListBase() + linkedListEntry() * entry; + } + + @Override + public int linkedListBase() { + return 48; + } + + @Override + public int linkedListEntry() { + return 48; + } + + @Override + public int arrayList() { + // JAVA64_OBJECT + PRIMITIVES1 * 2 + JAVA64_ARRAY; + return 80; + } + + @Override + public int memoryAlign() { + return 8; + } + }; + + public abstract int object(); + public abstract int array(); + public abstract int ref(); + public abstract int hashMap(int entry); + public abstract int hashMapBase(); + public abstract int hashMapEntry(); + public abstract int hashSetBase(); + public abstract int hashSetEntry(); + public abstract int hashSet(int entry); + public abstract int linkedHashMap(int entry); + public abstract 
int linkedListBase(); + public abstract int linkedListEntry(); + public abstract int linkedList(int entry); + public abstract int arrayList(); + public abstract int memoryAlign(); + + // ascii string + public int lengthFor(String string) { + return lengthForStringOfLength(string.length()); + } + + public int lengthForRandom() { + // boolean + double + AtomicLong + return object() + primitive1() + primitive2() + object() + primitive2(); + } + + public int primitive1() { + return PRIMITIVES1; + } + public int primitive2() { + return PRIMITIVES2; + } + + public static int alignUp(int value, int align) { + return (value + align - 1) & ~(align - 1); + } + + public static final int JAVA32_META = 12; + public static final int JAVA32_ARRAY_META = 16; + public static final int JAVA32_REF = 4; + public static final int JAVA32_OBJECT = 16; // JAVA32_META + JAVA32_REF + public static final int JAVA32_ARRAY = 20; // JAVA32_ARRAY_META + JAVA32_REF + + public static final int JAVA64_META = 24; + public static final int JAVA64_ARRAY_META = 32; + public static final int JAVA64_REF = 8; + public static final int JAVA64_OBJECT = 32; // JAVA64_META + JAVA64_REF + public static final int JAVA64_ARRAY = 40; // JAVA64_ARRAY_META + JAVA64_REF + + public static final int PRIMITIVES1 = 4; // void, boolean, byte, short, int, float + public static final int PRIMITIVES2 = 8; // long, double + + public static final int PRIMITIVE_BYTE = 1; // byte + + private static JavaDataModel current; + + public static JavaDataModel get() { + if (current != null) { + return current; + } + try { + String props = System.getProperty("sun.arch.data.model"); + if ("32".equals(props)) { + return current = JAVA32; + } + } catch (Exception e) { + // ignore + } + // TODO: separate model is needed for compressedOops, which can be guessed from memory size. 
+ return current = JAVA64; + } + + public static int round(int size) { + JavaDataModel model = get(); + if (model == JAVA32 || size % 8 == 0) { + return size; + } + return ((size + 8) >> 3) << 3; + } + + private int lengthForPrimitiveArrayOfSize(int primitiveSize, int length) { + return alignUp(array() + primitiveSize*length, memoryAlign()); + } + + public int lengthForByteArrayOfSize(int length) { + return lengthForPrimitiveArrayOfSize(PRIMITIVE_BYTE, length); + } + public int lengthForObjectArrayOfSize(int length) { + return lengthForPrimitiveArrayOfSize(ref(), length); + } + public int lengthForLongArrayOfSize(int length) { + return lengthForPrimitiveArrayOfSize(primitive2(), length); + } + public int lengthForDoubleArrayOfSize(int length) { + return lengthForPrimitiveArrayOfSize(primitive2(), length); + } + public int lengthForIntArrayOfSize(int length) { + return lengthForPrimitiveArrayOfSize(primitive1(), length); + } + public int lengthForBooleanArrayOfSize(int length) { + return lengthForPrimitiveArrayOfSize(PRIMITIVE_BYTE, length); + } + public int lengthForTimestampArrayOfSize(int length) { + return lengthForPrimitiveArrayOfSize(lengthOfTimestamp(), length); + } + public int lengthForDateArrayOfSize(int length) { + return lengthForPrimitiveArrayOfSize(lengthOfDate(), length); + } + public int lengthForDecimalArrayOfSize(int length) { + return lengthForPrimitiveArrayOfSize(lengthOfDecimal(), length); + } + + public int lengthOfDecimal() { + // object overhead + 8 bytes for intCompact + 4 bytes for precision + // + 4 bytes for scale + size of BigInteger + return object() + 2 * primitive2() + lengthOfBigInteger(); + } + + private int lengthOfBigInteger() { + // object overhead + 4 bytes for bitCount + 4 bytes for bitLength + // + 4 bytes for firstNonzeroByteNum + 4 bytes for firstNonzeroIntNum + + // + 4 bytes for lowestSetBit + 5 bytes for size of magnitude (since max precision + // is only 38 for HiveDecimal) + 7 bytes of padding (since java memory allocations + // are 8 byte aligned) + return object() + 4 * primitive2(); + } + + public int lengthOfTimestamp() { + // object overhead + 4 bytes for int (nanos) + 4 bytes of padding + return object() + primitive2(); + } + + public int lengthOfDate() { + // object overhead + 8 bytes for long (fastTime) + 16 bytes for cdate + return object() + 3 * primitive2(); + } + + public int lengthForStringOfLength(int strLen) { + return object() + primitive1() * 3 + array() + strLen; + } +} http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/storage-api/src/java/org/apache/hadoop/hive/ql/util/TimestampUtils.java ---------------------------------------------------------------------- diff --git a/java/storage-api/src/java/org/apache/hadoop/hive/ql/util/TimestampUtils.java b/java/storage-api/src/java/org/apache/hadoop/hive/ql/util/TimestampUtils.java new file mode 100644 index 0000000..189ead5 --- /dev/null +++ b/java/storage-api/src/java/org/apache/hadoop/hive/ql/util/TimestampUtils.java @@ -0,0 +1,94 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/storage-api/src/java/org/apache/hadoop/hive/ql/util/TimestampUtils.java
----------------------------------------------------------------------
diff --git a/java/storage-api/src/java/org/apache/hadoop/hive/ql/util/TimestampUtils.java b/java/storage-api/src/java/org/apache/hadoop/hive/ql/util/TimestampUtils.java
new file mode 100644
index 0000000..189ead5
--- /dev/null
+++ b/java/storage-api/src/java/org/apache/hadoop/hive/ql/util/TimestampUtils.java
@@ -0,0 +1,94 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.util;
+
+import org.apache.hadoop.hive.common.type.HiveDecimal;
+
+import java.math.BigDecimal;
+import java.sql.Timestamp;
+
+/**
+ * Utilities for Timestamps and the relevant conversions.
+ */
+public class TimestampUtils {
+  public static final BigDecimal BILLION_BIG_DECIMAL = BigDecimal.valueOf(1000000000);
+
+  /**
+   * Convert the timestamp to a double measured in seconds.
+   * @return double representation of the timestamp, accurate to nanoseconds
+   */
+  public static double getDouble(Timestamp ts) {
+    long seconds = millisToSeconds(ts.getTime());
+    return seconds + ((double) ts.getNanos()) / 1000000000;
+  }
+
+  public static Timestamp doubleToTimestamp(double f) {
+    long seconds = (long) f;
+
+    // We must preserve the exactness of the double's fractional portion.
+    // A fractional part of 0.6 would otherwise become 0.59999..., which both
+    // loses accuracy and significantly reduces the savings from binary
+    // serialization.
+    BigDecimal bd;
+    try {
+      bd = new BigDecimal(String.valueOf(f));
+    } catch (NumberFormatException nfe) {
+      return null;
+    }
+    bd = bd.subtract(new BigDecimal(seconds)).multiply(new BigDecimal(1000000000));
+    int nanos = bd.intValue();
+
+    // Convert to millis
+    long millis = seconds * 1000;
+    if (nanos < 0) {
+      millis -= 1000;
+      nanos += 1000000000;
+    }
+    Timestamp t = new Timestamp(millis);
+
+    // Set remaining fractional portion to nanos
+    t.setNanos(nanos);
+    return t;
+  }
+
+  public static Timestamp decimalToTimestamp(HiveDecimal d) {
+    BigDecimal nanoInstant = d.bigDecimalValue().multiply(BILLION_BIG_DECIMAL);
+    int nanos = nanoInstant.remainder(BILLION_BIG_DECIMAL).intValue();
+    if (nanos < 0) {
+      nanos += 1000000000;
+    }
+    long seconds =
+        nanoInstant.subtract(new BigDecimal(nanos)).divide(BILLION_BIG_DECIMAL).longValue();
+    Timestamp t = new Timestamp(seconds * 1000);
+    t.setNanos(nanos);
+
+    return t;
+  }
+
+  /**
+   * Rounds the number of milliseconds relative to the epoch down to the nearest whole number of
+   * seconds. 500 would round to 0, -500 would round to -1.
+   */
+  public static long millisToSeconds(long millis) {
+    if (millis >= 0) {
+      return millis / 1000;
+    } else {
+      return (millis - 999) / 1000;
+    }
+  }
+}
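The round trip below (again an illustrative sketch, not part of the commit) shows the seconds/nanoseconds split these conversions preserve, including the floor behavior of millisToSeconds for pre-epoch values:

    import java.sql.Timestamp;
    import org.apache.hadoop.hive.ql.util.TimestampUtils;

    // Hypothetical demo class, not in the patch.
    public class TimestampDemo {
      public static void main(String[] args) {
        Timestamp ts = new Timestamp(1500L);              // 1.5s past the epoch
        double d = TimestampUtils.getDouble(ts);          // 1.5
        Timestamp back = TimestampUtils.doubleToTimestamp(d);
        System.out.println(back.getTime());               // 1500
        System.out.println(TimestampUtils.millisToSeconds(-500)); // -1, not 0
      }
    }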
http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/storage-api/src/java/org/apache/hadoop/hive/serde2/io/DateWritable.java
----------------------------------------------------------------------
diff --git a/java/storage-api/src/java/org/apache/hadoop/hive/serde2/io/DateWritable.java b/java/storage-api/src/java/org/apache/hadoop/hive/serde2/io/DateWritable.java
new file mode 100644
index 0000000..dd2b1d9
--- /dev/null
+++ b/java/storage-api/src/java/org/apache/hadoop/hive/serde2/io/DateWritable.java
@@ -0,0 +1,179 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.serde2.io;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.sql.Date;
+import java.util.Calendar;
+import java.util.TimeZone;
+import java.util.concurrent.TimeUnit;
+
+import org.apache.hadoop.io.WritableComparable;
+import org.apache.hadoop.io.WritableUtils;
+
+/**
+ * DateWritable
+ * Writable equivalent of java.sql.Date.
+ *
+ * Dates are of the format YYYY-MM-DD.
+ */
+public class DateWritable implements WritableComparable<DateWritable> {
+
+  private static final long MILLIS_PER_DAY = TimeUnit.DAYS.toMillis(1);
+
+  // Local time zone.
+  // Java TimeZone makes no thread-safety guarantees, so use a thread-local instance to be safe.
+  private static final ThreadLocal<TimeZone> LOCAL_TIMEZONE = new ThreadLocal<TimeZone>() {
+    @Override
+    protected TimeZone initialValue() {
+      return Calendar.getInstance().getTimeZone();
+    }
+  };
+
+  // Internal representation is an integer representing day offset from our epoch value 1970-01-01
+  private int daysSinceEpoch = 0;
+
+  /* Constructors */
+  public DateWritable() {
+  }
+
+  public DateWritable(DateWritable d) {
+    set(d);
+  }
+
+  public DateWritable(Date d) {
+    set(d);
+  }
+
+  public DateWritable(int d) {
+    set(d);
+  }
+
+  /**
+   * Set the DateWritable based on the days since epoch date.
+   * @param d integer value representing days since epoch date
+   */
+  public void set(int d) {
+    daysSinceEpoch = d;
+  }
+
+  /**
+   * Set the DateWritable based on the year/month/day of the date in the local timezone.
+   * @param d Date value
+   */
+  public void set(Date d) {
+    if (d == null) {
+      daysSinceEpoch = 0;
+      return;
+    }
+
+    set(dateToDays(d));
+  }
+
+  public void set(DateWritable d) {
+    set(d.daysSinceEpoch);
+  }
+
+  /**
+   * @return Date value corresponding to the date in the local time zone
+   */
+  public Date get() {
+    return new Date(daysToMillis(daysSinceEpoch));
+  }
+
+  public int getDays() {
+    return daysSinceEpoch;
+  }
+
+  /**
+   * @return time in seconds corresponding to this DateWritable
+   */
+  public long getTimeInSeconds() {
+    return get().getTime() / 1000;
+  }
+
+  // Note: the argument is in seconds since the epoch, not milliseconds.
+  public static Date timeToDate(long l) {
+    return new Date(l * 1000);
+  }
+
+  public static long daysToMillis(int d) {
+    // Convert from day offset to ms in UTC, then apply local timezone offset.
+    long millisUtc = d * MILLIS_PER_DAY;
+    long tmp = millisUtc - LOCAL_TIMEZONE.get().getOffset(millisUtc);
+    // Between millisUtc and tmp, the time zone offset may have changed due to DST.
+    // Look up the offset again.
+    return millisUtc - LOCAL_TIMEZONE.get().getOffset(tmp);
+  }
+
+  public static int millisToDays(long millisLocal) {
+    long millisUtc = millisLocal + LOCAL_TIMEZONE.get().getOffset(millisLocal);
+    int days;
+    if (millisUtc >= 0L) {
+      days = (int) (millisUtc / MILLIS_PER_DAY);
+    } else {
+      // 86399999 is MILLIS_PER_DAY - 1: subtracting it makes the integer
+      // division round negative values down to the previous day rather than
+      // toward zero.
+      days = (int) ((millisUtc - 86399999) / MILLIS_PER_DAY);
+    }
+    return days;
+  }
+
+  public static int dateToDays(Date d) {
+    // convert to equivalent time in UTC, then get day offset
+    long millisLocal = d.getTime();
+    return millisToDays(millisLocal);
+  }
+
+  @Override
+  public void readFields(DataInput in) throws IOException {
+    daysSinceEpoch = WritableUtils.readVInt(in);
+  }
+
+  @Override
+  public void write(DataOutput out) throws IOException {
+    WritableUtils.writeVInt(out, daysSinceEpoch);
+  }
+
+  @Override
+  public int compareTo(DateWritable d) {
+    return daysSinceEpoch - d.daysSinceEpoch;
+  }
+
+  @Override
+  public boolean equals(Object o) {
+    if (!(o instanceof DateWritable)) {
+      return false;
+    }
+    return compareTo((DateWritable) o) == 0;
+  }
+
+  @Override
+  public String toString() {
+    return get().toString();
+  }
+
+  @Override
+  public int hashCode() {
+    return daysSinceEpoch;
+  }
+}
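A short sketch of the day-offset representation (illustrative only, not part of the commit):

    import java.sql.Date;
    import org.apache.hadoop.hive.serde2.io.DateWritable;

    // Hypothetical demo class, not in the patch.
    public class DateDemo {
      public static void main(String[] args) {
        // Day 0 is 1970-01-01; only this int is serialized by write().
        DateWritable dw = new DateWritable(Date.valueOf("1970-01-11"));
        System.out.println(dw.getDays());   // 10
        System.out.println(dw);             // 1970-01-11, via get().toString()
      }
    }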
http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/storage-api/src/java/org/apache/hadoop/hive/serde2/io/HiveDecimalWritable.java
----------------------------------------------------------------------
diff --git a/java/storage-api/src/java/org/apache/hadoop/hive/serde2/io/HiveDecimalWritable.java b/java/storage-api/src/java/org/apache/hadoop/hive/serde2/io/HiveDecimalWritable.java
new file mode 100644
index 0000000..41452da
--- /dev/null
+++ b/java/storage-api/src/java/org/apache/hadoop/hive/serde2/io/HiveDecimalWritable.java
@@ -0,0 +1,170 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.serde2.io;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.math.BigInteger;
+
+import org.apache.hadoop.hive.common.type.HiveDecimal;
+
+import org.apache.hadoop.io.WritableComparable;
+import org.apache.hadoop.io.WritableUtils;
+
+public class HiveDecimalWritable implements WritableComparable<HiveDecimalWritable> {
+
+  private byte[] internalStorage = new byte[0];
+  private int scale;
+
+  public HiveDecimalWritable() {
+  }
+
+  public HiveDecimalWritable(String value) {
+    set(HiveDecimal.create(value));
+  }
+
+  public HiveDecimalWritable(byte[] bytes, int scale) {
+    set(bytes, scale);
+  }
+
+  public HiveDecimalWritable(HiveDecimalWritable writable) {
+    set(writable.getHiveDecimal());
+  }
+
+  public HiveDecimalWritable(HiveDecimal value) {
+    set(value);
+  }
+
+  public HiveDecimalWritable(long value) {
+    set(HiveDecimal.create(value));
+  }
+
+  public void set(HiveDecimal value) {
+    set(value.unscaledValue().toByteArray(), value.scale());
+  }
+
+  public void set(HiveDecimal value, int maxPrecision, int maxScale) {
+    set(HiveDecimal.enforcePrecisionScale(value, maxPrecision, maxScale));
+  }
+
+  public void set(HiveDecimalWritable writable) {
+    set(writable.getHiveDecimal());
+  }
+
+  public void set(byte[] bytes, int scale) {
+    this.internalStorage = bytes;
+    this.scale = scale;
+  }
+
+  public HiveDecimal getHiveDecimal() {
+    return HiveDecimal.create(new BigInteger(internalStorage), scale);
+  }
+
+  /**
+   * Get a HiveDecimal instance from the writable and constrain it to a maximum precision/scale.
+   *
+   * @param maxPrecision maximum precision
+   * @param maxScale maximum scale
+   * @return HiveDecimal instance
+   */
+  public HiveDecimal getHiveDecimal(int maxPrecision, int maxScale) {
+    return HiveDecimal.enforcePrecisionScale(
+        HiveDecimal.create(new BigInteger(internalStorage), scale),
+        maxPrecision, maxScale);
+  }
+
+  @Override
+  public void readFields(DataInput in) throws IOException {
+    scale = WritableUtils.readVInt(in);
+    int byteArrayLen = WritableUtils.readVInt(in);
+    if (internalStorage.length != byteArrayLen) {
+      internalStorage = new byte[byteArrayLen];
+    }
+    in.readFully(internalStorage);
+  }
+
+  @Override
+  public void write(DataOutput out) throws IOException {
+    WritableUtils.writeVInt(out, scale);
+    WritableUtils.writeVInt(out, internalStorage.length);
+    out.write(internalStorage);
+  }
+
+  @Override
+  public int compareTo(HiveDecimalWritable that) {
+    return getHiveDecimal().compareTo(that.getHiveDecimal());
+  }
+
+  @Override
+  public String toString() {
+    return getHiveDecimal().toString();
+  }
+
+  @Override
+  public boolean equals(Object other) {
+    if (this == other) {
+      return true;
+    }
+    if (other == null || getClass() != other.getClass()) {
+      return false;
+    }
+    HiveDecimalWritable bdw = (HiveDecimalWritable) other;
+
+    // 'equals' and 'compareTo' are not interchangeable for HiveDecimal:
+    // compareTo returns 0 iff the numbers are equal in value (e.g. 3.14 is
+    // the same as 3.140), whereas HiveDecimal.equals also requires the scales
+    // to match (3.14 is then not the same as 3.140). Value equality is what
+    // we want here.
+    return getHiveDecimal().compareTo(bdw.getHiveDecimal()) == 0;
+  }
+
+  @Override
+  public int hashCode() {
+    return getHiveDecimal().hashCode();
+  }
+
+  /* (non-Javadoc)
+   * In order to update a Decimal128 fast (w/o allocation) we need to expose access to the
+   * internal storage bytes and scale.
+   */
+  public byte[] getInternalStorage() {
+    return internalStorage;
+  }
+
+  /* (non-Javadoc)
+   * In order to update a Decimal128 fast (w/o allocation) we need to expose access to the
+   * internal storage bytes and scale.
+   */
+  public int getScale() {
+    return scale;
+  }
+
+  public static HiveDecimalWritable enforcePrecisionScale(HiveDecimalWritable writable,
+                                                          int precision, int scale) {
+    if (writable == null) {
+      return null;
+    }
+
+    HiveDecimal dec =
+        HiveDecimal.enforcePrecisionScale(writable.getHiveDecimal(), precision, scale);
+    return dec == null ? null : new HiveDecimalWritable(dec);
+  }
+}
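Finally, a sketch of the value-based equality and the precision/scale enforcement described above (illustrative only, not part of the commit; the rounding applied by HiveDecimal.enforcePrecisionScale when reducing scale is assumed to be the usual half-up):

    import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;

    // Hypothetical demo class, not in the patch.
    public class DecimalDemo {
      public static void main(String[] args) {
        HiveDecimalWritable a = new HiveDecimalWritable("3.14");
        HiveDecimalWritable b = new HiveDecimalWritable("3.140");
        // equals() delegates to compareTo, so trailing zeros do not matter.
        System.out.println(a.equals(b));    // true
        // Reducing the maximum scale rounds the value on the way in.
        System.out.println(HiveDecimalWritable.enforcePrecisionScale(b, 3, 1)); // 3.1
      }
    }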
