HIVE-12354 : MapJoin with double keys is slow on MR (Sergey Shelukhin, reviewed by Prasanth Jayachandran)
Project: http://git-wip-us.apache.org/repos/asf/hive/repo Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/41b60c44 Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/41b60c44 Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/41b60c44 Branch: refs/heads/spark Commit: 41b60c44401d92787227b5cdf2a51c20d28a2bc4 Parents: 08e9d26 Author: Sergey Shelukhin <[email protected]> Authored: Mon Nov 9 16:32:31 2015 -0800 Committer: Sergey Shelukhin <[email protected]> Committed: Mon Nov 9 16:32:31 2015 -0800 ---------------------------------------------------------------------- .../hadoop/hive/ql/exec/KeyWrapperFactory.java | 20 +-------------- .../ql/exec/persistence/MapJoinKeyObject.java | 6 ++--- .../objectinspector/ObjectInspectorUtils.java | 26 ++++++++++++++++++++ 3 files changed, 29 insertions(+), 23 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hive/blob/41b60c44/ql/src/java/org/apache/hadoop/hive/ql/exec/KeyWrapperFactory.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/KeyWrapperFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/KeyWrapperFactory.java index 1c409a2..5154a5f 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/KeyWrapperFactory.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/KeyWrapperFactory.java @@ -105,25 +105,7 @@ public class KeyWrapperFactory { @Override public void setHashKey() { - if (keys == null) { - hashcode = 0; - } else { - hashcode = 1; - for (Object element : keys) { - hashcode = 31 * hashcode; - if(element != null) { - if(element instanceof LazyDouble) { - long v = Double.doubleToLongBits(((LazyDouble)element).getWritableObject().get()); - hashcode = hashcode + (int) (v ^ (v >>> 32)); - } else if (element instanceof DoubleWritable){ - long v = Double.doubleToLongBits(((DoubleWritable)element).get()); - hashcode = hashcode + (int) (v ^ (v >>> 32)); - } else { - hashcode = hashcode + element.hashCode(); - } - } - } - } + hashcode = ObjectInspectorUtils.writableArrayHashCode(keys); } @Override http://git-wip-us.apache.org/repos/asf/hive/blob/41b60c44/ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/MapJoinKeyObject.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/MapJoinKeyObject.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/MapJoinKeyObject.java index e1fd6d3..7592f9e 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/MapJoinKeyObject.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/MapJoinKeyObject.java @@ -78,11 +78,9 @@ public class MapJoinKeyObject extends MapJoinKey { @Override public int hashCode() { - final int prime = 31; - int result = 1; - result = prime * result + Arrays.hashCode(key); - return result; + return ObjectInspectorUtils.writableArrayHashCode(key); } + @Override public boolean equals(Object obj) { if (this == obj) http://git-wip-us.apache.org/repos/asf/hive/blob/41b60c44/serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/ObjectInspectorUtils.java ---------------------------------------------------------------------- diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/ObjectInspectorUtils.java b/serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/ObjectInspectorUtils.java index 56597a2..7a13eb0 100644 --- a/serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/ObjectInspectorUtils.java +++ b/serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/ObjectInspectorUtils.java @@ -38,6 +38,7 @@ import org.apache.hadoop.hive.serde2.io.HiveIntervalDayTimeWritable; import org.apache.hadoop.hive.serde2.io.HiveIntervalYearMonthWritable; import org.apache.hadoop.hive.serde2.io.HiveVarcharWritable; import org.apache.hadoop.hive.serde2.io.TimestampWritable; +import org.apache.hadoop.hive.serde2.lazy.LazyDouble; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory.ObjectInspectorOptions; import org.apache.hadoop.hive.serde2.objectinspector.primitive.AbstractPrimitiveWritableObjectInspector; @@ -77,6 +78,7 @@ import org.apache.hadoop.hive.serde2.objectinspector.primitive.TimestampObjectIn import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableStringObjectInspector; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; import org.apache.hadoop.io.BytesWritable; +import org.apache.hadoop.io.DoubleWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.util.StringUtils; @@ -104,6 +106,30 @@ public final class ObjectInspectorUtils { } /** + * Calculates the hash code for array of Objects that contains writables. This is used + * to work around the buggy Hadoop DoubleWritable hashCode implementation. This should + * only be used for process-local hash codes; don't replace stored hash codes like bucketing. + */ + public static int writableArrayHashCode(Object[] keys) { + if (keys == null) return 0; + int hashcode = 1; + for (Object element : keys) { + hashcode = 31 * hashcode; + if (element == null) continue; + if (element instanceof LazyDouble) { + long v = Double.doubleToLongBits(((LazyDouble)element).getWritableObject().get()); + hashcode = hashcode + (int) (v ^ (v >>> 32)); + } else if (element instanceof DoubleWritable){ + long v = Double.doubleToLongBits(((DoubleWritable)element).get()); + hashcode = hashcode + (int) (v ^ (v >>> 32)); + } else { + hashcode = hashcode + element.hashCode(); + } + } + return hashcode; + } + + /** * Ensures that an ObjectInspector is Writable. */ public static ObjectInspector getWritableObjectInspector(ObjectInspector oi) {
