This is an automated email from the ASF dual-hosted git repository.

luochen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/asterixdb.git


The following commit(s) were added to refs/heads/master by this push:
     new 3f75e5b  [ASTERIXDB-2783] Fix hash collision for hash join/groupby
3f75e5b is described below

commit 3f75e5b627493eae80c89f8eaa0f075fb31ab3af
Author: luochen <[email protected]>
AuthorDate: Sat Sep 26 13:19:38 2020 -0700

    [ASTERIXDB-2783] Fix hash collision for hash join/groupby
    
    - user model changes: no
    - storage format changes: no
    - interface changes: no
    
    Details:
    - Use a fixed, randomly chosen seed (instead of 0) for hash join/groupby
    to avoid hash collisions with the hash partitioning
    - Increase the join memory in the cluster configuration files (from 256KB
    to 768KB/1024KB) so that the large object join test case can still pass.
    
    Change-Id: If2aa02384129293e80015efc3d1f60b57f98909c
    Reviewed-on: https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/8123
    Integration-Tests: Jenkins <[email protected]>
    Tested-by: Jenkins <[email protected]>
    Reviewed-by: Dmitry Lychagin <[email protected]>
---
 asterixdb/asterix-app/src/main/resources/cc.conf              |  2 +-
 asterixdb/asterix-app/src/main/resources/cc3.conf             |  2 +-
 asterixdb/asterix-app/src/test/resources/cc-compression.conf  |  2 +-
 asterixdb/asterix-app/src/test/resources/cc-ssl.conf          |  2 +-
 .../results/api/cluster_state_1/cluster_state_1.1.regexadm    |  2 +-
 .../api/cluster_state_1_less/cluster_state_1_less.1.regexadm  |  2 +-
 .../external/ExternalGroupBuildOperatorNodePushable.java      |  7 ++++++-
 .../std/join/OptimizedHybridHashJoinOperatorDescriptor.java   | 11 +++++++++--
 8 files changed, 21 insertions(+), 9 deletions(-)

diff --git a/asterixdb/asterix-app/src/main/resources/cc.conf 
b/asterixdb/asterix-app/src/main/resources/cc.conf
index ccd35f8..d5da6d4 100644
--- a/asterixdb/asterix-app/src/main/resources/cc.conf
+++ b/asterixdb/asterix-app/src/main/resources/cc.conf
@@ -55,7 +55,7 @@ log.level = INFO
 compiler.framesize=32KB
 compiler.sortmemory=320KB
 compiler.groupmemory=160KB
-compiler.joinmemory=256KB
+compiler.joinmemory=768KB
 compiler.textsearchmemory=160KB
 compiler.windowmemory=192KB
 compiler.sort.parallel=false
diff --git a/asterixdb/asterix-app/src/main/resources/cc3.conf 
b/asterixdb/asterix-app/src/main/resources/cc3.conf
index 88362aa..d2a8556 100644
--- a/asterixdb/asterix-app/src/main/resources/cc3.conf
+++ b/asterixdb/asterix-app/src/main/resources/cc3.conf
@@ -51,7 +51,7 @@ log.level = WARN
 compiler.framesize=32KB
 compiler.sortmemory=320KB
 compiler.groupmemory=160KB
-compiler.joinmemory=256KB
+compiler.joinmemory=1024KB
 compiler.textsearchmemory=160KB
 compiler.windowmemory=192KB
 compiler.parallelism=3
diff --git a/asterixdb/asterix-app/src/test/resources/cc-compression.conf 
b/asterixdb/asterix-app/src/test/resources/cc-compression.conf
index c8d9780..a3047a0 100644
--- a/asterixdb/asterix-app/src/test/resources/cc-compression.conf
+++ b/asterixdb/asterix-app/src/test/resources/cc-compression.conf
@@ -50,7 +50,7 @@ log.level = INFO
 compiler.framesize=32KB
 compiler.sortmemory=320KB
 compiler.groupmemory=160KB
-compiler.joinmemory=256KB
+compiler.joinmemory=768KB
 compiler.textsearchmemory=160KB
 compiler.windowmemory=192KB
 messaging.frame.size=4096
diff --git a/asterixdb/asterix-app/src/test/resources/cc-ssl.conf 
b/asterixdb/asterix-app/src/test/resources/cc-ssl.conf
index 499e9fc..1c0a68f 100644
--- a/asterixdb/asterix-app/src/test/resources/cc-ssl.conf
+++ b/asterixdb/asterix-app/src/test/resources/cc-ssl.conf
@@ -62,7 +62,7 @@ log.level = INFO
 compiler.framesize=32KB
 compiler.sortmemory=320KB
 compiler.groupmemory=160KB
-compiler.joinmemory=256KB
+compiler.joinmemory=768KB
 compiler.textsearchmemory=160KB
 compiler.windowmemory=192KB
 messaging.frame.size=4096
diff --git 
a/asterixdb/asterix-app/src/test/resources/runtimets/results/api/cluster_state_1/cluster_state_1.1.regexadm
 
b/asterixdb/asterix-app/src/test/resources/runtimets/results/api/cluster_state_1/cluster_state_1.1.regexadm
index e377fd1..cc96921 100644
--- 
a/asterixdb/asterix-app/src/test/resources/runtimets/results/api/cluster_state_1/cluster_state_1.1.regexadm
+++ 
b/asterixdb/asterix-app/src/test/resources/runtimets/results/api/cluster_state_1/cluster_state_1.1.regexadm
@@ -12,7 +12,7 @@
     "compiler\.groupmemory" : 163840,
     "compiler\.indexonly" : true,
     "compiler\.internal\.sanitycheck" : true,
-    "compiler\.joinmemory" : 262144,
+    "compiler\.joinmemory" : 786432,
     "compiler\.parallelism" : 0,
     "compiler\.sort\.parallel" : false,
     "compiler\.sort\.samples" : 100,
diff --git 
a/asterixdb/asterix-app/src/test/resources/runtimets/results/api/cluster_state_1_less/cluster_state_1_less.1.regexadm
 
b/asterixdb/asterix-app/src/test/resources/runtimets/results/api/cluster_state_1_less/cluster_state_1_less.1.regexadm
index e51d12f..68d3079 100644
--- 
a/asterixdb/asterix-app/src/test/resources/runtimets/results/api/cluster_state_1_less/cluster_state_1_less.1.regexadm
+++ 
b/asterixdb/asterix-app/src/test/resources/runtimets/results/api/cluster_state_1_less/cluster_state_1_less.1.regexadm
@@ -12,7 +12,7 @@
     "compiler\.groupmemory" : 163840,
     "compiler\.indexonly" : true,
     "compiler\.internal\.sanitycheck" : false,
-    "compiler\.joinmemory" : 262144,
+    "compiler\.joinmemory" : 1048576,
     "compiler\.parallelism" : 3,
     "compiler\.sort\.parallel" : true,
     "compiler\.sort\.samples" : 100,
diff --git 
a/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/main/java/org/apache/hyracks/dataflow/std/group/external/ExternalGroupBuildOperatorNodePushable.java
 
b/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/main/java/org/apache/hyracks/dataflow/std/group/external/ExternalGroupBuildOperatorNodePushable.java
index 43f57af..20d223e 100644
--- 
a/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/main/java/org/apache/hyracks/dataflow/std/group/external/ExternalGroupBuildOperatorNodePushable.java
+++ 
b/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/main/java/org/apache/hyracks/dataflow/std/group/external/ExternalGroupBuildOperatorNodePushable.java
@@ -38,6 +38,11 @@ import org.apache.logging.log4j.Logger;
 
 public class ExternalGroupBuildOperatorNodePushable extends 
AbstractUnaryInputSinkOperatorNodePushable
         implements IRunFileWriterGenerator {
+    /**
+     * Use a random seed to avoid hash collision with the hash exchange operator.
+     * See https://issues.apache.org/jira/browse/ASTERIXDB-2783 for more details.
+     */
+    private static final int INIT_SEED = 573275022;
 
     private static final Logger LOGGER = LogManager.getLogger();
     private final IHyracksTaskContext ctx;
@@ -85,7 +90,7 @@ public class ExternalGroupBuildOperatorNodePushable extends 
AbstractUnaryInputSi
         state = new ExternalGroupState(ctx.getJobletContext().getJobId(), 
stateId);
         ISpillableTable table = spillableTableFactory.buildSpillableTable(ctx, 
tableSize, fileSize, keyFields,
                 comparators, firstNormalizerComputer, aggregatorFactory, 
inRecordDescriptor, outRecordDescriptor,
-                framesLimit, 0);
+                framesLimit, INIT_SEED);
         RunFileWriter[] runFileWriters = new 
RunFileWriter[table.getNumPartitions()];
         this.externalGroupBy = new ExternalHashGroupBy(this, table, 
runFileWriters, inRecordDescriptor);
 
diff --git 
a/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/main/java/org/apache/hyracks/dataflow/std/join/OptimizedHybridHashJoinOperatorDescriptor.java
 
b/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/main/java/org/apache/hyracks/dataflow/std/join/OptimizedHybridHashJoinOperatorDescriptor.java
index 97f9c24..c142113 100644
--- 
a/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/main/java/org/apache/hyracks/dataflow/std/join/OptimizedHybridHashJoinOperatorDescriptor.java
+++ 
b/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/main/java/org/apache/hyracks/dataflow/std/join/OptimizedHybridHashJoinOperatorDescriptor.java
@@ -107,6 +107,12 @@ import org.apache.logging.log4j.Logger;
  */
 
 public class OptimizedHybridHashJoinOperatorDescriptor extends 
AbstractOperatorDescriptor {
+    /**
+     * Use a random seed to avoid hash collision with the hash exchange operator.
+     * See https://issues.apache.org/jira/browse/ASTERIXDB-2783 for more details.
+     */
+    private static final int INIT_SEED = 982028031;
+
     private static final int BUILD_AND_PARTITION_ACTIVITY_ID = 0;
     private static final int PARTITION_AND_JOIN_ACTIVITY_ID = 1;
 
@@ -269,10 +275,11 @@ public class OptimizedHybridHashJoinOperatorDescriptor 
extends AbstractOperatorD
                         ctx.getJobletContext().getJobId(), new 
TaskId(getActivityId(), partition));
 
                 ITuplePartitionComputer probeHpc =
-                        new FieldHashPartitionComputerFamily(probeKeys, 
propHashFunctionFactories).createPartitioner(0);
+                        new FieldHashPartitionComputerFamily(probeKeys, 
propHashFunctionFactories)
+                                .createPartitioner(INIT_SEED);
                 ITuplePartitionComputer buildHpc =
                         new FieldHashPartitionComputerFamily(buildKeys, 
buildHashFunctionFactories)
-                                .createPartitioner(0);
+                                .createPartitioner(INIT_SEED);
                 boolean failed = false;
 
                 @Override

Reply via email to