This is an automated email from the ASF dual-hosted git repository.

dheres pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


The following commit(s) were added to refs/heads/master by this push:
     new 30cc674  Add tests for hash collisions (#842)
30cc674 is described below

commit 30cc6746ec9b7f5b181c589f6687fa2ebfaa5d18
Author: Andrew Lamb <[email protected]>
AuthorDate: Mon Aug 9 11:37:43 2021 -0400

    Add tests for hash collisions (#842)
    
    * Add force_hash_collisions feature flag
    
    * Add CI job for hash collisions
    
    * Disable tests that check values of hashes
    
    * Disable failing join tests when force_hash_collisions is enabled
---
 .github/workflows/rust.yml                  | 48 +++++++++++++++++++++++++++++
 datafusion/Cargo.toml                       |  3 ++
 datafusion/src/physical_plan/hash_join.rs   |  8 +++++
 datafusion/src/physical_plan/hash_utils.rs  | 23 +++++++++++++-
 datafusion/src/physical_plan/repartition.rs |  3 ++
 datafusion/tests/sql.rs                     |  6 ++++
 6 files changed, 90 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
index 89c56b5..5246db6 100644
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -312,6 +312,54 @@ jobs:
           # Ignore MIRI errors until we can get a clean run
           cargo miri test || true
 
+
+  # Check answers are correct when hash values collide
+  hash-collisions:
+    name: Test Hash Collisions on AMD64 Rust ${{ matrix.rust }}
+    needs: [linux-build-lib]
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        arch: [amd64]
+        rust: [stable]
+    container:
+      image: ${{ matrix.arch }}/rust
+      env:
+        # Disable full debug symbol generation to speed up CI build and keep memory down
+        # "1" means line tables only, which is useful for panic tracebacks.
+        RUSTFLAGS: "-C debuginfo=1"
+    steps:
+      - uses: actions/checkout@v2
+        with:
+          submodules: true
+      - name: Cache Cargo
+        uses: actions/cache@v2
+        with:
+          path: /github/home/.cargo
+          # this key equals the ones on `linux-build-lib` for re-use
+          key: cargo-cache-
+      - name: Cache Rust dependencies
+        uses: actions/cache@v2
+        with:
+          path: /github/home/target
+          # this key equals the ones on `linux-build-lib` for re-use
+          key: ${{ runner.os }}-${{ matrix.arch }}-target-cache-${{ matrix.rust }}
+      - name: Setup Rust toolchain
+        run: |
+          rustup toolchain install ${{ matrix.rust }}
+          rustup default ${{ matrix.rust }}
+          rustup component add rustfmt
+      - name: Run tests
+        run: |
+          export ARROW_TEST_DATA=$(pwd)/testing/data
+          export PARQUET_TEST_DATA=$(pwd)/parquet-testing/data
+          cd datafusion
+          # Force all hash values to collide
+          cargo test --features=force_hash_collisions
+        env:
+          CARGO_HOME: "/github/home/.cargo"
+          CARGO_TARGET_DIR: "/github/home/target"
+
# Coverage job was failing. https://github.com/apache/arrow-datafusion/issues/590 tracks re-instating it
 
   # coverage:
diff --git a/datafusion/Cargo.toml b/datafusion/Cargo.toml
index bfb3a93..9b094ac 100644
--- a/datafusion/Cargo.toml
+++ b/datafusion/Cargo.toml
@@ -42,6 +42,9 @@ simd = ["arrow/simd"]
 crypto_expressions = ["md-5", "sha2"]
 regex_expressions = ["regex", "lazy_static"]
 unicode_expressions = ["unicode-segmentation"]
+# Used for testing ONLY: causes all values to hash to the same value (test for collisions)
+force_hash_collisions = []
+
 
 [dependencies]
 ahash = "0.7"
diff --git a/datafusion/src/physical_plan/hash_join.rs b/datafusion/src/physical_plan/hash_join.rs
index 1a57c40..fa75437 100644
--- a/datafusion/src/physical_plan/hash_join.rs
+++ b/datafusion/src/physical_plan/hash_join.rs
@@ -1372,6 +1372,8 @@ mod tests {
     }
 
     #[tokio::test]
+    // Disable until https://github.com/apache/arrow-datafusion/issues/843 fixed
+    #[cfg(not(feature = "force_hash_collisions"))]
     async fn join_full_multi_batch() {
         let left = build_table(
             ("a1", &vec![1, 2, 3]),
@@ -1637,6 +1639,8 @@ mod tests {
     }
 
     #[tokio::test]
+    // Disable until https://github.com/apache/arrow-datafusion/issues/843 fixed
+    #[cfg(not(feature = "force_hash_collisions"))]
     async fn join_right_one() -> Result<()> {
         let left = build_table(
             ("a1", &vec![1, 2, 3]),
@@ -1673,6 +1677,8 @@ mod tests {
     }
 
     #[tokio::test]
+    // Disable until https://github.com/apache/arrow-datafusion/issues/843 fixed
+    #[cfg(not(feature = "force_hash_collisions"))]
     async fn partitioned_join_right_one() -> Result<()> {
         let left = build_table(
             ("a1", &vec![1, 2, 3]),
@@ -1710,6 +1716,8 @@ mod tests {
     }
 
     #[tokio::test]
+    // Disable until https://github.com/apache/arrow-datafusion/issues/843 fixed
+    #[cfg(not(feature = "force_hash_collisions"))]
     async fn join_full_one() -> Result<()> {
         let left = build_table(
             ("a1", &vec![1, 2, 3]),
diff --git a/datafusion/src/physical_plan/hash_utils.rs b/datafusion/src/physical_plan/hash_utils.rs
index abfa09a..6a622df 100644
--- a/datafusion/src/physical_plan/hash_utils.rs
+++ b/datafusion/src/physical_plan/hash_utils.rs
@@ -298,11 +298,28 @@ fn create_hashes_dictionary<K: ArrowDictionaryKeyType>(
     Ok(())
 }
 
+/// Test version of `create_hashes` that produces the same value for
+/// all hashes (to test collisions)
+///
+/// See comments on `hashes_buffer` for more details
+#[cfg(feature = "force_hash_collisions")]
+pub fn create_hashes<'a>(
+    _arrays: &[ArrayRef],
+    _random_state: &RandomState,
+    hashes_buffer: &'a mut Vec<u64>,
+) -> Result<&'a mut Vec<u64>> {
+    for hash in hashes_buffer.iter_mut() {
+        *hash = 0
+    }
+    return Ok(hashes_buffer);
+}
+
 /// Creates hash values for every row, based on the values in the
 /// columns.
 ///
 /// The number of rows to hash is determined by `hashes_buffer.len()`.
 /// `hashes_buffer` should be pre-sized appropriately
+#[cfg(not(feature = "force_hash_collisions"))]
 pub fn create_hashes<'a>(
     arrays: &[ArrayRef],
     random_state: &RandomState,
@@ -661,6 +678,8 @@ mod tests {
     }
 
     #[test]
+    // Tests actual values of hashes, which are different if forcing collisions
+    #[cfg(not(feature = "force_hash_collisions"))]
     fn create_hashes_for_dict_arrays() {
         let strings = vec![Some("foo"), None, Some("bar"), Some("foo"), None];
 
@@ -697,12 +716,14 @@ mod tests {
         assert_eq!(strings[0], strings[3]);
         assert_eq!(dict_hashes[0], dict_hashes[3]);
 
-        // different strings should matp to different hash values
+        // different strings should map to different hash values
         assert_ne!(strings[0], strings[2]);
         assert_ne!(dict_hashes[0], dict_hashes[2]);
     }
 
     #[test]
+    // Tests actual values of hashes, which are different if forcing collisions
+    #[cfg(not(feature = "force_hash_collisions"))]
     fn create_multi_column_hash_for_dict_arrays() {
         let strings1 = vec![Some("foo"), None, Some("bar")];
         let strings2 = vec![Some("blarg"), Some("blah"), None];
diff --git a/datafusion/src/physical_plan/repartition.rs b/datafusion/src/physical_plan/repartition.rs
index b59071a..eb3fe55 100644
--- a/datafusion/src/physical_plan/repartition.rs
+++ b/datafusion/src/physical_plan/repartition.rs
@@ -732,6 +732,9 @@ mod tests {
     }
 
     #[tokio::test]
+    // skip this test when hash function is different because the hard
+    // coded expected output is a function of the hash values
+    #[cfg(not(feature = "force_hash_collisions"))]
     async fn repartition_with_dropping_output_stream() {
         #[derive(Debug)]
         struct Case<'a> {
diff --git a/datafusion/tests/sql.rs b/datafusion/tests/sql.rs
index 0c33bd4..046e4f2 100644
--- a/datafusion/tests/sql.rs
+++ b/datafusion/tests/sql.rs
@@ -1797,6 +1797,8 @@ async fn equijoin_left_and_condition_from_right() -> Result<()> {
 }
 
 #[tokio::test]
+// Disable until https://github.com/apache/arrow-datafusion/issues/843 fixed
+#[cfg(not(feature = "force_hash_collisions"))]
 async fn equijoin_right_and_condition_from_left() -> Result<()> {
     let mut ctx = create_join_context("t1_id", "t2_id")?;
     let sql =
@@ -1850,6 +1852,8 @@ async fn left_join() -> Result<()> {
 }
 
 #[tokio::test]
+// Disable until https://github.com/apache/arrow-datafusion/issues/843 fixed
+#[cfg(not(feature = "force_hash_collisions"))]
 async fn right_join() -> Result<()> {
     let mut ctx = create_join_context("t1_id", "t2_id")?;
     let equivalent_sql = [
@@ -1870,6 +1874,8 @@ async fn right_join() -> Result<()> {
 }
 
 #[tokio::test]
+// Disable until https://github.com/apache/arrow-datafusion/issues/843 fixed
+#[cfg(not(feature = "force_hash_collisions"))]
 async fn full_join() -> Result<()> {
     let mut ctx = create_join_context("t1_id", "t2_id")?;
     let equivalent_sql = [

Reply via email to