This is an automated email from the ASF dual-hosted git repository.
dheres pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git
The following commit(s) were added to refs/heads/master by this push:
new 30cc674 Add tests for hash collisions (#842)
30cc674 is described below
commit 30cc6746ec9b7f5b181c589f6687fa2ebfaa5d18
Author: Andrew Lamb <[email protected]>
AuthorDate: Mon Aug 9 11:37:43 2021 -0400
Add tests for hash collisions (#842)
* Add force_hash_collisions feature flag
* Add CI job for hash collisions
* Disable tests that check values of hashes
* Disable failing join tests when force_hash_collisions is enabled
---
.github/workflows/rust.yml | 48 +++++++++++++++++++++++++++++
datafusion/Cargo.toml | 3 ++
datafusion/src/physical_plan/hash_join.rs | 8 +++++
datafusion/src/physical_plan/hash_utils.rs | 23 +++++++++++++-
datafusion/src/physical_plan/repartition.rs | 3 ++
datafusion/tests/sql.rs | 6 ++++
6 files changed, 90 insertions(+), 1 deletion(-)
diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
index 89c56b5..5246db6 100644
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -312,6 +312,54 @@ jobs:
# Ignore MIRI errors until we can get a clean run
cargo miri test || true
+
+ # Check answers are correct when hash values collide
+ hash-collisions:
+ name: Test Hash Collisions on AMD64 Rust ${{ matrix.rust }}
+ needs: [linux-build-lib]
+ runs-on: ubuntu-latest
+ strategy:
+ matrix:
+ arch: [amd64]
+ rust: [stable]
+ container:
+ image: ${{ matrix.arch }}/rust
+ env:
+ # Disable full debug symbol generation to speed up CI build and keep memory down
+ # "1" means line tables only, which is useful for panic tracebacks.
+ RUSTFLAGS: "-C debuginfo=1"
+ steps:
+ - uses: actions/checkout@v2
+ with:
+ submodules: true
+ - name: Cache Cargo
+ uses: actions/cache@v2
+ with:
+ path: /github/home/.cargo
+ # this key equals the ones on `linux-build-lib` for re-use
+ key: cargo-cache-
+ - name: Cache Rust dependencies
+ uses: actions/cache@v2
+ with:
+ path: /github/home/target
+ # this key equals the ones on `linux-build-lib` for re-use
+ key: ${{ runner.os }}-${{ matrix.arch }}-target-cache-${{ matrix.rust }}
+ - name: Setup Rust toolchain
+ run: |
+ rustup toolchain install ${{ matrix.rust }}
+ rustup default ${{ matrix.rust }}
+ rustup component add rustfmt
+ - name: Run tests
+ run: |
+ export ARROW_TEST_DATA=$(pwd)/testing/data
+ export PARQUET_TEST_DATA=$(pwd)/parquet-testing/data
+ cd datafusion
+ # Force all hash values to collide
+ cargo test --features=force_hash_collisions
+ env:
+ CARGO_HOME: "/github/home/.cargo"
+ CARGO_TARGET_DIR: "/github/home/target"
+
# Coverage job was failing. https://github.com/apache/arrow-datafusion/issues/590 tracks re-instating it
# coverage:
diff --git a/datafusion/Cargo.toml b/datafusion/Cargo.toml
index bfb3a93..9b094ac 100644
--- a/datafusion/Cargo.toml
+++ b/datafusion/Cargo.toml
@@ -42,6 +42,9 @@ simd = ["arrow/simd"]
crypto_expressions = ["md-5", "sha2"]
regex_expressions = ["regex", "lazy_static"]
unicode_expressions = ["unicode-segmentation"]
+# Used for testing ONLY: causes all values to hash to the same value (test for collisions)
+force_hash_collisions = []
+
[dependencies]
ahash = "0.7"
diff --git a/datafusion/src/physical_plan/hash_join.rs b/datafusion/src/physical_plan/hash_join.rs
index 1a57c40..fa75437 100644
--- a/datafusion/src/physical_plan/hash_join.rs
+++ b/datafusion/src/physical_plan/hash_join.rs
@@ -1372,6 +1372,8 @@ mod tests {
}
#[tokio::test]
+ // Disable until https://github.com/apache/arrow-datafusion/issues/843 fixed
+ #[cfg(not(feature = "force_hash_collisions"))]
async fn join_full_multi_batch() {
let left = build_table(
("a1", &vec![1, 2, 3]),
@@ -1637,6 +1639,8 @@ mod tests {
}
#[tokio::test]
+ // Disable until https://github.com/apache/arrow-datafusion/issues/843 fixed
+ #[cfg(not(feature = "force_hash_collisions"))]
async fn join_right_one() -> Result<()> {
let left = build_table(
("a1", &vec![1, 2, 3]),
@@ -1673,6 +1677,8 @@ mod tests {
}
#[tokio::test]
+ // Disable until https://github.com/apache/arrow-datafusion/issues/843 fixed
+ #[cfg(not(feature = "force_hash_collisions"))]
async fn partitioned_join_right_one() -> Result<()> {
let left = build_table(
("a1", &vec![1, 2, 3]),
@@ -1710,6 +1716,8 @@ mod tests {
}
#[tokio::test]
+ // Disable until https://github.com/apache/arrow-datafusion/issues/843 fixed
+ #[cfg(not(feature = "force_hash_collisions"))]
async fn join_full_one() -> Result<()> {
let left = build_table(
("a1", &vec![1, 2, 3]),
diff --git a/datafusion/src/physical_plan/hash_utils.rs b/datafusion/src/physical_plan/hash_utils.rs
index abfa09a..6a622df 100644
--- a/datafusion/src/physical_plan/hash_utils.rs
+++ b/datafusion/src/physical_plan/hash_utils.rs
@@ -298,11 +298,28 @@ fn create_hashes_dictionary<K: ArrowDictionaryKeyType>(
Ok(())
}
+/// Test version of `create_hashes` that produces the same value for
+/// all hashes (to test collisions)
+///
+/// See comments on `hashes_buffer` for more details
+#[cfg(feature = "force_hash_collisions")]
+pub fn create_hashes<'a>(
+ _arrays: &[ArrayRef],
+ _random_state: &RandomState,
+ hashes_buffer: &'a mut Vec<u64>,
+) -> Result<&'a mut Vec<u64>> {
+ for hash in hashes_buffer.iter_mut() {
+ *hash = 0
+ }
+ return Ok(hashes_buffer);
+}
+
/// Creates hash values for every row, based on the values in the
/// columns.
///
/// The number of rows to hash is determined by `hashes_buffer.len()`.
/// `hashes_buffer` should be pre-sized appropriately
+#[cfg(not(feature = "force_hash_collisions"))]
pub fn create_hashes<'a>(
arrays: &[ArrayRef],
random_state: &RandomState,
@@ -661,6 +678,8 @@ mod tests {
}
#[test]
+ // Tests actual values of hashes, which are different if forcing collisions
+ #[cfg(not(feature = "force_hash_collisions"))]
fn create_hashes_for_dict_arrays() {
let strings = vec![Some("foo"), None, Some("bar"), Some("foo"), None];
@@ -697,12 +716,14 @@ mod tests {
assert_eq!(strings[0], strings[3]);
assert_eq!(dict_hashes[0], dict_hashes[3]);
- // different strings should matp to different hash values
+ // different strings should map to different hash values
assert_ne!(strings[0], strings[2]);
assert_ne!(dict_hashes[0], dict_hashes[2]);
}
#[test]
+ // Tests actual values of hashes, which are different if forcing collisions
+ #[cfg(not(feature = "force_hash_collisions"))]
fn create_multi_column_hash_for_dict_arrays() {
let strings1 = vec![Some("foo"), None, Some("bar")];
let strings2 = vec![Some("blarg"), Some("blah"), None];
diff --git a/datafusion/src/physical_plan/repartition.rs b/datafusion/src/physical_plan/repartition.rs
index b59071a..eb3fe55 100644
--- a/datafusion/src/physical_plan/repartition.rs
+++ b/datafusion/src/physical_plan/repartition.rs
@@ -732,6 +732,9 @@ mod tests {
}
#[tokio::test]
+ // skip this test when hash function is different because the hard
+ // coded expected output is a function of the hash values
+ #[cfg(not(feature = "force_hash_collisions"))]
async fn repartition_with_dropping_output_stream() {
#[derive(Debug)]
struct Case<'a> {
diff --git a/datafusion/tests/sql.rs b/datafusion/tests/sql.rs
index 0c33bd4..046e4f2 100644
--- a/datafusion/tests/sql.rs
+++ b/datafusion/tests/sql.rs
@@ -1797,6 +1797,8 @@ async fn equijoin_left_and_condition_from_right() -> Result<()> {
}
#[tokio::test]
+// Disable until https://github.com/apache/arrow-datafusion/issues/843 fixed
+#[cfg(not(feature = "force_hash_collisions"))]
async fn equijoin_right_and_condition_from_left() -> Result<()> {
let mut ctx = create_join_context("t1_id", "t2_id")?;
let sql =
@@ -1850,6 +1852,8 @@ async fn left_join() -> Result<()> {
}
#[tokio::test]
+// Disable until https://github.com/apache/arrow-datafusion/issues/843 fixed
+#[cfg(not(feature = "force_hash_collisions"))]
async fn right_join() -> Result<()> {
let mut ctx = create_join_context("t1_id", "t2_id")?;
let equivalent_sql = [
@@ -1870,6 +1874,8 @@ async fn right_join() -> Result<()> {
}
#[tokio::test]
+// Disable until https://github.com/apache/arrow-datafusion/issues/843 fixed
+#[cfg(not(feature = "force_hash_collisions"))]
async fn full_join() -> Result<()> {
let mut ctx = create_join_context("t1_id", "t2_id")?;
let equivalent_sql = [