alamb opened a new issue #781:
URL: https://github.com/apache/arrow-datafusion/issues/781


   **Describe the bug**
   Nulls are ignored when aggregating dictionary arrays. 
   
   **To Reproduce**
   Create a dictionary array with nulls and then aggregate using that dictionary
   
   Here is an example:
   
   ```rust
   async fn dictionary_test() {
       let key_builder = PrimitiveBuilder::<Int8Type>::new(100);
       let value_builder = StringBuilder::new(100);
       let mut builder = StringDictionaryBuilder::new(key_builder, 
value_builder);
   
   
       // The builder builds the dictionary value by value
       builder.append("b").unwrap();
       builder.append_null().unwrap();
       builder.append("b").unwrap();
       builder.append("a").unwrap();
       builder.append("c").unwrap();
       builder.append_null().unwrap();
   
       let array = builder.finish();
   
       println!("The keys");
       print_primitive_array(array.keys());
   
       println!("The values");
       print_array(array.values());
   
       println!("The array:");
   
       let array = Arc::new(array) as ArrayRef;
       print_array(&array);
   
       // Now, let's run a query showing the invalid values are being used as 
well:
       let batch = RecordBatch::try_from_iter(vec![("dict", array)]).unwrap();
       let table = MemTable::try_new(batch.schema(), 
vec![vec![batch]]).unwrap();
       let table = Arc::new(table);
   
       // +------+
       // | dict |
       // +------+
       // | b    |
       // |      |
       // | b    |
       // | a    |
       // | c    |
       // |      |
       // +------+
       run_query(table.clone(), "select * from t").await;
   
       // now aggregate the query
       // (wrong results -- there are only 2 values for b, but it shows up as 4)
       // +------+-----------------+
       // | dict | COUNT(UInt8(1)) |
       // +------+-----------------+
       // | a    | 1               |
       // | c    | 1               |
       // | b    | 4               |
       // +------+-----------------+
       run_query(table.clone(), "select dict, count(*) from t group by 
dict").await;
   
   }
   
   ```
   
   
   Here is a simply reproducer in sql.rs:
   
   ```diff
   diff --git a/datafusion/tests/sql.rs b/datafusion/tests/sql.rs
   index d9f7c6ea4..a323fc499 100644
   --- a/datafusion/tests/sql.rs
   +++ b/datafusion/tests/sql.rs
   @@ -3019,6 +3019,15 @@ async fn query_on_string_dictionary() -> Result<()> {
        let expected = vec![vec!["2"]];
        assert_eq!(expected, actual);
    
   +    // grouping
   +    let sql = "SELECT d1, COUNT(*) FROM test group by d1";
   +    let actual = execute(&mut ctx, sql).await;
   +    let expected = vec![
   +        vec!["one", "1"],
   +        vec!["three", "1"],
   +    ];
   +    assert_eq!(expected, actual);
   +
        Ok(())
    }
    ```
   
   results in 
   ```
   ---- query_on_string_dictionary stdout ----
   thread 'query_on_string_dictionary' panicked at 'assertion failed: `(left == 
right)`
     left: `[["one", "1"], ["three", "1"]]`,
    right: `[["one", "2"], ["three", "1"]]`', datafusion/tests/sql.rs:3029:5
   note: run with `RUST_BACKTRACE=1` environment variable to display a backtrace
   
   ```
   
   **Expected behavior**
   Null entries should be ignored, but instead whatever value in the keys array 
is at that position is used to find a value. 
   
   **Additional context**
   We found this in IOx (where @tustvold clevery used `-1` for entries in 
`keys` that were NULL rather than `0`): 
https://github.com/influxdata/influxdb_iox/issues/2112


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to