blaginin opened a new issue, #16228:
URL: https://github.com/apache/datafusion/issues/16228

   ### Describe the bug
   
   `Array::is_null` for dictionary arrays doesn't account for nulls in the 
values array. This causes hard-to-debug issues
   
   ### To Reproduce
   
   ```rust
   use arrow::array::{ArrayRef, DictionaryArray, Int32Array, RecordBatch, 
StringArray};
   use arrow::datatypes::{DataType, Field, Schema};
   use datafusion::catalog::MemTable;
   use datafusion::prelude::SessionContext;
   use datafusion_common::ScalarValue;
   use datafusion_execution::config::SessionConfig;
   use std::sync::Arc;
   
   #[tokio::main]
   async fn main() -> anyhow::Result<()> {
   
       let n: usize = 5;
       let num = Arc::new(Int32Array::from((0..n as _).collect::<Vec<i32>>())) 
as ArrayRef;
   
       let dict_values = StringArray::from(vec![None, Some("abc")]);
       let dict_indices = Int32Array::from(vec![0; n]); 
       // all idx point to 0 - which means that all values in the dictionary 
are None
       let dict = DictionaryArray::new(dict_indices, Arc::new(dict_values) as 
ArrayRef);
   
       let schema = Arc::new(Schema::new(vec![
           Field::new("num1", DataType::Int32, false),
           Field::new("num2", DataType::Int32, false), // num2 to disable 
SingleDistinctToGroupBy optimisation
           Field::new(
               "dict",
               DataType::Dictionary(Box::new(DataType::Int32), 
Box::new(DataType::Utf8)),
               true,
           ),
       ]));
   
       let batch = RecordBatch::try_new(
           schema.clone(),
           vec![num.clone(), num.clone(), Arc::new(dict)],
       )?;
   
       let provider = MemTable::try_new(schema, vec![vec![batch]])?;
       let mut session_config = SessionConfig::default();
   
   
       session_config = session_config.set(
           "datafusion.execution.target_partitions",
           &ScalarValue::UInt64(Some(1u64)), // won't work with more than 1 
partition
       );
   
       let ctx = SessionContext::new_with_config(session_config);
       ctx.register_table("t", Arc::new(provider))?;
   
       let df = ctx.sql("select count(distinct dict), count(num2) num1 from t 
group by num1").await?;
       // count(distinct ...) doesn't count None values, so the result should 
be 0
       df.show().await?;
   
       Ok(())
   }
   ```
   
   This will produce 
   ```sql
   +------------------------+------+
   | count(DISTINCT t.dict) | num1 |
   +------------------------+------+
   | 1                      | 1    |
   | 1                      | 1    |
   | 1                      | 1    |
   | 1                      | 1    |
   | 1                      | 1    |
   +------------------------+------+
   ```
   
   `dict` only has null values so the correct result should be 0
   If you change target partitions to `2` or remove `num2` the results will be 
`0` for all columns which is correct
   
   ### Expected behavior
   
   We should either disallow nulls in dictionary values or handle them 
explicitly (the latter is preferable IMO).
   
   ### Additional context
   
   The fix will likely be in the Arrow repo, but created this issue for 
discussion and visibility.
   


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org
For additional commands, e-mail: github-h...@datafusion.apache.org

Reply via email to