JayjeetAtGithub commented on code in PR #4736:
URL: https://github.com/apache/arrow-rs/pull/4736#discussion_r1306864420


##########
arrow-row/src/lib.rs:
##########
@@ -1445,6 +1446,70 @@ unsafe fn decode_column(
     Ok(array)
 }
 
+macro_rules! downcast_dict {
+    ($array:ident, $key:ident) => {{
+        $array
+            .as_any()
+            .downcast_ref::<DictionaryArray<$key>>()
+            .unwrap()
+    }};
+}
+
+const LOW_CARDINALITY_THRESHOLD: usize = 10;
+
+#[derive(Debug)]
+pub struct CardinalityAwareRowConverter {
+    inner: RowConverter,
+    done: bool,
+}
+
+impl CardinalityAwareRowConverter {
+    pub fn new(fields: Vec<SortField>) -> Result<Self, ArrowError> {
+        Ok(Self {
+            inner: RowConverter::new(fields)?,
+            done: false,
+        })
+    }
+
+    pub fn size(&self) -> usize {
+        self.inner.size()
+    }
+
+    pub fn convert_rows(&self, rows: &Rows) -> Result<Vec<ArrayRef>, 
ArrowError> {
+        self.inner.convert_rows(rows)
+    }
+
+    pub fn convert_columns(
+        &mut self,
+        columns: &[ArrayRef]) -> Result<Rows, ArrowError> {
+        if !self.done {
+            for (i, col) in columns.iter().enumerate() {
+                if let DataType::Dictionary(k, _) = col.data_type() {
+                    // let cardinality = 
col.as_any().downcast_ref::<DictionaryArray<Int32Type>>().unwrap().values().len();
+                    let cardinality = match k.as_ref() {
+                        DataType::Int8 => downcast_dict!(col, 
Int32Type).values().len(),
+                        DataType::Int16 => downcast_dict!(col, 
Int32Type).values().len(),
+                        DataType::Int32 => downcast_dict!(col, 
Int32Type).values().len(),
+                        DataType::Int64 => downcast_dict!(col, 
Int64Type).values().len(),
+                        DataType::UInt16 => downcast_dict!(col, 
UInt16Type).values().len(),
+                        DataType::UInt32 => downcast_dict!(col, 
UInt32Type).values().len(),
+                        DataType::UInt64 => downcast_dict!(col, 
UInt64Type).values().len(),
+                        _ => unreachable!(),
+                    };
+
+                    if cardinality >= LOW_CARDINALITY_THRESHOLD {

Review Comment:
   @alamb I am sorry, I didn't quite get it. I thought what I was doing was, 
tapping into the first batch of the stream, looking into it, setting the right 
codec (whether to use the interner or not) to encode the batches, and then let 
the conversion going for all the batches (including the first one). 



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to