alamb commented on code in PR #9623:
URL: https://github.com/apache/arrow-rs/pull/9623#discussion_r3041518519


##########
arrow-ipc/src/reader.rs:
##########
@@ -3248,4 +3251,81 @@ mod tests {
         let reader = StreamReader::try_new(Cursor::new(buf), None);
         assert!(reader.is_err());
     }
+
+    /// Per the IPC specification, dictionary batches may be omitted for
+    /// dictionary-encoded columns where all values are null.  The C++
+    /// implementation relies on this and does not emit a dictionary batch
+    /// in that case.  Verify that the Rust reader handles such streams
+    /// by synthesizing an empty dictionary instead of returning an error.
+    #[test]
+    fn test_read_null_dict_without_dictionary_batch() {
+        // Build an all-null dictionary-encoded column.
+        let keys = Int32Array::new_null(4);
+        let values: ArrayRef = new_empty_array(&DataType::Utf8);
+        let dict_array = DictionaryArray::new(keys, values);
+
+        let schema = Arc::new(Schema::new(vec![Field::new(
+            "d",
+            dict_array.data_type().clone(),
+            true,
+        )]));
+        let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(dict_array)]).unwrap();
+
+        // Write a normal IPC stream (which includes the dictionary batch).
+        let full_stream = write_stream(&batch);
+
+        // Parse the stream into individual messages and reconstruct it
+        // without the DictionaryBatch message, simulating what C++ emits
+        // for an all-null dictionary column.
+        let mut stripped = Vec::new();
+        let mut cursor = Cursor::new(&full_stream);
+        loop {
+            // Each message is: [continuation (4 bytes)] [meta_len (4 bytes)]
+            //                   [metadata (meta_len bytes)] [body (bodyLength bytes)]
+            let mut header = [0u8; 4];
+            if std::io::Read::read_exact(&mut cursor, &mut header).is_err() {

Review Comment:
   I took the liberty of pushing a commit to simplify it



##########
arrow-ipc/src/reader.rs:
##########
@@ -3248,4 +3251,81 @@ mod tests {
         let reader = StreamReader::try_new(Cursor::new(buf), None);
         assert!(reader.is_err());
     }
+
+    /// Per the IPC specification, dictionary batches may be omitted for
+    /// dictionary-encoded columns where all values are null.  The C++
+    /// implementation relies on this and does not emit a dictionary batch
+    /// in that case.  Verify that the Rust reader handles such streams
+    /// by synthesizing an empty dictionary instead of returning an error.
+    #[test]
+    fn test_read_null_dict_without_dictionary_batch() {
+        // Build an all-null dictionary-encoded column.
+        let keys = Int32Array::new_null(4);
+        let values: ArrayRef = new_empty_array(&DataType::Utf8);
+        let dict_array = DictionaryArray::new(keys, values);
+
+        let schema = Arc::new(Schema::new(vec![Field::new(
+            "d",
+            dict_array.data_type().clone(),
+            true,
+        )]));
+        let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(dict_array)]).unwrap();
+
+        // Write a normal IPC stream (which includes the dictionary batch).
+        let full_stream = write_stream(&batch);
+
+        // Parse the stream into individual messages and reconstruct it
+        // without the DictionaryBatch message, simulating what C++ emits
+        // for an all-null dictionary column.
+        let mut stripped = Vec::new();
+        let mut cursor = Cursor::new(&full_stream);
+        loop {
+            // Each message is: [continuation (4 bytes)] [meta_len (4 bytes)]
+            //                   [metadata (meta_len bytes)] [body (bodyLength bytes)]
+            let mut header = [0u8; 4];
+            if std::io::Read::read_exact(&mut cursor, &mut header).is_err() {

Review Comment:
   This is an unusual way to call `read_exact` -- most other times I see it written in method-call form (which requires `std::io::Read` to be in scope), like
   
   ```rust
    if cursor.read_exact(&mut header).is_err() {
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to