chuchiy opened a new issue, #637:
URL: https://github.com/apache/arrow-go/issues/637

   ### Describe the bug, including details regarding any error messages, 
version, and platform.
   
   Ref [pola-rs/polars#26126](https://github.com/pola-rs/polars/issues/26126)
   
   polars cannot read an Arrow IPC file generated by arrow-go when the file is 
compressed and has a column with a null value. But when we use pyarrow to write 
such an IPC file, polars reads it fine.
   
   The problem may be caused by the 'uncompressed buffer length (in bytes)' 
calculation of the validity bitmap. The buffer length is rounded up to the 
nearest 4-byte multiple (i.e., 32-bit multiple) instead of the nearest 1-byte 
multiple (i.e., 8-bit multiple).
   
   Using the following sample Go and Python code, polars can read the 
pyarrow-generated file but fails to read the arrow-go-generated file.
   
   Sample go code
   
   ```go
   package main
   
   import (
        "fmt"
        "os"
   
        "github.com/apache/arrow-go/v18/arrow"
        "github.com/apache/arrow-go/v18/arrow/array"
        "github.com/apache/arrow-go/v18/arrow/ipc"
        "github.com/apache/arrow-go/v18/arrow/memory"
   )
   
   func main() {
        // 1. Initialize the allocator for memory management
        pool := memory.NewGoAllocator()
   
        // 2. Define the Schema
        schema := arrow.NewSchema(
                []arrow.Field{
                        {Name: "id", Type: arrow.PrimitiveTypes.Int64, 
Nullable: true},
                        {Name: "user", Type: arrow.BinaryTypes.String, 
Nullable: true},
                },
                nil,
        )
   
        // 3. Build the Data Arrays
        b := array.NewRecordBuilder(pool, schema)
        defer b.Release()
   
        // Populate Column 1 (ID)
   
        i64b := b.Field(0).(*array.Int64Builder)
        i64b.Append(1)
        i64b.Append(2)
        i64b.AppendNull()
        i64b.Append(3)
        // Populate Column 2 (User)
        b.Field(1).(*array.StringBuilder).AppendValues([]string{"Alice", "Bob", 
"Charlie", "David"}, nil)
   
   
        // Create a Record Batch
        rec := b.NewRecordBatch()
        defer rec.Release()
   
        // 4. Create the File
        f, err := os.Create("data.arrow")
        if err != nil {
                fmt.Printf("failed to create file: %v\n", err)
                return
        }
        defer f.Close()
   
        // 5. Initialize the IPC Writer with Zstd Compression
        // We use ipc.WithZstd() to enable the compression codec.
        writer, err := ipc.NewFileWriter(f,
                ipc.WithSchema(schema),
                ipc.WithZstd(), // This triggers the Zstd compressor
                // ipc.WithMinSpaceSavings(0.2),
        )
        if err != nil {
                fmt.Printf("failed to create writer: %v\n", err)
                return
        }
        defer writer.Close()
   
        // 6. Write the record to the file
        err = writer.Write(rec)
        if err != nil {
                fmt.Printf("failed to write record: %v\n", err)
                return
        }
   
        fmt.Println("Arrow file 'data.arrow' written successfully with Zstd 
compression.")
   }
   
   ```
   
   
   Sample pyarrow code
   
   
   ```python
   
   # Write a zstd-compressed Arrow IPC file with pyarrow, for comparison with
   # the file produced by the Go sample.
   import pyarrow as pa
   import pyarrow.ipc as ipc
   
   # Single "id" column whose third entry is None, i.e. a null value.
   data = {"id": [1, 2, None, 4]}
   
   table = pa.Table.from_pydict(data)
   file_buffer = "./gen_pa.ipc"
   # Request zstd compression for the written IPC data.
   options = ipc.IpcWriteOptions(compression="zstd")
   with ipc.new_file(file_buffer, table.schema, options=options) as writer:
       writer.write_table(table)
   ```
   
   ``` python
   # Write a zstd-compressed Arrow IPC file with pyarrow, for comparison with
   # the file produced by the Go sample.
   import pyarrow as pa
   import pyarrow.ipc as ipc
   
   # Single "id" column whose third entry is None, i.e. a null value.
   data = {"id": [1, 2, None, 4]}
   
   table = pa.Table.from_pydict(data)
   file_buffer = "./gen_pa.ipc"
   # Request zstd compression for the written IPC data.
   options = ipc.IpcWriteOptions(compression="zstd")
   with ipc.new_file(file_buffer, table.schema, options=options) as writer:
       writer.write_table(table)
   ```
   
   polars read code
   
   ```python
   # Read 'data.arrow', the file name written by the Go sample; this is the
   # read that the report says fails in polars.
   import polars as pl
   pl.read_ipc('data.arrow')
   ```
   
   ### Component(s)
   
   Other


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to