ericphanson commented on issue #485:
URL: https://github.com/apache/arrow-julia/issues/485#issuecomment-4099596822

   ```julia
   
   using Arrow
   using Tables
   
   # Create (replacing any existing file) an Arrow file at `path` that carries
   # table-level metadata plus per-column metadata for Column1 and Column2.
   # One record batch is written per iteration, so nbatches > 1 yields several
   # batches that all share the same schema.
   function write_issue485_file(path; nbatches::Int)
       rm(path; force=true)
   
       table_meta = ["file" => path, "note" => "issue #485 repro"]
       column_meta = Dict(
           :Column1 => ["id" => "chan1", "comment" => "Column1 metadata"],
           :Column2 => ["id" => "chan2", "comment" => "Column2 metadata"],
       )
   
       # The do-block form closes the writer even if a write throws.
       open(Arrow.Writer, path; metadata=table_meta, colmetadata=column_meta) do writer
           for batch in 1:nbatches
               # Same write path as the original issue: a plain matrix wrapped
               # with Tables.table(...).
               matrix = rand(2, 2) .+ batch
               Arrow.write(writer, Tables.table(matrix))
           end
       end
       return nothing
   end
   
   # Load `path` as an Arrow.Table and print the metadata visible at each level:
   #   - the table itself,
   #   - the combined (top-level) Column1 and Column2 columns,
   #   - Column1 of every partition / record batch on its own.
   function inspect_file(path)
       tbl = Arrow.Table(path)
   
       println("file: ", path)
       println("typeof(tbl.Column1): ", typeof(tbl.Column1))
       println("table metadata: ", Arrow.getmetadata(tbl))
       for name in (:Column1, :Column2)
           column = getproperty(tbl, name)
           println("top-level ", name, " metadata: ", Arrow.getmetadata(column))
       end
   
       # Per-partition columns still report their metadata in the failing case,
       # so print them separately for comparison.
       println("partition-local Column1 metadata:")
       for (idx, batch) in enumerate(Tables.partitions(tbl))
           meta = Arrow.getmetadata(batch.Column1)
           println("  partition ", idx, ": ", meta)
       end
       println()
   end
   ```
   
   gives
   
   ```julia
   julia> write_issue485_file("issue485_one.arrow"; nbatches=1)
   
   julia> inspect_file("issue485_one.arrow")
   file: issue485_one.arrow
   typeof(tbl.Column1): Arrow.Primitive{Float64, Vector{Float64}}
   table metadata: Base.ImmutableDict("note" => "issue #485 repro", "file" => 
"issue485_one.arrow")
   top-level Column1 metadata: Base.ImmutableDict("comment" => "Column1 
metadata", "id" => "chan1")
   top-level Column2 metadata: Base.ImmutableDict("comment" => "Column2 
metadata", "id" => "chan2")
   partition-local Column1 metadata:
     partition 1: Base.ImmutableDict("comment" => "Column1 metadata", "id" => 
"chan1")
   ```
   
   So top-level column metadata works as expected when the file has 1 batch, 
but with 2 batches:
   
   ```julia
   julia> write_issue485_file("issue485_two.arrow"; nbatches=2)
   
   julia> inspect_file("issue485_two.arrow")
   file: issue485_two.arrow
   typeof(tbl.Column1): SentinelArrays.ChainedVector{Float64, 
Arrow.Primitive{Float64, Vector{Float64}}}
   table metadata: Base.ImmutableDict("note" => "issue #485 repro", "file" => 
"issue485_two.arrow")
   top-level Column1 metadata: nothing
   top-level Column2 metadata: nothing
   partition-local Column1 metadata:
     partition 1: Base.ImmutableDict("comment" => "Column1 metadata", "id" => 
"chan1")
     partition 2: Base.ImmutableDict("comment" => "Column1 metadata", "id" => 
"chan1")
   ```
   
   Now the top-level column metadata is `nothing`, even though the individual 
partitions still have the metadata.
   
   I think a `getmetadata` method for `ChainedVector` columns is missing; for example, defining this one makes it work:
   
   
   ```julia
   julia> using Arrow, SentinelArrays
   
   julia> function Arrow.getmetadata(
              x::SentinelArrays.ChainedVector{T,A},
          ) where {T,A<:Arrow.ArrowVector}
              # Column metadata for a chained (multi-batch) Arrow column.
              # Returns `nothing` when there are no chunks, otherwise the
              # metadata of the first chunk. Throws an ArgumentError if any
              # later chunk reports different metadata.
              isempty(x.arrays) && return nothing
   
              m0 = Arrow.getmetadata(first(x.arrays))
              # Kept on one short line so mail clients cannot wrap the literal.
              msg = "inconsistent column metadata across ChainedVector chunks"
              for a in Iterators.drop(x.arrays, 1)
                  m = Arrow.getmetadata(a)
                  if m != m0
                      throw(ArgumentError(msg))
                  end
              end
              return m0
          end
   getmetadata (generic function with 5 methods)
   
   julia> inspect_file("issue485_two.arrow")
   file: issue485_two.arrow
   typeof(tbl.Column1): ChainedVector{Float64, Arrow.Primitive{Float64, 
Vector{Float64}}}
   table metadata: Base.ImmutableDict("note" => "issue #485 repro", "file" => 
"issue485_two.arrow")
   top-level Column1 metadata: Base.ImmutableDict("comment" => "Column1 
metadata", "id" => "chan1")
   top-level Column2 metadata: Base.ImmutableDict("comment" => "Column2 
metadata", "id" => "chan2")
   partition-local Column1 metadata:
     partition 1: Base.ImmutableDict("comment" => "Column1 metadata", "id" => 
"chan1")
     partition 2: Base.ImmutableDict("comment" => "Column1 metadata", "id" => 
"chan1")
   ```
   


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to