ForceBru opened a new issue, #590:
URL: https://github.com/apache/arrow-julia/issues/590
The `BoundsError` persists in Arrow v2.8.1.
> Here's a `BoundsError: attempt to access 0-element Vector{Vector{UInt8}}
at index [1]`:
>
> ```
> python
>>> from random import randint; col=[randint(1,500) for _ in range(100)];
print(col); import polars as pl; pl.DataFrame({'more': ['नमस्ते'*i for i in
col],'text':['k'*i for i in col]}).write_ipc("long.arrow")
> [232, 143, 235, 324, 105, 114, 47, 455, 111, 132, 125, 327, 249, 355, 317,
156, 312, 481, 107, 404, 493, 343, 41, 430, 1, 13, 107, 125, 114, 172, 443,
307, 328, 331, 318, 292, 327, 175, 41, 483, 147, 340, 309, 346, 414, 333, 103,
147, 143, 335, 132, 88, 409, 473, 45, 108, 112, 282, 150, 334, 261, 428, 316,
385, 157, 458, 348, 207, 444, 140, 425, 69, 500, 222, 472, 35, 170, 431, 11,
125, 484, 346, 187, 441, 108, 237, 18, 466, 128, 467, 466, 391, 310, 318, 171,
331, 450, 90, 194, 465]
> julia --project -e "using DataFrames; import Arrow;
Arrow.Table(\"long.arrow\") |> DataFrame |> display"
> ERROR: BoundsError: attempt to access 0-element Vector{Vector{UInt8}} at
index [1]
> Stacktrace:
> [1] throw_boundserror(A::Vector{Vector{UInt8}}, I::Tuple{Int64})
> @ Base ./essentials.jl:14
> [2] getindex
> @ ./essentials.jl:916 [inlined]
> [3] getindex(l::Arrow.View{Union{Missing, String}}, i::Int64)
> @ Arrow ~/.julia/packages/Arrow/3GbnS/src/arraytypes/views.jl:61
> [4] getindex
> @ ~/.julia/packages/DataFrames/kcA9R/src/dataframe/dataframe.jl:517
[inlined]
> [5] _pretty_tables_highlighter_func(data::DataFrame, i::Int64, j::Int64)
> @ DataFrames
~/.julia/packages/DataFrames/kcA9R/src/abstractdataframe/prettytables.jl:13
> [6] _text_process_data_cell(ptable::PrettyTables.ProcessedTable,
cell_data::PrettyTables.UndefinedCell, cell_str::String, i::Int64, j::Int64,
l::Int64, column_width::Int64, crayon::Crayons.Crayon, alignment::Symbol,
highlighters::Ref{Any})
> @ PrettyTables
~/.julia/packages/PrettyTables/oVZqx/src/backends/text/print_cell.jl:108
> [7] _text_print_table!(display::PrettyTables.Display,
ptable::PrettyTables.ProcessedTable, table_str::Matrix{Vector{String}},
actual_columns_width::Vector{Int64}, continuation_row_line::Int64,
num_lines_in_row::Vector{Int64}, num_lines_around_table::Int64,
body_hlines::Vector{Int64}, body_hlines_format::NTuple{4, Char},
continuation_row_alignment::Symbol, ellipsis_line_skip::Int64,
highlighters::Ref{Any}, hlines::Vector{Int64}, tf::PrettyTables.TextFormat,
text_crayons::PrettyTables.TextCrayons{Crayons.Crayon, Crayons.Crayon},
vlines::Vector{Int64})
> @ PrettyTables
~/.julia/packages/PrettyTables/oVZqx/src/backends/text/print_table.jl:237
> [8] _print_table_with_text_back_end(pinfo::PrettyTables.PrintInfo;
alignment_anchor_fallback::Symbol,
alignment_anchor_fallback_override::Dict{Int64, Symbol},
alignment_anchor_regex::Dict{Int64, Vector{Regex}}, autowrap::Bool,
body_hlines::Vector{Int64}, body_hlines_format::Nothing,
continuation_row_alignment::Symbol, crop::Symbol, crop_subheader::Bool,
columns_width::Int64, display_size::Tuple{Int64, Int64},
equal_columns_width::Bool, ellipsis_line_skip::Int64,
highlighters::Tuple{PrettyTables.Highlighter}, hlines::Vector{Symbol},
linebreaks::Bool, maximum_columns_width::Vector{Int64},
minimum_columns_width::Int64, newline_at_end::Bool, overwrite::Bool,
reserved_display_lines::Int64, show_omitted_cell_summary::Bool, sortkeys::Bool,
tf::PrettyTables.TextFormat, title_autowrap::Bool,
title_same_width_as_table::Bool, vcrop_mode::Symbol, vlines::Vector{Int64},
border_crayon::Crayons.Crayon, header_crayon::Crayons.Crayon,
omitted_cell_summary_crayon::Crayons.Crayon,
row_label_crayon::Crayons.Crayon, row_label_header_crayon::Crayons.Crayon,
row_number_header_crayon::Crayons.Crayon, subheader_crayon::Crayons.Crayon,
text_crayon::Crayons.Crayon, title_crayon::Crayons.Crayon)
> @ PrettyTables
~/.julia/packages/PrettyTables/oVZqx/src/backends/text/text_backend.jl:371
> [9] _print_table(io::IO, data::Any; alignment::Vector{Symbol},
backend::Val{:auto}, cell_alignment::Nothing, cell_first_line_only::Bool,
compact_printing::Bool,
formatters::Tuple{typeof(DataFrames._pretty_tables_general_formatter)},
header::Tuple{Vector{String}, Vector{String}}, header_alignment::Symbol,
header_cell_alignment::Nothing, limit_printing::Bool,
max_num_of_columns::Int64, max_num_of_rows::Int64, renderer::Symbol,
row_labels::Nothing, row_label_alignment::Symbol,
row_label_column_title::String, row_number_alignment::Symbol,
row_number_column_title::String, show_header::Bool, show_row_number::Bool,
show_subheader::Bool, title::String, title_alignment::Symbol,
kwargs::@Kwargs{alignment_anchor_fallback::Symbol,
alignment_anchor_regex::Dict{Int64, Vector{Regex}}, crop::Symbol,
ellipsis_line_skip::Int64, hlines::Vector{Symbol},
highlighters::Tuple{PrettyTables.Highlighter},
maximum_columns_width::Vector{Int64}, newline_at_end::Bool,
reserved_display_lines::Int64,
row_label_crayon::Crayons.Crayon, vcrop_mode::Symbol, vlines::Vector{Int64}})
> @ PrettyTables ~/.julia/packages/PrettyTables/oVZqx/src/print.jl:1059
> [10] _print_table
> @ ~/.julia/packages/PrettyTables/oVZqx/src/print.jl:934 [inlined]
> [11] #pretty_table#62
> @ ~/.julia/packages/PrettyTables/oVZqx/src/print.jl:825 [inlined]
> [12] pretty_table
> @ ~/.julia/packages/PrettyTables/oVZqx/src/print.jl:794 [inlined]
> [13] _show(io::Base.TTY, df::DataFrame; allrows::Bool, allcols::Bool,
rowlabel::Symbol, summary::Bool, eltypes::Bool, rowid::Nothing,
truncate::Int64, kwargs::@Kwargs{})
> @ DataFrames
~/.julia/packages/DataFrames/kcA9R/src/abstractdataframe/show.jl:253
> [14] _show
> @ ~/.julia/packages/DataFrames/kcA9R/src/abstractdataframe/show.jl:147
[inlined]
> [15] #show#871
> @ ~/.julia/packages/DataFrames/kcA9R/src/abstractdataframe/show.jl:352
[inlined]
> [16] show
> @ ~/.julia/packages/DataFrames/kcA9R/src/abstractdataframe/show.jl:339
[inlined]
> [17] show(io::Base.TTY, mime::MIME{Symbol("text/plain")}, df::DataFrame)
> @ DataFrames
~/.julia/packages/DataFrames/kcA9R/src/abstractdataframe/io.jl:150
> [18] display(d::TextDisplay, M::MIME{Symbol("text/plain")}, x::Any)
> @ Base.Multimedia ./multimedia.jl:254
> [19] display
> @ ./multimedia.jl:255 [inlined]
> [20] display(x::Any)
> @ Base.Multimedia ./multimedia.jl:340
> [21] |>(x::DataFrame, f::typeof(display))
> @ Base ./operators.jl:926
> [22] top-level scope
> @ none:1
> ```
>
> Also, sometimes data from the first column appears in the second column,
but only for dataframes with more than about 30 rows:
>
> ```
> python
>>> col=[7 for _ in range(40)]; import polars as pl; pl.DataFrame({'more':
['नमस्ते'*i for i in col],'text':['k'*i for i in col]}).write_ipc("long.arrow")
>>>
> julia --project -e "using DataFrames; import Arrow;
Arrow.Table(\"long.arrow\") |> DataFrame |> display"
> 40×2 DataFrame
> Row │ more text
> │ String? String?
> ─────┼────────────────────────────────────────────────────────
> 1 │ नमस्तेनमस्तेनमस्तेनमस्तेनमस्तेनमस्तेनमस्ते W1\0\0\xff\xff\xff
> 2 │ नमस्तेनमस्तेनमस्तेनमस्तेनमस्तेनमस्तेनमस्ते \xf2\xff\xff\xff\x14\0\0
> 3 │ नमस्तेनमस्तेनमस्तेनमस्तेनमस्तेनमस्तेनमस्ते \v\0\b\0\n\0\x04
> 4 │ नमस्तेनमस्तेनमस्तेनमस्तेनमस्तेनमस्तेनमस्ते \b\0\b\0\0\0\x04
> 5 │ नमस्तेनमस्तेनमस्तेनमस्तेनमस्तेनमस्तेनमस्ते \x04\0\0\0\xec\xff\xff
> 6 │ नमस्तेनमस्तेनमस्तेनमस्तेनमस्तेनमस्तेनमस्ते \x18\0\0\0\x01\x18\0
> 7 │ नमस्तेनमस्तेनमस्तेनमस्तेनमस्तेनमस्तेनमस्ते \x11\0\b\0\0\0\f
> 8 │ नमस्तेनमस्तेनमस्तेनमस्तेनमस्तेनमस्तेनमस्ते \x04\0\x04\0\x04\0\0
> 9 │ नमस्तेनमस्तेनमस्तेनमस्तेनमस्तेनमस्तेनमस्ते \xec\xff\xff\xff,\0\0
> 10 │ नमस्तेनमस्तेनमस्तेनमस्तेनमस्तेनमस्तेनमस्ते \x01\x18\0\0\x10\0\x12
> 11 │ नमस्तेनमस्तेनमस्तेनमस्तेनमस्तेनमस्तेनमस्ते \0\0\f\0\0\0\0
> 12 │ नमस्तेनमस्तेनमस्तेनमस्तेनमस्तेनमस्तेनमस्ते \x04\0\0\0mor # trying
to spell "more", name of 1st column?
> 13 │ नमस्तेनमस्तेनमस्तेनमस्तेनमस्तेनमस्तेनमस्ते \xe8\0\0\0\x04\0\0
> 14 │ नमस्तेनमस्तेनमस्तेनमस्तेनमस्तेनमस्तेनमस्ते \0\0\0\0\x14\0\0
> 15 │ नमस्तेनमस्तेनमस्तेनमस्तेनमस्तेनमस्तेनमस्ते \x10\0\x12\0\f\0\x04
> 16 │ नमस्तेनमस्तेनमस्तेनमस्तेनमस्तेनमस्तेनमस्ते \0\0\0\0\x90\0\0
> 17 │ नमस्तेनमस्तेनमस्तेनमस्तेनमस्तेनमस्तेनमस्ते \0\0\0\0\0\0\x0e
> 18 │ नमस्तेनमस्तेनमस्तेनमस्तेनमस्तेनमस्तेनमस्ते \0\0\x14\0\x02\0\0
> 19 │ नमस्तेनमस्तेनमस्तेनमस्तेनमस्तेनमस्तेनमस्ते \0\0\0\0\0\0\0
> 20 │ नमस्तेनमस्तेनमस्तेनमस्तेनमस्तेनमस्तेनमस्ते \0\0\0\0\0\0\0
> 21 │ नमस्तेनमस्तेनमस्तेनमस्तेनमस्तेनमस्तेनमस्ते \0\0\0\0\0\0\0
> 22 │ नमस्तेनमस्तेनमस्तेनमस्तेनमस्तेनमस्तेनमस्ते \x80\x02\0\0\0\0\0
> 23 │ नमस्तेनमस्तेनमस्तेनमस्तेनमस्तेनमस्तेनमस्ते @\x16\0\0\0\0\0
> 24 │ नमस्तेनमस्तेनमस्तेनमस्तेनमस्तेनमस्तेनमस्ते @\x16\0\0\0\0\0
> 25 │ नमस्तेनमस्तेनमस्तेनमस्तेनमस्तेनमस्तेनमस्ते \0\0\0\0\x02\0\0
> 26 │ नमस्तेनमस्तेनमस्तेनमस्तेनमस्तेनमस्तेनमस्ते \0\0\0\0\0\0\0
> 27 │ नमस्तेनमस्तेनमस्तेनमस्तेनमस्तेनमस्तेनमस्ते \0\0\0\0\0\0\0
> 28 │ नमस्तेनमस्तेनमस्तेनमस्तेनमस्तेनमस्तेनमस्ते न\xe0\0\0\0 # न
shouldn't be here
> 29 │ नमस्तेनमस्तेनमस्तेनमस्तेनमस्तेनमस्तेनमस्ते न\xe0\0\0\0
> 30 │ नमस्तेनमस्तेनमस्तेनमस्तेनमस्तेनमस्तेनमस्ते न\xe0\0\0\0
> 31 │ नमस्तेनमस्तेनमस्तेनमस्तेनमस्तेनमस्तेनमस्ते न\xe0\0\0\0
> 32 │ नमस्तेनमस्तेनमस्तेनमस्तेनमस्तेनमस्तेनमस्ते न\xe0\0\0\0
> 33 │ नमस्तेनमस्तेनमस्तेनमस्तेनमस्तेनमस्तेनमस्ते न\xe0\0\0\0
> 34 │ नमस्तेनमस्तेनमस्तेनमस्तेनमस्तेनमस्तेनमस्ते न\xe0\0\0\0
> 35 │ नमस्तेनमस्तेनमस्तेनमस्तेनमस्तेनमस्तेनमस्ते न\xe0\0\0\0
> 36 │ नमस्तेनमस्तेनमस्तेनमस्तेनमस्तेनमस्तेनमस्ते न\xe0\0\0\0
> 37 │ नमस्तेनमस्तेनमस्तेनमस्तेनमस्तेनमस्तेनमस्ते न\xe0\0\0\0
> 38 │ नमस्तेनमस्तेनमस्तेनमस्तेनमस्तेनमस्तेनमस्ते न\xe0\0\0\0
> 39 │ नमस्तेनमस्तेनमस्तेनमस्तेनमस्तेनमस्तेनमस्ते न\xe0\0\0\0
> 40 │ नमस्तेनमस्तेनमस्तेनमस्तेनमस्तेनमस्तेनमस्ते न\xe0\0\0\0
> ```
>
> Pyarrow reads all of these correctly.
_Originally posted by @ForceBru in
[#540](https://github.com/apache/arrow-julia/issues/540#issuecomment-2646682912)_
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]