[
https://issues.apache.org/jira/browse/ARROW-10377?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
Tanguy Fautre updated ARROW-10377:
----------------------------------
Description:
I'm updating ParquetSharp to build against Arrow 2.0.0 (currently using Arrow
1.0.1). One of our unit tests is now throwing a {{nullptr}} access violation.
I have narrowed it down to writing arrays of non-nullable values (in this case
the column contains {{int[]}}). If the values are nullable, the test passes.
The parquet file schema is as follows:
* {{GroupNode("schema", LogicalType.None, Repetition.Required)}}
** {{GroupNode("array_of_ints_column", LogicalType.List, Repetition.Optional)}}
*** {{GroupNode("list", LogicalType.None, Repetition.Repeated)}}
**** {{PrimitiveNode("item", LogicalType.Int(32, signed),
Repetition.Required)}}
The test crashes when calling {{TypedColumnWriter::WriteBatchSpaced}} with the
following arguments:
* {{num_values = 1}}
* {{def_levels = {0}}}
* {{rep_levels = {0}}}
* {{valid_bits = {0}}}
* {{valid_bit_offset = 0}}
* {{values = {}}}
This call is effectively trying to write a null array, and therefore (to my
understanding) does not need to pass any values and sets {{values = nullptr}}.
I believe the problem lies with
{code:c++}
void MaybeCalculateValidityBits(
const int16_t* def_levels,
int64_t batch_size,
int64_t* out_values_to_write,
int64_t* out_spaced_values_to_write,
int64_t* null_count) {
if (bits_buffer_ == nullptr) {
if (!level_info_.HasNullableValues()) {
*out_values_to_write = batch_size;
*out_spaced_values_to_write = batch_size;
*null_count = 0;
} else {
for (int x = 0; x < batch_size; x++) {
*out_values_to_write += def_levels[x] == level_info_.def_level ? 1 :
0;
*out_spaced_values_to_write +=
def_levels[x] >= level_info_.repeated_ancestor_def_level ? 1 : 0;
}
*null_count = *out_values_to_write - *out_spaced_values_to_write;
}
return;
}
// ...
}
{code}
In particular, {{level_info_.HasNullableValues()}} returns {{false}} given that
the arrays cannot contain null-values. My understanding is that this is wrong,
since the arrays themselves are nullable.
was:
I'm updating ParquetSharp to build against Arrow 2.0.0 (currently using Arrow
1.0.1). One of our unit tests is now throwing a {{nullptr}} access violation.
I have narrowed it down to writing arrays of non-nullable values (in this case
the column contains {{int[]}}). If the values are nullable, the test passes.
The test crashes when calling {{TypedColumnWriter::WriteBatchSpaced}} with the
following arguments:
* {{num_values = 1}}
* {{def_levels = {0}}}
* {{rep_levels = {0}}}
* {{valid_bits = {0}}}
* {{valid_bit_offset = 0}}
* {{values = {}}}
This call is effectively trying to write a null array, and therefore (to my
understanding) does not need to pass any values and sets {{values = nullptr}}.
I believe the problem lies with
{code:c++}
void MaybeCalculateValidityBits(
const int16_t* def_levels,
int64_t batch_size,
int64_t* out_values_to_write,
int64_t* out_spaced_values_to_write,
int64_t* null_count) {
if (bits_buffer_ == nullptr) {
if (!level_info_.HasNullableValues()) {
*out_values_to_write = batch_size;
*out_spaced_values_to_write = batch_size;
*null_count = 0;
} else {
for (int x = 0; x < batch_size; x++) {
*out_values_to_write += def_levels[x] == level_info_.def_level ? 1 :
0;
*out_spaced_values_to_write +=
def_levels[x] >= level_info_.repeated_ancestor_def_level ? 1 : 0;
}
*null_count = *out_values_to_write - *out_spaced_values_to_write;
}
return;
}
// ...
}
{code}
In particular, {{level_info_.HasNullableValues()}} returns {{false}} given that
the arrays cannot contain null-values. My understanding is that this is wrong,
since the arrays themselves are nullable.
> [C++][Parquet] nullptr access violation when writing arrays of non-nullable
> values
> ----------------------------------------------------------------------------------
>
> Key: ARROW-10377
> URL: https://issues.apache.org/jira/browse/ARROW-10377
> Project: Apache Arrow
> Issue Type: Bug
> Affects Versions: 2.0.0
> Reporter: Tanguy Fautre
> Priority: Major
>
> I'm updating ParquetSharp to build against Arrow 2.0.0 (currently using Arrow
> 1.0.1). One of our unit tests is now throwing a {{nullptr}} access violation.
> I have narrowed it down to writing arrays of non-nullable values (in this
> case the column contains {{int[]}}). If the values are nullable, the test
> passes.
> The parquet file schema is as follows:
> * {{GroupNode("schema", LogicalType.None, Repetition.Required)}}
> ** {{GroupNode("array_of_ints_column", LogicalType.List,
> Repetition.Optional)}}
> *** {{GroupNode("list", LogicalType.None, Repetition.Repeated)}}
> **** {{PrimitiveNode("item", LogicalType.Int(32, signed),
> Repetition.Required)}}
> The test crashes when calling {{TypedColumnWriter::WriteBatchSpaced}} with
> the following arguments:
> * {{num_values = 1}}
> * {{def_levels = {0}}}
> * {{rep_levels = {0}}}
> * {{valid_bits = {0}}}
> * {{valid_bit_offset = 0}}
> * {{values = {}}}
> This call is effectively trying to write a null array, and therefore (to my
> understanding) does not need to pass any values and sets {{values = nullptr}}.
> I believe the problem lies with
> {code:c++}
> void MaybeCalculateValidityBits(
> const int16_t* def_levels,
> int64_t batch_size,
> int64_t* out_values_to_write,
> int64_t* out_spaced_values_to_write,
> int64_t* null_count) {
> if (bits_buffer_ == nullptr) {
> if (!level_info_.HasNullableValues()) {
> *out_values_to_write = batch_size;
> *out_spaced_values_to_write = batch_size;
> *null_count = 0;
> } else {
> for (int x = 0; x < batch_size; x++) {
> *out_values_to_write += def_levels[x] == level_info_.def_level ? 1
> : 0;
> *out_spaced_values_to_write +=
> def_levels[x] >= level_info_.repeated_ancestor_def_level ? 1 :
> 0;
> }
> *null_count = *out_values_to_write - *out_spaced_values_to_write;
> }
> return;
> }
> // ...
> }
> {code}
> In particular, {{level_info_.HasNullableValues()}} returns {{false}} given
> that the arrays cannot contain null-values. My understanding is that this is
> wrong, since the arrays themselves are nullable.
--
This message was sent by Atlassian Jira
(v8.3.4#803005)