[ https://issues.apache.org/jira/browse/ARROW-10377?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

Tanguy Fautre updated ARROW-10377:
----------------------------------
    Description: 
I'm updating ParquetSharp to build against Arrow 2.0.0 (currently using Arrow 1.0.1). One of our unit tests now fails with a {{nullptr}} access violation.

I have narrowed it down to writing arrays of non-nullable values (in this case the column contains {{int[]}}). If the values are nullable, the test passes.

The Parquet file schema is as follows (a hedged C++ reconstruction is sketched after the list):
* {{GroupNode("schema", LogicalType.None, Repetition.Required)}}
** {{GroupNode("array_of_ints_column", LogicalType.List, Repetition.Optional)}}
*** {{GroupNode("list", LogicalType.None, Repetition.Repeated)}}
**** {{PrimitiveNode("item", LogicalType.Int(32, signed), Repetition.Required)}}
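
For reference, this is roughly how that schema can be built with the parquet-cpp schema API. This is a sketch, not the actual ParquetSharp code; the helper name {{MakeTestSchema}} is hypothetical:
{code:c++}
#include <memory>

#include <parquet/schema.h>

// Sketch: reconstructs the schema above with the parquet-cpp API.
std::shared_ptr<parquet::schema::GroupNode> MakeTestSchema() {
  using parquet::LogicalType;
  using parquet::Repetition;
  using parquet::Type;
  using parquet::schema::GroupNode;
  using parquet::schema::PrimitiveNode;

  // item: required int32 with an Int(32, signed) logical type.
  auto item = PrimitiveNode::Make("item", Repetition::REQUIRED,
                                  LogicalType::Int(32, /*is_signed=*/true),
                                  Type::INT32);
  // list: the repeated middle level of the three-level list encoding.
  auto list = GroupNode::Make("list", Repetition::REPEATED, {item});
  // array_of_ints_column: optional, i.e. the list itself may be null.
  auto column = GroupNode::Make("array_of_ints_column", Repetition::OPTIONAL,
                                {list}, LogicalType::List());
  return std::static_pointer_cast<parquet::schema::GroupNode>(
      GroupNode::Make("schema", Repetition::REQUIRED, {column}));
}
{code}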

The test crashes when calling {{TypedColumnWriter::WriteBatchSpaced}} with the following arguments (a minimal repro sketch follows the list):
* {{num_values = 1}}
* {{def_levels = {0}}}
* {{rep_levels = {0}}}
* {{valid_bits = {0}}}
* {{valid_bit_offset = 0}}
* {{values = {}}} (i.e. {{nullptr}})
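
In C++ the failing call looks roughly like the sketch below. It assumes the {{Int32}} writer specialization; the helper name {{WriteNullList}} is hypothetical:
{code:c++}
#include <cstdint>

#include <parquet/column_writer.h>

// Hypothetical repro: write a single entry whose list value is null.
void WriteNullList(parquet::Int32Writer* writer) {
  const int16_t def_levels[] = {0};  // def level 0: the list itself is null
  const int16_t rep_levels[] = {0};  // 0: start of a new record
  const uint8_t valid_bits[] = {0};  // no slot carries a value
  writer->WriteBatchSpaced(/*num_values=*/1, def_levels, rep_levels, valid_bits,
                           /*valid_bits_offset=*/0,
                           /*values=*/nullptr);  // crashes in Arrow 2.0.0
}
{code}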

This call is effectively writing a null array and therefore (to my understanding) does not need to pass any values. Yet further down the call stack, the implementation tries to read one value out of {{values}} (which is {{nullptr}}).

I believe the problem lies in {{MaybeCalculateValidityBits}}:
{code:c++}
  void MaybeCalculateValidityBits(
    const int16_t* def_levels,
    int64_t batch_size,
    int64_t* out_values_to_write,
    int64_t* out_spaced_values_to_write,
    int64_t* null_count) {
    if (bits_buffer_ == nullptr) {
      if (!level_info_.HasNullableValues()) {
        *out_values_to_write = batch_size;
        *out_spaced_values_to_write = batch_size;
        *null_count = 0;
      } else {
        for (int x = 0; x < batch_size; x++) {
          *out_values_to_write += def_levels[x] == level_info_.def_level ? 1 : 0;
          *out_spaced_values_to_write +=
              def_levels[x] >= level_info_.repeated_ancestor_def_level ? 1 : 0;
        }
        *null_count = *out_values_to_write - *out_spaced_values_to_write;
      }
      return;
    }

    // ...
  }
{code}

In particular, {{level_info_.HasNullableValues()}} returns {{false}} given that the arrays cannot contain null values. My understanding is that this is wrong, since the arrays themselves are nullable. A hand-worked trace of both branches is sketched below.
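
To illustrate, here is a trace of {{MaybeCalculateValidityBits}} for the schema and arguments above. The level values are derived by hand from the schema, so treat them as an assumption rather than debugger output:
{code:c++}
// Assumed levels for the "item" column (hand-derived from the schema):
//   level_info_.def_level                   == 2  (optional column +1, repeated list +1)
//   level_info_.repeated_ancestor_def_level == 2
//
// With batch_size = 1 and def_levels = {0} (the list itself is null):
//
// The else branch would compute:
//   *out_values_to_write        += (0 == 2) ? 1 : 0;  // 0 values
//   *out_spaced_values_to_write += (0 >= 2) ? 1 : 0;  // 0 spaced values
//   => nothing is read from values, so values == nullptr would be harmless.
//
// But HasNullableValues() returns false, so the fast path runs instead:
//   *out_values_to_write = batch_size;  // 1
//   => the writer later reads values[0], dereferencing nullptr.
{code}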


> [C++][Parquet] nullptr access violation when writing arrays of non-nullable values
> ----------------------------------------------------------------------------------
>
>                 Key: ARROW-10377
>                 URL: https://issues.apache.org/jira/browse/ARROW-10377
>             Project: Apache Arrow
>          Issue Type: Bug
>    Affects Versions: 2.0.0
>            Reporter: Tanguy Fautre
>            Priority: Major
>


