[ 
https://issues.apache.org/jira/browse/ARROW-1601?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Rene Sugar updated ARROW-1601:
------------------------------
          Flags: Patch
     Attachment: 0001-ARROW-1601-increase-size-of-null_bitmap-allocation-i.patch
    Description: 
(int) byte_offset_valid_bits = 131072
(int) bit_offset_valid_bits = 0
(uint8_t) bitset_valid_bits = '?'
(int) i = 1048575

(lldb) print *num_values
(int64_t) $0 = 1048576

(lldb) print valid_bits[131072]
error: Couldn't apply expression side effects : Couldn't dematerialize a result 
variable: couldn't read its memory
(lldb) print valid_bits[131071]
(uint8_t) $6 = '?'

1048576 / 8 = 131072 (number of values in the array)

Last readable offset is 131071.

In the last iteration, READ_NEXT_BITSET reads one byte past the last byte of 
the valid_bits array.

When this allocation happens at the end of a block, a crash occurs.

These macros are used in loops in several places in Arrow and Parquet.

parquet-cpp/src/parquet/arrow/writer.cc:

Status GenerateLevels(const Array& array, const std::shared_ptr<Field>& field,
                        int64_t* values_offset, ::arrow::Type::type* 
values_type,
                        int64_t* num_values, int64_t* num_levels,
                        std::shared_ptr<Buffer>* def_levels,
                        std::shared_ptr<Buffer>* rep_levels,
                        std::shared_ptr<Array>* values_array) {
[....]
          const uint8_t* valid_bits = array.null_bitmap_data();
          INIT_BITSET(valid_bits, static_cast<int>(array.offset()));
          for (int i = 0; i < array.length(); i++) {
            if (bitset_valid_bits & (1 << bit_offset_valid_bits)) {
              def_levels_ptr[i] = 1;
            } else {
              def_levels_ptr[i] = 0;
            }
            READ_NEXT_BITSET(valid_bits);  <-- crashes here on last iteration
          }

arrow/util/bitutil.h:

#define INIT_BITSET(valid_bits_vector, valid_bits_index)        \
  int byte_offset_##valid_bits_vector = (valid_bits_index) / 8; \
  int bit_offset_##valid_bits_vector = (valid_bits_index) % 8;  \
  uint8_t bitset_##valid_bits_vector = 
valid_bits_vector[byte_offset_##valid_bits_vector];

#define READ_NEXT_BITSET(valid_bits_vector)                                     
     \
  bit_offset_##valid_bits_vector++;                                             
     \
  if (bit_offset_##valid_bits_vector == 8) {                                    
     \
    bit_offset_##valid_bits_vector = 0;                                         
     \
    byte_offset_##valid_bits_vector++;                                          
     \
    bitset_##valid_bits_vector = 
valid_bits_vector[byte_offset_##valid_bits_vector]; \
  }

A quick fix is to allocate one more byte for null_bitmap_ in ArrayBuilder::Init 
and ArrayBuilder::Resize in arrow/cpp/src/arrow/builder.cc. 

The capacity of null_bitmap() is changed by this fix.

The following tests FAILED:
          2 - array-test (Failed)
         14 - feather-test (Failed)

A more extensive fix would require changing how INIT_BITSET and 
READ_NEXT_BITSET operate where they are used in Parquet and Arrow.


To reproduce this problem:

1) Download the CSV file.

Source: https://catalog.data.gov/dataset?res_format=CSV

State Drug Utilization Data 2016
https://data.medicaid.gov/api/views/3v6v-qk5s/rows.csv?accessType=DOWNLOAD

2) Run FileConvert (see https://github.com/renesugar/FileConvert)

./bin/FileConvert -i ./State_Drug_Utilization_Data_2016.csv -o 
./State_Drug_Utilization_Data_2016.parquet 


  was:

(int) byte_offset_valid_bits = 131072
(int) bit_offset_valid_bits = 0
(uint8_t) bitset_valid_bits = '?'
(int) i = 1048575

(lldb) print *num_values
(int64_t) $0 = 1048576

(lldb) print valid_bits[131072]
error: Couldn't apply expression side effects : Couldn't dematerialize a result 
variable: couldn't read its memory
(lldb) print valid_bits[131071]
(uint8_t) $6 = '?'

1048576 / 8 = 131072 (number of values in the array)

Last readable offset is 131071.

In the last iteration, READ_NEXT_BITSET reads one byte past the last byte of 
the valid_bits array.

When this allocation happens at the end of a block, a crash occurs.

These macros are used in loops in several places in Arrow and Parquet.

parquet-cpp/src/parquet/arrow/writer.cc:

Status GenerateLevels(const Array& array, const std::shared_ptr<Field>& field,
                        int64_t* values_offset, ::arrow::Type::type* 
values_type,
                        int64_t* num_values, int64_t* num_levels,
                        std::shared_ptr<Buffer>* def_levels,
                        std::shared_ptr<Buffer>* rep_levels,
                        std::shared_ptr<Array>* values_array) {
[....]
          const uint8_t* valid_bits = array.null_bitmap_data();
          INIT_BITSET(valid_bits, static_cast<int>(array.offset()));
          for (int i = 0; i < array.length(); i++) {
            if (bitset_valid_bits & (1 << bit_offset_valid_bits)) {
              def_levels_ptr[i] = 1;
            } else {
              def_levels_ptr[i] = 0;
            }
            READ_NEXT_BITSET(valid_bits);  <-- crashes here on last iteration
          }

arrow/util/bitutil.h:

#define INIT_BITSET(valid_bits_vector, valid_bits_index)        \
  int byte_offset_##valid_bits_vector = (valid_bits_index) / 8; \
  int bit_offset_##valid_bits_vector = (valid_bits_index) % 8;  \
  uint8_t bitset_##valid_bits_vector = 
valid_bits_vector[byte_offset_##valid_bits_vector];

#define READ_NEXT_BITSET(valid_bits_vector)                                     
     \
  bit_offset_##valid_bits_vector++;                                             
     \
  if (bit_offset_##valid_bits_vector == 8) {                                    
     \
    bit_offset_##valid_bits_vector = 0;                                         
     \
    byte_offset_##valid_bits_vector++;                                          
     \
    bitset_##valid_bits_vector = 
valid_bits_vector[byte_offset_##valid_bits_vector]; \
  }

A quick fix is to allocate one more byte for null_bitmap_ in ArrayBuilder::Init 
and ArrayBuilder::Resize in arrow/cpp/src/arrow/builder.cc. 

The capacity of null_bitmap() is changed by this fix.

The following tests FAILED:
          2 - array-test (Failed)
         14 - feather-test (Failed)

A more extensive fix would require changing how INIT_BITSET and 
READ_NEXT_BITSET operate where they are used in Parquet and Arrow.


To reproduce this problem:

1) Download the CSV file.

Source: https://catalog.data.gov/dataset?res_format=CSV

State Drug Utilization Data 2016
https://data.medicaid.gov/api/views/3v6v-qk5s/rows.csv?accessType=DOWNLOAD

2) Run FileConvert (see https://github.com/renesugar/FileConvert)

./bin/FileConvert -i ./State_Drug_Utilization_Data_2016.csv -o 
./State_Drug_Utilization_Data_2016.parquet 



> READ_NEXT_BITSET reads one byte past the last byte on last iteration
> --------------------------------------------------------------------
>
>                 Key: ARROW-1601
>                 URL: https://issues.apache.org/jira/browse/ARROW-1601
>             Project: Apache Arrow
>          Issue Type: Bug
>          Components: C++
>    Affects Versions: 0.7.0
>         Environment: Mac OS X 10.11.6
>            Reporter: Rene Sugar
>              Labels: crash
>         Attachments: 
> 0001-ARROW-1601-increase-size-of-null_bitmap-allocation-i.patch
>
>
> (int) byte_offset_valid_bits = 131072
> (int) bit_offset_valid_bits = 0
> (uint8_t) bitset_valid_bits = '?'
> (int) i = 1048575
> (lldb) print *num_values
> (int64_t) $0 = 1048576
> (lldb) print valid_bits[131072]
> error: Couldn't apply expression side effects : Couldn't dematerialize a 
> result variable: couldn't read its memory
> (lldb) print valid_bits[131071]
> (uint8_t) $6 = '?'
> 1048576 / 8 = 131072 (number of values in the array)
> Last readable offset is 131071.
> In the last iteration, READ_NEXT_BITSET reads one byte past the last byte of 
> the valid_bits array.
> When this allocation happens at the end of a block, a crash occurs.
> These macros are used in loops in several places in Arrow and Parquet.
> parquet-cpp/src/parquet/arrow/writer.cc:
> Status GenerateLevels(const Array& array, const std::shared_ptr<Field>& field,
>                         int64_t* values_offset, ::arrow::Type::type* 
> values_type,
>                         int64_t* num_values, int64_t* num_levels,
>                         std::shared_ptr<Buffer>* def_levels,
>                         std::shared_ptr<Buffer>* rep_levels,
>                         std::shared_ptr<Array>* values_array) {
> [....]
>           const uint8_t* valid_bits = array.null_bitmap_data();
>           INIT_BITSET(valid_bits, static_cast<int>(array.offset()));
>           for (int i = 0; i < array.length(); i++) {
>             if (bitset_valid_bits & (1 << bit_offset_valid_bits)) {
>               def_levels_ptr[i] = 1;
>             } else {
>               def_levels_ptr[i] = 0;
>             }
>             READ_NEXT_BITSET(valid_bits);  <-- crashes here on last iteration
>           }
> arrow/util/bitutil.h:
> #define INIT_BITSET(valid_bits_vector, valid_bits_index)        \
>   int byte_offset_##valid_bits_vector = (valid_bits_index) / 8; \
>   int bit_offset_##valid_bits_vector = (valid_bits_index) % 8;  \
>   uint8_t bitset_##valid_bits_vector = 
> valid_bits_vector[byte_offset_##valid_bits_vector];
> #define READ_NEXT_BITSET(valid_bits_vector)                                   
>        \
>   bit_offset_##valid_bits_vector++;                                           
>        \
>   if (bit_offset_##valid_bits_vector == 8) {                                  
>        \
>     bit_offset_##valid_bits_vector = 0;                                       
>        \
>     byte_offset_##valid_bits_vector++;                                        
>        \
>     bitset_##valid_bits_vector = 
> valid_bits_vector[byte_offset_##valid_bits_vector]; \
>   }
> A quick fix is to allocate one more byte for null_bitmap_ in 
> ArrayBuilder::Init and ArrayBuilder::Resize in 
> arrow/cpp/src/arrow/builder.cc. 
> The capacity of null_bitmap() is changed by this fix.
> The following tests FAILED:
>         2 - array-test (Failed)
>        14 - feather-test (Failed)
> A more extensive fix would require changing how INIT_BITSET and 
> READ_NEXT_BITSET operate where they are used in Parquet and Arrow.
> To reproduce this problem:
> 1) Download the CSV file.
> Source: https://catalog.data.gov/dataset?res_format=CSV
> State Drug Utilization Data 2016
> https://data.medicaid.gov/api/views/3v6v-qk5s/rows.csv?accessType=DOWNLOAD
> 2) Run FileConvert (see https://github.com/renesugar/FileConvert)
> ./bin/FileConvert -i ./State_Drug_Utilization_Data_2016.csv -o 
> ./State_Drug_Utilization_Data_2016.parquet 



--
This message was sent by Atlassian JIRA
(v6.4.14#64029)

Reply via email to