westonpace commented on issue #10803:
URL: https://github.com/apache/arrow/issues/10803#issuecomment-887174376


   Ah, I see the issue now.  Reading the data as a dictionary is indeed 
possible.  You will need to inform the reader that you wish to read the column 
as a dictionary.  In `pyarrow` this is exposed as the `read_dictionary` 
parameter of `read_table`, described 
[here](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.read_table.html).
   
   A short (well, for C++) C++ example is here:
   
   ```
     arrow::fs::LocalFileSystem file_system;
     ARROW_ASSIGN_OR_RAISE(auto input, 
file_system.OpenInputFile("/tmp/foo.parquet"));
   
     parquet::ArrowReaderProperties arrow_reader_properties =
         parquet::default_arrow_reader_properties();
   
     // Here we configure the reader to read the first column as a dictionary
     arrow_reader_properties.set_read_dictionary(0, true);
   
     parquet::ReaderProperties reader_properties =
         parquet::default_reader_properties();
   
     // Open Parquet file reader
     std::unique_ptr<parquet::arrow::FileReader> arrow_reader;
     auto reader_builder = parquet::arrow::FileReaderBuilder();
     reader_builder.properties(arrow_reader_properties);
     ARROW_RETURN_NOT_OK(reader_builder.Open(std::move(input), 
reader_properties));
     ARROW_RETURN_NOT_OK(reader_builder.Build(&arrow_reader));
   
     std::shared_ptr<arrow::Table> table;
     ARROW_RETURN_NOT_OK(arrow_reader->ReadTable(&table));
   
     std::shared_ptr<arrow::Array> arr = table->column(0)->chunk(0);
     std::shared_ptr<arrow::DictionaryArray> dict_arr = 
std::dynamic_pointer_cast<arrow::DictionaryArray>(arr);
     std::shared_ptr<arrow::Int32Array> dict_indices_arr = 
std::dynamic_pointer_cast<arrow::Int32Array>(dict_arr->indices());
     std::shared_ptr<arrow::StringArray> dict_values_arr = 
std::dynamic_pointer_cast<arrow::StringArray>(dict_arr->dictionary());
   
     const int32_t* dict_indices = dict_indices_arr->raw_values();
     const int32_t* string_offsets = dict_values_arr->raw_value_offsets();
     const char* string_values = (char*)(dict_values_arr->raw_data());
   
     std::cout << "There are " << arr->length() << " items in the array" << 
std::endl;
     std::cout << "There are " << dict_values_arr->length() << " values in the 
dictionary" << std::endl;
     std::cout << "Offsets length: " << 
dict_values_arr->value_offsets()->size() << std::endl;
   
     for (int i = 0; i < arr->length(); i++) {
       std::cout << "Item: " << i << std::endl;
       std::cout << "  Dictionary Index: " << dict_indices[i] << std::endl;
       std::cout << "  Values Start: " << string_offsets[dict_indices[i]] << 
std::endl;
       std::cout << "  Values End: " << string_offsets[dict_indices[i]+1] << 
std::endl;
       std::cout << "  First Char: " << 
string_values[string_offsets[dict_indices[i]]] << std::endl;
     }
   ```
   
   What you will get back in Arrow is an `arrow::DictionaryArray`, which is 
represented by one parent array, two child arrays, and up to 5 buffers in 
total.
   
   `dict_arr->indices()` is an `Int32Array` of the indices into the dictionary 
(with two buffers, one for values and one optional validity map).
   `dict_arr->dictionary()` is a `StringArray` of the dictionary values (with 
three buffers, one for values, one for offsets, and one optional validity map).


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to