[ 
https://issues.apache.org/jira/browse/PARQUET-2095?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Micah Kornfield resolved PARQUET-2095.
--------------------------------------
    Resolution: Not A Problem

> [C++] Read Parquet file with MapArray
> -------------------------------------
>
>                 Key: PARQUET-2095
>                 URL: https://issues.apache.org/jira/browse/PARQUET-2095
>             Project: Parquet
>          Issue Type: New Feature
>          Components: parquet-cpp
>    Affects Versions: cpp-4.0.0
>         Environment: arrow-apache-arrow-3.0.0
> C++ library
> Linux operating system
>            Reporter: jiang,longshan
>            Priority: Blocker
>              Labels: beginner, newbie
>             Fix For: cpp-6.0.0
>
>         Attachments: image-2021-09-26-20-36-27-621.png
>
>
> The Parquet format reduces storage space effectively, and we use it 
> with HDFS + Hive JNI (calling C++) + Spark JNI (calling C++), where it works 
> well. Now we are starting a new project that uses only C++, with higher 
> performance expectations, but we have hit a blocking issue: how to read a 
> Parquet file containing MapArray columns, such as 
> list<array_element: map<string, list<array_element: int64>>>
> list<array_element: map<string, string>>
> map<string, list<array_element: int64>>
>  
> So far I only know how to read columns that contain no map type, such as 
> list<array_element: string>, list<array_element: list<array_element: string>>
> Here is the code example; please give me some advice on how to read a Parquet 
> file with map types. Thanks a lot!
>  
> {code:java}
> // code placeholder
> #include "gflags/gflags.h"
> #include "arrow/api.h"
> #include "arrow/array/builder_base.h"
> #include "arrow/filesystem/hdfs.h"
> #include "arrow/io/api.h"
> #include "parquet/arrow/reader.h"
> #include "parquet/column_reader.h"
> #include "parquet/exception.h"
> #include "parquet/arrow/reader.h"
> int main(int argc, char** argv) {
>     gflags::ParseCommandLineFlags(&argc, &argv, true);
>     arrow::Status st;
>     arrow::MemoryPool* pool = ::arrow::default_memory_pool();
>     std::shared_ptr<arrow::io::RandomAccessFile> input = nullptr;
>     std::shared_ptr<::arrow::io::RandomAccessFile> _infile;
>     PARQUET_ASSIGN_OR_THROW(
>                     _infile,
>                     ::arrow::io::ReadableFile::Open(FLAGS_input_file,
>                             ::arrow::default_memory_pool()));
>     // Open Parquet file reader
>     std::unique_ptr<parquet::arrow::FileReader> arrow_reader;
>     st = parquet::arrow::OpenFile(_infile, pool, &arrow_reader);
>     if (!st.ok()) {
>             LOG(ERROR) << "open file failed " << FLAGS_input_file;
>             return 0;
>     }
>     // Read entire file as a single Arrow table
>     std::shared_ptr<arrow::Table> table;
>     st = arrow_reader->ReadTable(&table);
>     if (!st.ok()) {
>             LOG(INFO) << "read file to table successfully " << 
> FLAGS_input_file;
>     }   
>      
>     size_t num_cols = table->num_columns();
>     for (size_t idx = 0; idx < num_cols; idx++) {
>       auto this_field = table->field(idx);
>       auto this_column = table->column(idx);
>     if (this_field->name() == "lls_column") { // works well type: 
> list<array_element: list<array_element: string>>
>         for (size_t c_idx = 0; c_idx < this_column->num_chunks(); c_idx++) {
>           auto row_array =
>             
> std::static_pointer_cast<arrow::ListArray>(this_column->chunk(c_idx));
>           auto sample_array =
>             std::static_pointer_cast<arrow::ListArray>(row_array->values());
>           auto id_array =
>             
> std::static_pointer_cast<arrow::StringArray>(sample_array->values());
>           for (int64_t i = 0; i < table->num_rows(); i++) {
>             auto offset = row_array->value_offset(i);
>             auto count = row_array->value_length(i);
>             for (auto x = 0; x < count; x++) {
>               std::vector<std::string> result;
>               auto sample_offset = sample_array->value_offset(offset+x);
>               auto id_count = sample_array->value_length(offset+x);
>               for (auto id = 0; id < id_count; id++) {
>                 int32_t len;
>                 const uint8_t* addr = id_array->GetValue(sample_offset + id, 
> &len);
>                 result.push_back(std::string(reinterpret_cast<const 
> char*>(addr), (int16_t)len));
>               }
>               LOG(INFO) << "LLS " << count << " " << this_field->name() << " 
> " << to_string(result); // works well
>             }
>           }
>         }
>       }
>       else if (this_field->name() == "ms2li_column") { // MS2LI type: 
> map<string, list<array_element: int64>> 
>         LOG(INFO)  << "col name: " << this_field->name() << " type: " << 
> this_field->type()->ToString();
>         LOG(INFO)  << "length: " << this_column->length() << " chunk num: " 
> << this_column->num_chunks();
>         for (size_t c_idx = 0; c_idx < this_column->num_chunks(); c_idx++) {
>           auto row_array =
>             
> std::static_pointer_cast<arrow::MapArray>(this_column->chunk(c_idx));
>           auto keys_array =
>             std::static_pointer_cast<arrow::StringArray>(row_array->keys());
>           auto item_array =
>             std::static_pointer_cast<arrow::ListArray>(row_array->items());
>           auto item_value_array =
>             std::static_pointer_cast<arrow::ListArray>(item_array->values());
>           auto id_array =
>             
> std::static_pointer_cast<arrow::Int64Array>(item_value_array->values());
>         // I've no idea how to traverse the map<string, list<array_element: 
> int64>> to get key and value correctly, 
>        }
>       }
>     }
> {code}
> It seems that arrow::MapArray::keys() and items() lose each map pair's 
> offset, so I cannot find the right pair in the list<array_element: map<string, 
> string>> format. I really need and would appreciate your help.



--
This message was sent by Atlassian Jira
(v8.3.4#803005)

Reply via email to