[
https://issues.apache.org/jira/browse/PARQUET-2095?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
Micah Kornfield resolved PARQUET-2095.
--------------------------------------
Resolution: Not A Problem
> [C++] Read Parquet file with MapArray
> -------------------------------------
>
> Key: PARQUET-2095
> URL: https://issues.apache.org/jira/browse/PARQUET-2095
> Project: Parquet
> Issue Type: New Feature
> Components: parquet-cpp
> Affects Versions: cpp-4.0.0
> Environment: arrow-apache-arrow-3.0.0
> C++ library
> Linux operating system
> Reporter: jiang,longshan
> Priority: Blocker
> Labels: beginner, newbie
> Fix For: cpp-6.0.0
>
> Attachments: image-2021-09-26-20-36-27-621.png
>
>
> The Parquet format reduces storage space effectively, and we use it with
> HDFS + Hive JNI (calling C++) + Spark JNI (calling C++), where it works well.
> We are now starting a new project that uses only C++, with higher performance
> expectations, but we have hit a blocking issue: how to read a Parquet file
> containing MapArray columns such as
> list<array_element: map<string, list<array_element: int64>>>
> list<array_element: map<string, string>>
> map<string, list<array_element: int64>>
>
> So far I only know how to read columns that do not contain a map, such as
> list<array_element: string>, list<array_element: list<array_element: string>>
> Here is the code example. Please give me some advice on how to read a Parquet
> file with map types. Thanks a lot!
>
> {code:java}
> // code placeholder
> #include "gflags/gflags.h"
> #include "arrow/api.h"
> #include "arrow/array/builder_base.h"
> #include "arrow/filesystem/hdfs.h"
> #include "arrow/io/api.h"
> #include "parquet/arrow/reader.h"
> #include "parquet/column_reader.h"
> #include "parquet/exception.h"
> #include "parquet/arrow/reader.h"
> int main(int argc, char** argv) {
> gflags::ParseCommandLineFlags(&argc, &argv, true);
> arrow::Status st;
> arrow::MemoryPool* pool = ::arrow::default_memory_pool();
> std::shared_ptr<arrow::io::RandomAccessFile> input = nullptr;
> std::shared_ptr<::arrow::io::RandomAccessFile> _infile;
> PARQUET_ASSIGN_OR_THROW(
> _infile,
> ::arrow::io::ReadableFile::Open(FLAGS_input_file,
> ::arrow::default_memory_pool()));
> // Open Parquet file reader
> std::unique_ptr<parquet::arrow::FileReader> arrow_reader;
> st = parquet::arrow::OpenFile(_infile, pool, &arrow_reader);
> if (!st.ok()) {
> LOG(ERROR) << "open file failed " << FLAGS_input_file;
> return 0;
> }
> // Read entire file as a single Arrow table
> std::shared_ptr<arrow::Table> table;
> st = arrow_reader->ReadTable(&table);
> if (!st.ok()) {
> LOG(INFO) << "read file to table successfully " <<
> FLAGS_input_file;
> }
>
> size_t num_cols = table->num_columns();
> for (size_t idx = 0; idx < num_cols; idx++) {
> auto this_field = table->field(idx);
> auto this_column = table->column(idx);
> if (this_field->name() == "lls_column") { // works well type:
> list<array_element: list<array_element: string>>
> for (size_t c_idx = 0; c_idx < this_column->num_chunks(); c_idx++) {
> auto row_array =
>
> std::static_pointer_cast<arrow::ListArray>(this_column->chunk(c_idx));
> auto sample_array =
> std::static_pointer_cast<arrow::ListArray>(row_array->values());
> auto id_array =
>
> std::static_pointer_cast<arrow::StringArray>(sample_array->values());
> for (int64_t i = 0; i < table->num_rows(); i++) {
> auto offset = row_array->value_offset(i);
> auto count = row_array->value_length(i);
> for (auto x = 0; x < count; x++) {
> std::vector<std::string> result;
> auto sample_offset = sample_array->value_offset(offset+x);
> auto id_count = sample_array->value_length(offset+x);
> for (auto id = 0; id < id_count; id++) {
> int32_t len;
> const uint8_t* addr = id_array->GetValue(sample_offset + id,
> &len);
> result.push_back(std::string(reinterpret_cast<const
> char*>(addr), (int16_t)len));
> }
> LOG(INFO) << "LLS " << count << " " << this_field->name() << "
> " << to_string(result); // works well
> }
> }
> }
> }
> else if (this_field->name() == "ms2li_column") { // MS2LI type:
> map<string, list<array_element: int64>>
> LOG(INFO) << "col name: " << this_field->name() << " type: " <<
> this_field->type()->ToString();
> LOG(INFO) << "length: " << this_column->length() << " chunk num: "
> << this_column->num_chunks();
> for (size_t c_idx = 0; c_idx < this_column->num_chunks(); c_idx++) {
> auto row_array =
>
> std::static_pointer_cast<arrow::MapArray>(this_column->chunk(c_idx));
> auto keys_array =
> std::static_pointer_cast<arrow::StringArray>(row_array->keys());
> auto item_array =
> std::static_pointer_cast<arrow::ListArray>(row_array->items());
> auto item_value_array =
> std::static_pointer_cast<arrow::ListArray>(item_array->values());
> auto id_array =
>
> std::static_pointer_cast<arrow::Int64Array>(item_value_array->values());
> // I've no idea how to traverse the map<string, list<array_element:
> int64>> to get key and value correctly,
> }
> }
> }
> {code}
> It seems that arrow::MapArray::keys() and items() lose each map pair's
> offset, so I cannot find the right pair in the list<array_element: map<string,
> string>> format. I would really appreciate your help.
--
This message was sent by Atlassian Jira
(v8.3.4#803005)