[GitHub] [arrow] emkornfield commented on a change in pull request #10537: PARQUET-2056: [C++] Add ability for retrieving dictionary and indices separately for ColumnReader

GitBox Tue, 15 Jun 2021 20:15:01 -0700


emkornfield commented on a change in pull request #10537:
URL: https://github.com/apache/arrow/pull/10537#discussion_r652318792




##########
File path: cpp/src/parquet/column_reader_test.cc
##########
@@ -386,5 +387,77 @@ TEST_F(TestPrimitiveReader, TestDictionaryEncodedPages) {
   pages_.clear();
 }
 
+TEST_F(TestPrimitiveReader, TestDictionaryEncodedPagesWithExposeEncoding) {
+  max_def_level_ = 0;
+  max_rep_level_ = 0;
+  int levels_per_page = 100;
+  int num_pages = 5;
+  const ByteArray* dict = nullptr;
+  int32_t dict_len = 0;
+  int64_t total_indices = 0;
+  int64_t indices_read = 0;
+  std::vector<int16_t> def_levels;
+  std::vector<int16_t> rep_levels;
+  std::vector<ByteArray> values;
+  std::vector<uint8_t> buffer;
+  ByteArrayReader* reader = nullptr;
+  NodePtr type = schema::ByteArray("a", Repetition::REQUIRED);
+  const ColumnDescriptor descr(type, max_def_level_, max_rep_level_);
+
+  // Fully dictionary encoded
+  MakePages<ByteArrayType>(&descr, num_pages, levels_per_page, def_levels, rep_levels,
+                           values, buffer, pages_, Encoding::RLE_DICTIONARY);
+  int64_t value_size = values.size();
+  auto indices = ::arrow::internal::make_unique<int32_t[]>(value_size);
+
+  InitReader(&descr);
+  reader = static_cast<ByteArrayReader*>(reader_.get());
+  while (total_indices < value_size && reader->HasNext()) {
+    const ByteArray* tmp_dict = nullptr;
+    int32_t tmp_dict_len = 0;
+    EXPECT_NO_THROW(reader->ReadBatchWithDictionary(
+        value_size, /*def_levels=*/nullptr,
+        /*rep_levels=*/nullptr, indices.get() + total_indices, &indices_read, &tmp_dict,
+        &tmp_dict_len));
+    if (tmp_dict) {
+      // Only reading the 1st batch will return the dictionary
+      EXPECT_EQ(total_indices, 0);
+      EXPECT_GT(tmp_dict_len, 0);
+      dict = tmp_dict;
+      dict_len = tmp_dict_len;
+    } else {
+      // Reading following batches won't return the dictionary
+      EXPECT_GT(total_indices, 0);
+      EXPECT_EQ(tmp_dict_len, 0);
+    }
+    total_indices += indices_read;
+  }
+
+  EXPECT_EQ(total_indices, value_size);
+  for (int64_t i = 0; i < total_indices; ++i) {
+    EXPECT_LT(indices[i], dict_len);
+    EXPECT_EQ(dict[indices[i]].len, values[i].len);
+    EXPECT_EQ(memcmp(dict[indices[i]].ptr, values[i].ptr, values[i].len), 0);
+  }
+  pages_.clear();
+
+  // The data page falls back to plain encoding

Review comment:
       Let's make this a separate test case.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]

[GitHub] [arrow] emkornfield commented on a change in pull request #10537: PARQUET-2056: [C++] Add ability for retrieving dictionary and indices separately for ColumnReader

Reply via email to