emkornfield commented on a change in pull request #10537:
URL: https://github.com/apache/arrow/pull/10537#discussion_r652322882
##########
File path: cpp/src/parquet/reader_test.cc
##########
@@ -474,6 +475,83 @@ TEST(TestJSONWithLocalFile, JSONOutput) {
ASSERT_EQ(json_output, ss.str());
}
+TEST(TestFileReader, BufferedReadsWithDictionary) {
+ const int num_rows = 1000;
+
+ // Make schema
+ schema::NodeVector fields;
+ fields.push_back(PrimitiveNode::Make("field", Repetition::REQUIRED, Type::DOUBLE,
+ ConvertedType::NONE));
+ auto schema = std::static_pointer_cast<GroupNode>(
+ GroupNode::Make("schema", Repetition::REQUIRED, fields));
+
+ // Write small batches and small data pages
+ std::shared_ptr<WriterProperties> writer_props = WriterProperties::Builder()
+ .write_batch_size(64)
+ ->data_pagesize(128)
+ ->enable_dictionary()
+ ->build();
+
+ ASSERT_OK_AND_ASSIGN(auto out_file, ::arrow::io::BufferOutputStream::Create());
+ std::shared_ptr<ParquetFileWriter> file_writer =
+ ParquetFileWriter::Open(out_file, schema, writer_props);
+
+ RowGroupWriter* rg_writer = file_writer->AppendRowGroup();
+
+ // Write one column
+ ::arrow::random::RandomArrayGenerator rag(0);
+ DoubleWriter* writer = static_cast<DoubleWriter*>(rg_writer->NextColumn());
+ std::shared_ptr<::arrow::Array> col = rag.Float64(num_rows, 0, 100);
+ const auto& col_typed = static_cast<const ::arrow::DoubleArray&>(*col);
+ writer->WriteBatch(num_rows, nullptr, nullptr, col_typed.raw_values());
+ rg_writer->Close();
+ file_writer->Close();
+
+ // Open the reader
+ ASSERT_OK_AND_ASSIGN(auto file_buf, out_file->Finish());
+ auto in_file = std::make_shared<::arrow::io::BufferReader>(file_buf);
+
+ ReaderProperties reader_props;
+ reader_props.enable_buffered_stream();
+ reader_props.set_buffer_size(64);
+ std::unique_ptr<ParquetFileReader> file_reader =
+ ParquetFileReader::Open(in_file, reader_props);
+
+ auto row_group = file_reader->RowGroup(0);
+ auto col_reader = std::static_pointer_cast<DoubleReader>(
+ row_group->ColumnWithExposeEncoding(0, ExposedEncodingType::DICTIONARY));
+ EXPECT_EQ(col_reader->GetExposedEncoding(), ExposedEncodingType::DICTIONARY);
+
+ auto indices = ::arrow::internal::make_unique<int32_t[]>(num_rows);
+ const double* dict = nullptr;
+ int32_t dict_len = 0;
+ for (int row_index = 0; row_index < num_rows; ++row_index) {
+ const double* tmp_dict = nullptr;
+ int32_t tmp_dict_len = 0;
+ int64_t values_read = 0;
+ int64_t levels_read = col_reader->ReadBatchWithDictionary(
+ 1, nullptr, nullptr, indices.get() + row_index, &values_read, &tmp_dict,
Review comment:
Please add comments explaining these literals (e.g., what the `1` and the `nullptr`s refer to).
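For example (just a sketch, not required wording — the parameter names below are taken from the `TypedColumnReader::ReadBatchWithDictionary` declaration, and Arrow code commonly annotates literal arguments inline):

    // Read one value at a time; no def/rep levels for a REQUIRED column.
    int64_t levels_read = col_reader->ReadBatchWithDictionary(
        /*batch_size=*/1, /*def_levels=*/nullptr, /*rep_levels=*/nullptr,
        indices.get() + row_index, &values_read, &tmp_dict, &tmp_dict_len);

The same applies to the writer properties above: a short comment on the 64/128 values (small batch and page sizes chosen to force many small data pages, per the comment there) would make the test's intent clearer.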
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]