Repository: parquet-cpp Updated Branches: refs/heads/master 792f858c9 -> 99759a38b
PARQUET-1033: Improve documentation about WriteBatchSpaced Author: Uwe L. Korn <[email protected]> Closes #354 from xhochy/PARQUET-1033 and squashes the following commits: 895676a [Uwe L. Korn] Remove trailing comment line 709ef32 [Uwe L. Korn] PARQUET-1033: Improve documentation about WriteBatchSpaced Project: http://git-wip-us.apache.org/repos/asf/parquet-cpp/repo Commit: http://git-wip-us.apache.org/repos/asf/parquet-cpp/commit/99759a38 Tree: http://git-wip-us.apache.org/repos/asf/parquet-cpp/tree/99759a38 Diff: http://git-wip-us.apache.org/repos/asf/parquet-cpp/diff/99759a38 Branch: refs/heads/master Commit: 99759a38b7dabf2520070949713a1e5d6853caf4 Parents: 792f858 Author: Uwe L. Korn <[email protected]> Authored: Mon Jun 19 11:26:37 2017 -0400 Committer: Wes McKinney <[email protected]> Committed: Mon Jun 19 11:26:37 2017 -0400 ---------------------------------------------------------------------- src/parquet/column/column-writer-test.cc | 30 +++++++++++++++++++++++++++ src/parquet/column/writer.h | 28 +++++++++++++++++++++++-- 2 files changed, 56 insertions(+), 2 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/99759a38/src/parquet/column/column-writer-test.cc ---------------------------------------------------------------------- diff --git a/src/parquet/column/column-writer-test.cc b/src/parquet/column/column-writer-test.cc index 33eefac..6f47f3b 100644 --- a/src/parquet/column/column-writer-test.cc +++ b/src/parquet/column/column-writer-test.cc @@ -347,6 +347,36 @@ TYPED_TEST(TestPrimitiveWriter, Optional) { ASSERT_EQ(this->values_, this->values_out_); } +TYPED_TEST(TestPrimitiveWriter, OptionalSpaced) { + // Optional and non-repeated, with definition levels + // but no repetition levels + this->SetUpSchema(Repetition::OPTIONAL); + + this->GenerateData(SMALL_SIZE); + std::vector<int16_t> definition_levels(SMALL_SIZE, 1); + std::vector<uint8_t> 
valid_bits(::arrow::BitUtil::BytesForBits(SMALL_SIZE), 255); + + definition_levels[SMALL_SIZE - 1] = 0; + ::arrow::BitUtil::ClearBit(valid_bits.data(), SMALL_SIZE - 1); + definition_levels[1] = 0; + ::arrow::BitUtil::ClearBit(valid_bits.data(), 1); + + auto writer = this->BuildWriter(); + writer->WriteBatchSpaced(this->values_.size(), definition_levels.data(), nullptr, + valid_bits.data(), 0, this->values_ptr_); + writer->Close(); + + // PARQUET-703 + ASSERT_EQ(100, this->metadata_num_values()); + + this->ReadColumn(); + ASSERT_EQ(98, this->values_read_); + this->values_out_.resize(98); + this->values_.resize(99); + this->values_.erase(this->values_.cbegin() + 1); + ASSERT_EQ(this->values_, this->values_out_); +} + TYPED_TEST(TestPrimitiveWriter, Repeated) { // Optional and repeated, so definition and repetition levels this->SetUpSchema(Repetition::REPEATED); http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/99759a38/src/parquet/column/writer.h ---------------------------------------------------------------------- diff --git a/src/parquet/column/writer.h b/src/parquet/column/writer.h index 5ffcf73..407e808 100644 --- a/src/parquet/column/writer.h +++ b/src/parquet/column/writer.h @@ -166,8 +166,32 @@ class PARQUET_EXPORT TypedColumnWriter : public ColumnWriter { void WriteBatch(int64_t num_values, const int16_t* def_levels, const int16_t* rep_levels, const T* values); - // Write a batch of repetition levels, definition levels, and values to the - // column. + /// Write a batch of repetition levels, definition levels, and values to the + /// column. + /// + /// In comparison to WriteBatch the length of repetition and definition levels + /// is the same as the number of values read for max_definition_level == 1. + /// In the case of max_definition_level > 1, the repetition and definition + /// levels are larger than the values but the values include the null entries + /// with definition_level == (max_definition_level - 1). 
Thus we have to differentiate + /// in the parameters of this function if the input has the length of num_values or the + /// _number of rows in the lowest nesting level_. + /// + /// In the case that the innermost node in the Parquet schema is required, the _number of rows + /// in the lowest nesting level_ is equal to the number of non-null values. If the + /// inner-most schema node is optional, the _number of rows in the lowest nesting level_ + /// also includes all values with definition_level == (max_definition_level - 1). + /// + /// @param num_values number of levels to write. + /// @param def_levels The Parquet definition levels, length is num_values + /// @param rep_levels The Parquet repetition levels, length is num_values + /// @param valid_bits Bitmap that indicates if the row is null on the lowest nesting + /// level. The length is number of rows in the lowest nesting level. + /// @param valid_bits_offset The offset in bits of the valid_bits where the + /// first relevant bit resides. + /// @param values The values in the lowest nested level including + /// spacing for nulls on the lowest levels; input has the length + /// of the number of rows on the lowest nesting level. void WriteBatchSpaced(int64_t num_values, const int16_t* def_levels, const int16_t* rep_levels, const uint8_t* valid_bits, int64_t valid_bits_offset, const T* values);
