[jira] [Commented] (PARQUET-979) [C++] Limit size of min, max or disable stats for long binary types
[ https://issues.apache.org/jira/browse/PARQUET-979?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16482149#comment-16482149 ] ASF GitHub Bot commented on PARQUET-979: majetideepak closed pull request #465: PARQUET-979: Limit size of min, max or disable stats for long binary types URL: https://github.com/apache/parquet-cpp/pull/465 This is a PR merged from a forked repository. As GitHub hides the original diff on merge, it is displayed below for the sake of provenance: As this is a foreign pull request (from a fork), the diff is supplied below (as it won't show otherwise due to GitHub magic): diff --git a/src/parquet/arrow/arrow-reader-writer-test.cc b/src/parquet/arrow/arrow-reader-writer-test.cc index 47226a3c..6d7e1ebb 100644 --- a/src/parquet/arrow/arrow-reader-writer-test.cc +++ b/src/parquet/arrow/arrow-reader-writer-test.cc @@ -1445,8 +1445,8 @@ TEST(TestArrowReadWrite, CoerceTimestampsAndSupportDeprecatedInt96) { using ::arrow::Schema; using ::arrow::Table; using ::arrow::TimeUnit; - using ::arrow::TimestampType; using ::arrow::TimestampBuilder; + using ::arrow::TimestampType; using ::arrow::default_memory_pool; auto timestamp_type = std::make_shared(TimeUnit::NANO); diff --git a/src/parquet/column_writer-test.cc b/src/parquet/column_writer-test.cc index 7a5f379b..aac582a2 100644 --- a/src/parquet/column_writer-test.cc +++ b/src/parquet/column_writer-test.cc @@ -71,19 +71,21 @@ class TestPrimitiveWriter : public PrimitiveTypedTest { int64_t output_size = SMALL_SIZE, const ColumnProperties& column_properties = ColumnProperties()) { sink_.reset(new InMemoryOutputStream()); -metadata_ = ColumnChunkMetaDataBuilder::Make( -writer_properties_, this->descr_, reinterpret_cast(&thrift_metadata_)); -std::unique_ptr pager = -PageWriter::Open(sink_.get(), column_properties.codec, metadata_.get()); WriterProperties::Builder wp_builder; -if (column_properties.encoding == Encoding::PLAIN_DICTIONARY || -column_properties.encoding == Encoding::RLE_DICTIONARY) { +if (column_properties.encoding() == Encoding::PLAIN_DICTIONARY || +column_properties.encoding() == Encoding::RLE_DICTIONARY) { wp_builder.enable_dictionary(); } else { wp_builder.disable_dictionary(); - wp_builder.encoding(column_properties.encoding); + wp_builder.encoding(column_properties.encoding()); } +wp_builder.max_statistics_size(column_properties.max_statistics_size()); writer_properties_ = wp_builder.build(); + +metadata_ = ColumnChunkMetaDataBuilder::Make( +writer_properties_, this->descr_, reinterpret_cast(&thrift_metadata_)); +std::unique_ptr pager = +PageWriter::Open(sink_.get(), column_properties.compression(), metadata_.get()); std::shared_ptr writer = ColumnWriter::Make(metadata_.get(), std::move(pager), writer_properties_.get()); return std::static_pointer_cast>(writer); @@ -173,6 +175,16 @@ class TestPrimitiveWriter : public PrimitiveTypedTest { return metadata_accessor->num_values(); } + bool metadata_is_stats_set() { +// Metadata accessor must be created lazily. +// This is because the ColumnChunkMetaData semantics dictate the metadata object is +// complete (no changes to the metadata buffer can be made after instantiation) +ApplicationVersion app_version(this->writer_properties_->created_by()); +auto metadata_accessor = ColumnChunkMetaData::Make( +reinterpret_cast(&thrift_metadata_), this->descr_, &app_version); +return metadata_accessor->is_stats_set(); + } + std::vector metadata_encodings() { // Metadata accessor must be created lazily. // This is because the ColumnChunkMetaData semantics dictate the metadata object is @@ -520,6 +532,50 @@ TEST_F(TestBooleanValuesWriter, AlternateBooleanValues) { } } +// PARQUET-979 +// Prevent writing large stats +using TestByteArrayValuesWriter = TestPrimitiveWriter; +TEST_F(TestByteArrayValuesWriter, OmitStats) { + int min_len = 1024 * 4; + int max_len = 1024 * 8; + this->SetUpSchema(Repetition::REQUIRED); + auto writer = this->BuildWriter(); + + values_.resize(SMALL_SIZE); + InitWideByteArrayValues(SMALL_SIZE, this->values_, this->buffer_, min_len, max_len); + writer->WriteBatch(SMALL_SIZE, nullptr, nullptr, this->values_.data()); + writer->Close(); + + ASSERT_FALSE(this->metadata_is_stats_set()); +} + +TEST_F(TestByteArrayValuesWriter, LimitStats) { + int min_len = 1024 * 4; + int max_len = 1024 * 8; + this->SetUpSchema(Repetition::REQUIRED); + ColumnProperties column_properties; + column_properties.set_max_statistics_size(static_cast(max_len)); + auto writer = this->BuildWriter(SMALL_SIZE, column_properties); + + values_.resize(SMALL_SIZE); + InitWideByteArrayValues(SMALL_SIZE, this->values_, this->buffer_, min_len, max_le
[jira] [Commented] (PARQUET-979) [C++] Limit size of min, max or disable stats for long binary types
[ https://issues.apache.org/jira/browse/PARQUET-979?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16479682#comment-16479682 ] ASF GitHub Bot commented on PARQUET-979: xhochy commented on a change in pull request #465: PARQUET-979: Limit size of min, max or disable stats for long binary types URL: https://github.com/apache/parquet-cpp/pull/465#discussion_r189094301 ## File path: src/parquet/properties.h ## @@ -348,19 +357,23 @@ class PARQUET_EXPORT WriterProperties { } Encoding::type encoding(const std::shared_ptr& path) const { -return column_properties(path).encoding; +return column_properties(path).encoding_; } Compression::type compression(const std::shared_ptr& path) const { -return column_properties(path).codec; +return column_properties(path).codec_; } bool dictionary_enabled(const std::shared_ptr& path) const { -return column_properties(path).dictionary_enabled; +return column_properties(path).dictionary_enabled_; } bool statistics_enabled(const std::shared_ptr& path) const { -return column_properties(path).statistics_enabled; +return column_properties(path).statistics_enabled_; + } + + size_t max_stats_size(const std::shared_ptr& path) const { Review comment: Please use `max_statistics_size` This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [C++] Limit size of min, max or disable stats for long binary types > --- > > Key: PARQUET-979 > URL: https://issues.apache.org/jira/browse/PARQUET-979 > Project: Parquet > Issue Type: Bug > Components: parquet-cpp >Reporter: Deepak Majeti >Assignee: Deepak Majeti >Priority: Major > Fix For: cpp-1.4.0 > > > Other Parquet implementations like parquet-mr disable min/max values for long > binary types > 4KB. For known logical types comparisons, we could approximate > min/max values. We need to implement this. -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (PARQUET-979) [C++] Limit size of min, max or disable stats for long binary types
[ https://issues.apache.org/jira/browse/PARQUET-979?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16479679#comment-16479679 ] ASF GitHub Bot commented on PARQUET-979: majetideepak commented on a change in pull request #465: PARQUET-979: Limit size of min, max or disable stats for long binary types URL: https://github.com/apache/parquet-cpp/pull/465#discussion_r189093973 ## File path: src/parquet/column_writer.cc ## @@ -33,6 +33,8 @@ namespace parquet { +static constexpr int MAX_STATS_SIZE = 4096; // limit stats to 4k Review comment: I added it to `ColumnProperties` since it gives more flexibility. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [C++] Limit size of min, max or disable stats for long binary types > --- > > Key: PARQUET-979 > URL: https://issues.apache.org/jira/browse/PARQUET-979 > Project: Parquet > Issue Type: Bug > Components: parquet-cpp >Reporter: Deepak Majeti >Assignee: Deepak Majeti >Priority: Major > Fix For: cpp-1.4.0 > > > Other Parquet implementations like parquet-mr disable min/max values for long > binary types > 4KB. For known logical types comparisons, we could approximate > min/max values. We need to implement this. -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (PARQUET-979) [C++] Limit size of min, max or disable stats for long binary types
[ https://issues.apache.org/jira/browse/PARQUET-979?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16479596#comment-16479596 ] ASF GitHub Bot commented on PARQUET-979: xhochy commented on a change in pull request #465: PARQUET-979: Limit size of min, max or disable stats for long binary types URL: https://github.com/apache/parquet-cpp/pull/465#discussion_r189076583 ## File path: src/parquet/column_writer.cc ## @@ -33,6 +33,8 @@ namespace parquet { +static constexpr int MAX_STATS_SIZE = 4096; // limit stats to 4k Review comment: The alternative would be to use this as sensible default and add it as an option to `WriterProperties`. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [C++] Limit size of min, max or disable stats for long binary types > --- > > Key: PARQUET-979 > URL: https://issues.apache.org/jira/browse/PARQUET-979 > Project: Parquet > Issue Type: Bug > Components: parquet-cpp >Reporter: Deepak Majeti >Assignee: Deepak Majeti >Priority: Major > Fix For: cpp-1.4.0 > > > Other Parquet implementations like parquet-mr disable min/max values for long > binary types > 4KB. For known logical types comparisons, we could approximate > min/max values. We need to implement this. -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (PARQUET-979) [C++] Limit size of min, max or disable stats for long binary types
[ https://issues.apache.org/jira/browse/PARQUET-979?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16479593#comment-16479593 ] ASF GitHub Bot commented on PARQUET-979: majetideepak commented on a change in pull request #465: PARQUET-979: Limit size of min, max or disable stats for long binary types URL: https://github.com/apache/parquet-cpp/pull/465#discussion_r189076178 ## File path: src/parquet/column_writer.cc ## @@ -33,6 +33,8 @@ namespace parquet { +static constexpr int MAX_STATS_SIZE = 4096; // limit stats to 4k Review comment: I followed parquet-mr. https://github.com/apache/parquet-mr/blob/0d55abd05b0e5027c18e60d1ac3b22998dd00951/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java#L88 I don't see this specified in the spec. We should probably add it there. I will open a JIRA to do this. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [C++] Limit size of min, max or disable stats for long binary types > --- > > Key: PARQUET-979 > URL: https://issues.apache.org/jira/browse/PARQUET-979 > Project: Parquet > Issue Type: Bug > Components: parquet-cpp >Reporter: Deepak Majeti >Assignee: Deepak Majeti >Priority: Major > Fix For: cpp-1.4.0 > > > Other Parquet implementations like parquet-mr disable min/max values for long > binary types > 4KB. For known logical types comparisons, we could approximate > min/max values. We need to implement this. -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (PARQUET-979) [C++] Limit size of min, max or disable stats for long binary types
[ https://issues.apache.org/jira/browse/PARQUET-979?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16479589#comment-16479589 ] ASF GitHub Bot commented on PARQUET-979: xhochy commented on a change in pull request #465: PARQUET-979: Limit size of min, max or disable stats for long binary types URL: https://github.com/apache/parquet-cpp/pull/465#discussion_r189074320 ## File path: src/parquet/column_writer.cc ## @@ -33,6 +33,8 @@ namespace parquet { +static constexpr int MAX_STATS_SIZE = 4096; // limit stats to 4k Review comment: Is this defined by the Parquet standard or arbitrarily chosen? This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [C++] Limit size of min, max or disable stats for long binary types > --- > > Key: PARQUET-979 > URL: https://issues.apache.org/jira/browse/PARQUET-979 > Project: Parquet > Issue Type: Bug > Components: parquet-cpp >Reporter: Deepak Majeti >Assignee: Deepak Majeti >Priority: Major > Fix For: cpp-1.4.0 > > > Other Parquet implementations like parquet-mr disable min/max values for long > binary types > 4KB. For known logical types comparisons, we could approximate > min/max values. We need to implement this. -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (PARQUET-979) [C++] Limit size of min, max or disable stats for long binary types
[ https://issues.apache.org/jira/browse/PARQUET-979?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16479586#comment-16479586 ] ASF GitHub Bot commented on PARQUET-979: majetideepak opened a new pull request #465: PARQUET-979: [C++] Limit size of min, max or disable stats for long binary types URL: https://github.com/apache/parquet-cpp/pull/465 This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [C++] Limit size of min, max or disable stats for long binary types > --- > > Key: PARQUET-979 > URL: https://issues.apache.org/jira/browse/PARQUET-979 > Project: Parquet > Issue Type: Bug > Components: parquet-cpp >Reporter: Deepak Majeti >Assignee: Deepak Majeti >Priority: Major > Fix For: cpp-1.4.0 > > > Other Parquet implementations like parquet-mr disable min/max values for long > binary types > 4KB. For known logical types comparisons, we could approximate > min/max values. We need to implement this. -- This message was sent by Atlassian JIRA (v7.6.3#76005)