This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch branch-2.1
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.1 by this push:
new a44a2745634 [Fix](parquet-reader) Fix and optimize parquet min-max
filtering. (#39375)
a44a2745634 is described below
commit a44a2745634a089581c99d394f1e333e3ebcb400
Author: Qi Chen <[email protected]>
AuthorDate: Thu Aug 15 14:12:54 2024 +0800
[Fix](parquet-reader) Fix and optimize parquet min-max filtering. (#39375)
Backport #38277.
---
be/src/vec/exec/format/parquet/parquet_common.cpp | 340 +++++++++++++++++++++
be/src/vec/exec/format/parquet/parquet_common.h | 131 +++++++-
be/src/vec/exec/format/parquet/parquet_pred_cmp.h | 142 +++++++--
.../exec/format/parquet/vparquet_page_index.cpp | 4 +-
be/src/vec/exec/format/parquet/vparquet_reader.cpp | 103 ++++++-
be/src/vec/exec/format/parquet/vparquet_reader.h | 3 +
.../parquet/parquet_corrupt_statistics_test.cpp | 134 ++++++++
.../vec/exec/parquet/parquet_statistics_test.cpp | 155 ++++++++++
be/test/vec/exec/parquet/parquet_version_test.cpp | 221 ++++++++++++++
9 files changed, 1207 insertions(+), 26 deletions(-)
diff --git a/be/src/vec/exec/format/parquet/parquet_common.cpp
b/be/src/vec/exec/format/parquet/parquet_common.cpp
index 33e9f11242b..59e12fcc71a 100644
--- a/be/src/vec/exec/format/parquet/parquet_common.cpp
+++ b/be/src/vec/exec/format/parquet/parquet_common.cpp
@@ -162,4 +162,344 @@ bool ColumnSelectVector::can_filter_all(size_t
remaining_num_values) {
void ColumnSelectVector::skip(size_t num_values) {
_filter_map_index += num_values;
}
+
+ParsedVersion::ParsedVersion(std::string application,
std::optional<std::string> version,
+ std::optional<std::string> app_build_hash)
+ : _application(std::move(application)),
+ _version(std::move(version)),
+ _app_build_hash(std::move(app_build_hash)) {}
+
+// Two parsed versions are equal when application, version and build hash all match.
+bool ParsedVersion::operator==(const ParsedVersion& other) const {
+    if (_application != other._application) {
+        return false;
+    }
+    if (_version != other._version) {
+        return false;
+    }
+    return _app_build_hash == other._app_build_hash;
+}
+
+// Negation of operator==.
+bool ParsedVersion::operator!=(const ParsedVersion& other) const {
+    return !operator==(other);
+}
+
+// XOR-combines the string hashes of the application and of whichever optional
+// fields are present; an absent field contributes nothing.
+size_t ParsedVersion::hash() const {
+    std::hash<std::string> string_hash;
+    size_t combined = string_hash(_application);
+    if (_version.has_value()) {
+        combined ^= string_hash(*_version);
+    }
+    if (_app_build_hash.has_value()) {
+        combined ^= string_hash(*_app_build_hash);
+    }
+    return combined;
+}
+
+// Renders the three fields for logging; absent optionals are printed as "null".
+std::string ParsedVersion::to_string() const {
+    std::string text = "ParsedVersion(application=";
+    text += _application;
+    text += ", semver=";
+    text += _version.value_or("null");
+    text += ", app_build_hash=";
+    text += _app_build_hash.value_or("null");
+    text += ")";
+    return text;
+}
+
+// Splits a parquet `created_by` string of the shape
+// "<application> version <semver> (build <hash>)" into its parts.
+// The version and the build hash are optional; an empty capture becomes nullopt.
+// Returns an InternalError when the string does not match or the application is empty.
+Status VersionParser::parse(const std::string& created_by,
+                            std::unique_ptr<ParsedVersion>* parsed_version) {
+    static const std::string FORMAT =
+            "(.*?)\\s+version\\s*(?:([^(]*?)\\s*(?:\\(\\s*build\\s*([^)]*?)\\s*\\))?)?";
+    static const std::regex PATTERN(FORMAT);
+
+    std::smatch groups;
+    const bool matched = std::regex_match(created_by, groups, PATTERN);
+    if (!matched) {
+        return Status::InternalError(fmt::format("Could not parse created_by: {}, using format: {}",
+                                                 created_by, FORMAT));
+    }
+
+    std::string application = groups[1].str();
+    if (application.empty()) {
+        return Status::InternalError("application cannot be null or empty");
+    }
+
+    // Maps an empty capture group to "not present".
+    const auto optional_group = [&groups](size_t index) -> std::optional<std::string> {
+        std::string text = groups[index].str();
+        if (text.empty()) {
+            return std::nullopt;
+        }
+        return text;
+    };
+    std::optional<std::string> semver = optional_group(2);
+    std::optional<std::string> app_build_hash = optional_group(3);
+
+    *parsed_version = std::make_unique<ParsedVersion>(application, semver, app_build_hash);
+    return Status::OK();
+}
+
+SemanticVersion::SemanticVersion(int major, int minor, int patch)
+ : _major(major),
+ _minor(minor),
+ _patch(patch),
+ _prerelease(false),
+ _unknown(std::nullopt),
+ _pre(std::nullopt),
+ _build_info(std::nullopt) {}
+
+#ifdef BE_TEST
+SemanticVersion::SemanticVersion(int major, int minor, int patch, bool
has_unknown)
+ : _major(major),
+ _minor(minor),
+ _patch(patch),
+ _prerelease(has_unknown),
+ _unknown(std::nullopt),
+ _pre(std::nullopt),
+ _build_info(std::nullopt) {}
+#endif
+
+// Full constructor. The version counts as a pre-release when `unknown` holds a
+// non-empty string; `pre`, when present, is parsed into a Prerelease.
+SemanticVersion::SemanticVersion(int major, int minor, int patch,
+                                 std::optional<std::string> unknown, std::optional<std::string> pre,
+                                 std::optional<std::string> build_info)
+        : _major(major),
+          _minor(minor),
+          _patch(patch),
+          // Evaluated before `unknown` is moved from: _prerelease is declared before _unknown.
+          _prerelease(unknown && !unknown->empty()),
+          _unknown(std::move(unknown)),
+          _pre(std::nullopt),
+          _build_info(std::move(build_info)) {
+    if (pre.has_value()) {
+        _pre.emplace(std::move(*pre));
+    }
+}
+
+// Parses "<major>.<minor>.<patch>[unknown][-prerelease][+build_info]".
+//
+// On success stores a new SemanticVersion in `*semantic_version` and returns OK.
+// Returns an InternalError when the text does not match the format or when a
+// numeric component does not fit in an int. The input comes from file metadata,
+// so an over-long digit run must be reported instead of letting std::stoi throw.
+Status SemanticVersion::parse(const std::string& version,
+                              std::unique_ptr<SemanticVersion>* semantic_version) {
+    static const std::regex pattern(R"(^(\d+)\.(\d+)\.(\d+)([^-+]*)?(?:-([^+]*))?(?:\+(.*))?$)");
+    std::smatch match;
+
+    if (!std::regex_match(version, match, pattern)) {
+        return Status::InternalError(version + " does not match format");
+    }
+
+    int major = 0;
+    int minor = 0;
+    int patch = 0;
+    try {
+        // The regex guarantees digits only, so the only possible failure is overflow.
+        major = std::stoi(match[1].str());
+        minor = std::stoi(match[2].str());
+        patch = std::stoi(match[3].str());
+    } catch (const std::out_of_range&) {
+        return Status::InternalError(version + " has a version component that is out of range");
+    }
+
+    // Empty capture groups are treated as "not present".
+    std::optional<std::string> unknown =
+            match[4].str().empty() ? std::nullopt : std::optional<std::string>(match[4].str());
+    std::optional<std::string> prerelease =
+            match[5].str().empty() ? std::nullopt : std::optional<std::string>(match[5].str());
+    std::optional<std::string> build_info =
+            match[6].str().empty() ? std::nullopt : std::optional<std::string>(match[6].str());
+    if (major < 0 || minor < 0 || patch < 0) {
+        return Status::InternalError("major({}), minor({}), and patch({}) must all be >= 0", major,
+                                     minor, patch);
+    }
+    *semantic_version =
+            std::make_unique<SemanticVersion>(major, minor, patch, unknown, prerelease, build_info);
+    return Status::OK();
+}
+
+// Three-way comparison: major, minor, patch, then the pre-release flag (a
+// flagged version sorts before an unflagged one), then the pre-release tag
+// (a version with a tag sorts before one without).
+int SemanticVersion::compare_to(const SemanticVersion& other) const {
+    int result = _compare_integers(_major, other._major);
+    if (result == 0) {
+        result = _compare_integers(_minor, other._minor);
+    }
+    if (result == 0) {
+        result = _compare_integers(_patch, other._patch);
+    }
+    if (result == 0) {
+        // Arguments are deliberately swapped.
+        result = _compare_booleans(other._prerelease, _prerelease);
+    }
+    if (result != 0) {
+        return result;
+    }
+
+    const bool lhs_has_pre = _pre.has_value();
+    const bool rhs_has_pre = other._pre.has_value();
+    if (lhs_has_pre && rhs_has_pre) {
+        return _pre->compare_to(*other._pre);
+    }
+    if (lhs_has_pre) {
+        return -1;
+    }
+    return rhs_has_pre ? 1 : 0;
+}
+
+// Equality is defined through compare_to, so build info is not considered.
+bool SemanticVersion::operator==(const SemanticVersion& other) const {
+    const int order = compare_to(other);
+    return order == 0;
+}
+
+// Negation of operator==.
+bool SemanticVersion::operator!=(const SemanticVersion& other) const {
+    return !operator==(other);
+}
+
+// Concatenates "major.minor.patch" with the unknown suffix (only when the
+// pre-release flag is set), the pre-release tag and the build info, with no
+// separators between the optional parts.
+std::string SemanticVersion::to_string() const {
+    std::string text = std::to_string(_major);
+    text.append(".").append(std::to_string(_minor));
+    text.append(".").append(std::to_string(_patch));
+    if (_prerelease && _unknown.has_value()) {
+        text.append(*_unknown);
+    }
+    if (_pre.has_value()) {
+        text.append(_pre->to_string());
+    }
+    if (_build_info.has_value()) {
+        text.append(*_build_info);
+    }
+    return text;
+}
+
+// Classifies one dot-separated pre-release identifier as numeric or textual.
+//
+// An identifier made only of digits is numeric and its value is cached in
+// `_number`. A digit run too large for an int (for example a timestamp) is kept
+// as a textual identifier instead of letting std::stoi throw out of the
+// constructor; `_number` stays -1 for every non-numeric identifier.
+SemanticVersion::NumberOrString::NumberOrString(const std::string& value_string)
+        : _original(value_string), _is_numeric(false), _number(-1) {
+    const static std::regex NUMERIC("\\d+");
+    if (std::regex_match(_original, NUMERIC)) {
+        try {
+            _number = std::stoi(_original);
+            _is_numeric = true;
+        } catch (const std::out_of_range&) {
+            // Value does not fit in an int: fall back to string ordering.
+        }
+    }
+}
+
+// Member-wise copy of the original text, the numeric flag and the cached number.
+SemanticVersion::NumberOrString::NumberOrString(const NumberOrString& other) = default;
+
+// Numeric identifiers sort before textual ones; two numeric identifiers compare
+// by value and two textual identifiers compare as strings.
+int SemanticVersion::NumberOrString::compare_to(const SemanticVersion::NumberOrString& that) const {
+    const bool same_kind = (_is_numeric == that._is_numeric);
+    if (!same_kind) {
+        if (_is_numeric) {
+            return -1;
+        }
+        return 1;
+    }
+
+    if (!_is_numeric) {
+        return _original.compare(that._original);
+    }
+    return _number - that._number;
+}
+
+// Returns the identifier exactly as it was given to the constructor.
+std::string SemanticVersion::NumberOrString::to_string() const {
+    std::string copy = _original;
+    return copy;
+}
+
+// True when this identifier orders strictly before `that`.
+bool SemanticVersion::NumberOrString::operator<(const SemanticVersion::NumberOrString& that) const {
+    const int order = compare_to(that);
+    return order < 0;
+}
+
+// True when compare_to reports the two identifiers as equivalent.
+bool SemanticVersion::NumberOrString::operator==(
+        const SemanticVersion::NumberOrString& that) const {
+    const int order = compare_to(that);
+    return order == 0;
+}
+
+// Negation of operator==.
+bool SemanticVersion::NumberOrString::operator!=(
+        const SemanticVersion::NumberOrString& that) const {
+    return !operator==(that);
+}
+
+// True when this identifier orders strictly after `that`.
+bool SemanticVersion::NumberOrString::operator>(const SemanticVersion::NumberOrString& that) const {
+    const int order = compare_to(that);
+    return order > 0;
+}
+
+// Defined as "not greater than".
+bool SemanticVersion::NumberOrString::operator<=(
+        const SemanticVersion::NumberOrString& that) const {
+    return !operator>(that);
+}
+
+// Defined as "not less than".
+bool SemanticVersion::NumberOrString::operator>=(
+        const SemanticVersion::NumberOrString& that) const {
+    return !operator<(that);
+}
+
+// Three-way integer comparison returning -1, 0 or 1.
+int SemanticVersion::_compare_integers(int x, int y) {
+    if (x < y) {
+        return -1;
+    }
+    if (x == y) {
+        return 0;
+    }
+    return 1;
+}
+
+// Three-way boolean comparison with false ordered before true: returns 0 when
+// equal, 1 when only x is true and -1 when only y is true.
+int SemanticVersion::_compare_booleans(bool x, bool y) {
+    if (x == y) {
+        return 0;
+    }
+    if (x) {
+        return 1;
+    }
+    return -1;
+}
+
+// Splits `s` on every match of `delimiter` and returns the pieces between matches.
+std::vector<std::string> SemanticVersion::Prerelease::_split(const std::string& s,
+                                                             const std::regex& delimiter) {
+    // Sub-match index -1 selects the text between delimiter matches.
+    const std::sregex_token_iterator first(s.begin(), s.end(), delimiter, -1);
+    const std::sregex_token_iterator last;
+    return std::vector<std::string>(first, last);
+}
+
+// Keeps the original tag and breaks it into dot-separated identifiers, each
+// classified as numeric or textual by NumberOrString.
+SemanticVersion::Prerelease::Prerelease(std::string original) : _original(std::move(original)) {
+    static const std::regex DOT("\\.");
+    for (const std::string& piece : _split(_original, DOT)) {
+        _identifiers.emplace_back(piece);
+    }
+}
+
+// Compares identifiers pairwise from the left; the first difference decides.
+// When one tag is a prefix of the other, the one with fewer identifiers sorts first.
+int SemanticVersion::Prerelease::compare_to(const Prerelease& that) const {
+    auto lhs = _identifiers.begin();
+    auto rhs = that._identifiers.begin();
+    for (; lhs != _identifiers.end() && rhs != that._identifiers.end(); ++lhs, ++rhs) {
+        const int order = lhs->compare_to(*rhs);
+        if (order != 0) {
+            return order;
+        }
+    }
+    const int lhs_count = static_cast<int>(_identifiers.size());
+    const int rhs_count = static_cast<int>(that._identifiers.size());
+    return lhs_count - rhs_count;
+}
+
+// Returns the pre-release tag exactly as it was given to the constructor.
+std::string SemanticVersion::Prerelease::to_string() const {
+    std::string copy = _original;
+    return copy;
+}
+
+// True when this tag orders strictly before `that`.
+bool SemanticVersion::Prerelease::operator<(const Prerelease& that) const {
+    const int order = compare_to(that);
+    return order < 0;
+}
+
+// True when compare_to reports the two tags as equivalent.
+bool SemanticVersion::Prerelease::operator==(const Prerelease& that) const {
+    const int order = compare_to(that);
+    return order == 0;
+}
+
+// Negation of operator==.
+bool SemanticVersion::Prerelease::operator!=(const Prerelease& that) const {
+    return !operator==(that);
+}
+
+// True when this tag orders strictly after `that`.
+bool SemanticVersion::Prerelease::operator>(const Prerelease& that) const {
+    const int order = compare_to(that);
+    return order > 0;
+}
+
+// Defined as "not greater than".
+bool SemanticVersion::Prerelease::operator<=(const Prerelease& that) const {
+    return !operator>(that);
+}
+
+// Defined as "not less than".
+bool SemanticVersion::Prerelease::operator>=(const Prerelease& that) const {
+    return !operator<(that);
+}
+
+// Version thresholds used by should_ignore_statistics():
+//  - versions below 1.8.0 are treated as affected by PARQUET-251,
+//  - except those in the half-open range [1.5.0-cdh5.5.0, 1.5.0).
+const SemanticVersion CorruptStatistics::PARQUET_251_FIXED_VERSION(1, 8, 0);
+const SemanticVersion CorruptStatistics::CDH_5_PARQUET_251_FIXED_START(1, 5, 0, std::nullopt,
+                                                                       "cdh5.5.0", std::nullopt);
+const SemanticVersion CorruptStatistics::CDH_5_PARQUET_251_FIXED_END(1, 5, 0);
+
+// Decides whether the legacy min/max statistics of a column must be ignored
+// because the writer may have been affected by PARQUET-251.
+//
+// Returns false for non-binary physical types and for writers other than
+// "parquet-mr". For binary columns written by parquet-mr it returns true when
+// `created_by` is empty or unparseable, carries no version, or names a version
+// older than the fix (outside the CDH range that already contains the fix).
+bool CorruptStatistics::should_ignore_statistics(const std::string& created_by,
+                                                 tparquet::Type::type physical_type) {
+    // The bug only applies to binary columns
+    const bool is_binary = physical_type == tparquet::Type::BYTE_ARRAY ||
+                           physical_type == tparquet::Type::FIXED_LEN_BYTE_ARRAY;
+    if (!is_binary) {
+        return false;
+    }
+
+    // created_by is not populated
+    if (created_by.empty()) {
+        VLOG_DEBUG
+                << "Ignoring statistics because created_by is null or empty! See PARQUET-251 and "
+                   "PARQUET-297";
+        return true;
+    }
+
+    std::unique_ptr<ParsedVersion> writer;
+    if (Status parse_status = VersionParser::parse(created_by, &writer); !parse_status.ok()) {
+        VLOG_DEBUG << "Ignoring statistics because created_by could not be parsed (see "
+                      "PARQUET-251)."
+                      " CreatedBy: "
+                   << created_by << ", msg: " << parse_status.msg();
+        return true;
+    }
+
+    // Assume other applications don't have this bug
+    if (writer->application() != "parquet-mr") {
+        return false;
+    }
+
+    const std::optional<std::string>& raw_version = writer->version();
+    if (!raw_version.has_value() || raw_version->empty()) {
+        VLOG_DEBUG << "Ignoring statistics because created_by did not contain a semver (see "
+                      "PARQUET-251): "
+                   << created_by;
+        return true;
+    }
+
+    std::unique_ptr<SemanticVersion> semver;
+    if (Status semver_status = SemanticVersion::parse(*raw_version, &semver);
+        !semver_status.ok()) {
+        VLOG_DEBUG << "Ignoring statistics because created_by could not be parsed (see "
+                      "PARQUET-251)."
+                      " CreatedBy: "
+                   << created_by << ", msg: " << semver_status.msg();
+        return true;
+    }
+
+    const bool older_than_fix = semver->compare_to(PARQUET_251_FIXED_VERSION) < 0;
+    const bool in_cdh_fixed_range = semver->compare_to(CDH_5_PARQUET_251_FIXED_START) >= 0 &&
+                                    semver->compare_to(CDH_5_PARQUET_251_FIXED_END) < 0;
+    if (older_than_fix && !in_cdh_fixed_range) {
+        VLOG_DEBUG
+                << "Ignoring statistics because this file was created prior to the fixed version, "
+                   "see PARQUET-251";
+        return true;
+    }
+
+    // This file was created after the fix
+    return false;
+}
+
} // namespace doris::vectorized
diff --git a/be/src/vec/exec/format/parquet/parquet_common.h
b/be/src/vec/exec/format/parquet/parquet_common.h
index 2cf745882ee..da374d5fe79 100644
--- a/be/src/vec/exec/format/parquet/parquet_common.h
+++ b/be/src/vec/exec/format/parquet/parquet_common.h
@@ -17,10 +17,12 @@
#pragma once
+#include <gen_cpp/parquet_types.h>
#include <stddef.h>
#include <cstdint>
#include <ostream>
+#include <regex>
#include <string>
#include <vector>
@@ -156,4 +158,131 @@ private:
size_t _num_filtered;
size_t _read_index;
};
-} // namespace doris::vectorized
\ No newline at end of file
+
+// Whether a column's statistics follow the ordering defined by its type.
+enum class ColumnOrderName {
+    UNDEFINED,
+    TYPE_DEFINED_ORDER,
+};
+
+// Comparison semantics that min/max statistics are assumed to use for a column.
+enum class SortOrder {
+    SIGNED,
+    UNSIGNED,
+    UNKNOWN,
+};
+
+// The components of a parquet `created_by` string: the writing application,
+// its version text (if any) and the build hash (if any).
+class ParsedVersion {
+public:
+    ParsedVersion(std::string application, std::optional<std::string> version,
+                  std::optional<std::string> app_build_hash);
+
+    // Name of the writing application.
+    const std::string& application() const { return _application; }
+
+    // Version text; empty optional when absent.
+    const std::optional<std::string>& version() const { return _version; }
+
+    // Build hash text; empty optional when absent.
+    const std::optional<std::string>& app_build_hash() const { return _app_build_hash; }
+
+    // Field-wise equality over all three members.
+    bool operator==(const ParsedVersion& other) const;
+
+    bool operator!=(const ParsedVersion& other) const;
+
+    // Hash combined from all three members.
+    size_t hash() const;
+
+    // Human-readable rendering for logs.
+    std::string to_string() const;
+
+private:
+    std::string _application;
+    std::optional<std::string> _version;
+    std::optional<std::string> _app_build_hash;
+};
+
+// Stateless parser for parquet `created_by` strings.
+class VersionParser {
+public:
+    // Parses `created_by` and, on success, stores the result in `*parsed_version`.
+    // Returns a non-OK Status when the string cannot be parsed.
+    static Status parse(const std::string& created_by,
+                        std::unique_ptr<ParsedVersion>* parsed_version);
+};
+
+// A version of the form "major.minor.patch[unknown][-prerelease][+build_info]"
+// with a three-way ordering (see compare_to).
+class SemanticVersion {
+public:
+    // Plain release version without any suffixes.
+    SemanticVersion(int major, int minor, int patch);
+
+#ifdef BE_TEST
+    // Test-only: sets the pre-release flag directly.
+    SemanticVersion(int major, int minor, int patch, bool has_unknown);
+#endif
+
+    // Full constructor; a non-empty `unknown` marks the version as a pre-release.
+    SemanticVersion(int major, int minor, int patch, std::optional<std::string> unknown,
+                    std::optional<std::string> pre, std::optional<std::string> build_info);
+
+    // Parses `version` and, on success, stores the result in `*semantic_version`.
+    static Status parse(const std::string& version,
+                        std::unique_ptr<SemanticVersion>* semantic_version);
+
+    // Negative, zero or positive when this version orders before, equal to or
+    // after `other`.
+    int compare_to(const SemanticVersion& other) const;
+
+    bool operator==(const SemanticVersion& other) const;
+
+    bool operator!=(const SemanticVersion& other) const;
+
+    std::string to_string() const;
+
+private:
+    // One dot-separated identifier of a pre-release tag, either numeric or textual.
+    class NumberOrString {
+    public:
+        explicit NumberOrString(const std::string& value_string);
+
+        NumberOrString(const NumberOrString& other);
+
+        int compare_to(const NumberOrString& that) const;
+        std::string to_string() const;
+
+        bool operator<(const NumberOrString& that) const;
+        bool operator==(const NumberOrString& that) const;
+        bool operator!=(const NumberOrString& that) const;
+        bool operator>(const NumberOrString& that) const;
+        bool operator<=(const NumberOrString& that) const;
+        bool operator>=(const NumberOrString& that) const;
+
+    private:
+        std::string _original; // identifier text as given
+        bool _is_numeric;      // true when the text was parsed as a number
+        int _number;           // parsed value, -1 when not numeric
+    };
+
+    // A pre-release tag split into its dot-separated identifiers.
+    class Prerelease {
+    public:
+        explicit Prerelease(std::string original);
+
+        int compare_to(const Prerelease& that) const;
+        std::string to_string() const;
+
+        bool operator<(const Prerelease& that) const;
+        bool operator==(const Prerelease& that) const;
+        bool operator!=(const Prerelease& that) const;
+        bool operator>(const Prerelease& that) const;
+        bool operator<=(const Prerelease& that) const;
+        bool operator>=(const Prerelease& that) const;
+
+        const std::string& original() const { return _original; }
+
+    private:
+        static std::vector<std::string> _split(const std::string& s, const std::regex& delimiter);
+
+        std::string _original;
+        std::vector<NumberOrString> _identifiers;
+    };
+
+    static int _compare_integers(int x, int y);
+    static int _compare_booleans(bool x, bool y);
+
+    // Declaration order matters: the constructors initialize _prerelease from
+    // `unknown` before _unknown takes ownership of it.
+    int _major;
+    int _minor;
+    int _patch;
+    bool _prerelease;
+    std::optional<std::string> _unknown;
+    std::optional<Prerelease> _pre;
+    std::optional<std::string> _build_info;
+};
+
+// Detects parquet files whose binary-column statistics may be corrupt
+// (PARQUET-251) based on the writer recorded in `created_by`.
+class CorruptStatistics {
+public:
+    // Returns true when statistics for a column of `physical_type` written by
+    // `created_by` should not be trusted.
+    static bool should_ignore_statistics(const std::string& created_by,
+                                         tparquet::Type::type physical_type);
+
+private:
+    // Version thresholds compared against the writer's version.
+    static const SemanticVersion PARQUET_251_FIXED_VERSION;
+    static const SemanticVersion CDH_5_PARQUET_251_FIXED_START;
+    static const SemanticVersion CDH_5_PARQUET_251_FIXED_END;
+};
+
+} // namespace doris::vectorized
diff --git a/be/src/vec/exec/format/parquet/parquet_pred_cmp.h
b/be/src/vec/exec/format/parquet/parquet_pred_cmp.h
index 916f3f64ee6..316cbc5d716 100644
--- a/be/src/vec/exec/format/parquet/parquet_pred_cmp.h
+++ b/be/src/vec/exec/format/parquet/parquet_pred_cmp.h
@@ -17,6 +17,7 @@
#pragma once
+#include <cmath>
#include <cstring>
#include <vector>
@@ -38,9 +39,7 @@ class ParquetPredicate {
M(TYPE_TINYINT, tparquet::Type::INT32) \
M(TYPE_SMALLINT, tparquet::Type::INT32) \
M(TYPE_INT, tparquet::Type::INT32) \
- M(TYPE_BIGINT, tparquet::Type::INT64) \
- M(TYPE_FLOAT, tparquet::Type::FLOAT) \
- M(TYPE_DOUBLE, tparquet::Type::DOUBLE)
+ M(TYPE_BIGINT, tparquet::Type::INT64)
private:
struct ScanPredicate {
@@ -132,6 +131,8 @@ private:
CppType min_value;
CppType max_value;
+ std::unique_ptr<std::string> encoded_min_copy;
+ std::unique_ptr<std::string> encoded_max_copy;
tparquet::Type::type physical_type = col_schema->physical_type;
switch (col_val_range.type()) {
#define DISPATCH(REINTERPRET_TYPE, PARQUET_TYPE) \
@@ -142,24 +143,69 @@ private:
break;
FOR_REINTERPRET_TYPES(DISPATCH)
#undef DISPATCH
+ case TYPE_FLOAT:
+ if constexpr (std::is_same_v<CppType, float>) {
+ if (col_schema->physical_type != tparquet::Type::FLOAT) {
+ return false;
+ }
+ min_value = *reinterpret_cast<const
CppType*>(encoded_min.data());
+ max_value = *reinterpret_cast<const
CppType*>(encoded_max.data());
+ if (std::isnan(min_value) || std::isnan(max_value)) {
+ return false;
+ }
+ // Updating min to -0.0 and max to +0.0 to ensure that no 0.0
values would be skipped
+ if (std::signbit(min_value) == 0 && min_value == 0.0F) {
+ min_value = -0.0F;
+ }
+ if (std::signbit(max_value) != 0 && max_value == -0.0F) {
+ max_value = 0.0F;
+ }
+ break;
+ } else {
+ return false;
+ }
+ case TYPE_DOUBLE:
+ if constexpr (std::is_same_v<CppType, float>) {
+ if (col_schema->physical_type != tparquet::Type::DOUBLE) {
+ return false;
+ }
+ min_value = *reinterpret_cast<const
CppType*>(encoded_min.data());
+ max_value = *reinterpret_cast<const
CppType*>(encoded_max.data());
+ if (std::isnan(min_value) || std::isnan(max_value)) {
+ return false;
+ }
+ // Updating min to -0.0 and max to +0.0 to ensure that no 0.0
values would be skipped
+ if (std::signbit(min_value) == 0 && min_value == 0.0) {
+ min_value = -0.0;
+ }
+ if (std::signbit(max_value) != 0 && max_value == -0.0) {
+ max_value = 0.0;
+ }
+ break;
+ } else {
+ return false;
+ }
case TYPE_VARCHAR:
[[fallthrough]];
case TYPE_CHAR:
[[fallthrough]];
case TYPE_STRING:
- // TODO: In parquet, min and max statistics may not be able to
handle UTF8 correctly.
- // Current processing method is using min_value and max_value
statistics introduced by PARQUET-1025 if they are used.
- // If not, current processing method is temporarily ignored. A
better way is try to read min and max statistics
- // if it contains only ASCII characters.
- if (!use_min_max_value) {
- return false;
- }
if constexpr (std::is_same_v<CppType, StringRef>) {
- min_value = StringRef(encoded_min);
- max_value = StringRef(encoded_max);
+ if (!use_min_max_value) {
+ encoded_min_copy =
std::make_unique<std::string>(encoded_min);
+ encoded_max_copy =
std::make_unique<std::string>(encoded_max);
+ if (!_try_read_old_utf8_stats(*encoded_min_copy,
*encoded_max_copy)) {
+ return false;
+ }
+ min_value = StringRef(*encoded_min_copy);
+ max_value = StringRef(*encoded_max_copy);
+ } else {
+ min_value = StringRef(encoded_min);
+ max_value = StringRef(encoded_max);
+ }
} else {
return false;
- };
+ }
break;
case TYPE_DECIMALV2:
if constexpr (std::is_same_v<CppType, DecimalV2Value>) {
@@ -397,9 +443,64 @@ private:
return predicates;
}
+    // True when `byte` is in the 7-bit ASCII range (high bit clear).
+    static inline bool _is_ascii(uint8_t byte) {
+        return byte < 128;
+    }
+
+    // Length of the longest byte-wise common prefix of the two strings.
+    static int _common_prefix(const std::string& encoding_min, const std::string& encoding_max) {
+        const int limit = std::min(encoding_min.size(), encoding_max.size());
+        int matched = 0;
+        for (; matched < limit; ++matched) {
+            if (encoding_min[matched] != encoding_max[matched]) {
+                break;
+            }
+        }
+        return matched;
+    }
+
+    // Rewrites legacy (pre-PARQUET-1025) string min/max statistics in place into
+    // bounds built only from the ASCII part of the values.
+    //
+    // `encoding_min` is truncated to the common prefix plus the following run of
+    // ASCII bytes, which can only lower it. `encoding_max` is truncated to the
+    // trusted prefix and its last byte is incremented, which can only raise it.
+    // Returns false, leaving both strings untouched, when no usable upper bound
+    // can be derived.
+    static bool _try_read_old_utf8_stats(std::string& encoding_min, std::string& encoding_max) {
+        if (encoding_min == encoding_max) {
+            // If min = max, then there is a single value only; both bounds are usable as-is.
+            return true;
+        }
+
+        const auto common_prefix_length =
+                static_cast<size_t>(_common_prefix(encoding_min, encoding_max));
+
+        // For min we can retain all-ASCII, because this produces a strictly lower value.
+        size_t min_good_length = common_prefix_length;
+        while (min_good_length < encoding_min.size() &&
+               _is_ascii(static_cast<uint8_t>(encoding_min[min_good_length]))) {
+            ++min_good_length;
+        }
+
+        // For max we can be sure only of the part matching the min. When they differ,
+        // we can consider only one next, and only if both are ASCII
+        size_t max_good_length = common_prefix_length;
+        if (max_good_length < encoding_max.size() && max_good_length < encoding_min.size() &&
+            _is_ascii(static_cast<uint8_t>(encoding_min[max_good_length])) &&
+            _is_ascii(static_cast<uint8_t>(encoding_max[max_good_length]))) {
+            ++max_good_length;
+        }
+        // Incrementing 127 would overflow. Incrementing within non-ASCII can have side-effects.
+        while (max_good_length > 0) {
+            const auto last_byte = static_cast<uint8_t>(encoding_max[max_good_length - 1]);
+            if (last_byte != 127 && _is_ascii(last_byte)) {
+                break;
+            }
+            --max_good_length;
+        }
+        if (max_good_length == 0) {
+            // We can return just min bound, but code downstream likely expects both are
+            // present or both are absent.
+            return false;
+        }
+
+        encoding_min.resize(min_good_length);
+        encoding_max.resize(max_good_length);
+        // max_good_length is non-zero here and the last kept byte is ASCII below 127,
+        // so the increment cannot overflow.
+        ++encoding_max[max_good_length - 1];
+        return true;
+    }
+
public:
static bool filter_by_stats(const ColumnValueRangeType& col_val_range,
- const FieldSchema* col_schema, bool
is_set_min_max,
+ const FieldSchema* col_schema, bool
ignore_min_max_stats,
const std::string& encoded_min, const
std::string& encoded_max,
bool is_all_null, const cctz::time_zone& ctz,
bool use_min_max_value = false) {
@@ -416,11 +517,14 @@ public:
return;
}
}
- for (auto& filter : filters) {
- need_filter |= _filter_by_min_max(range, filter,
col_schema, encoded_min,
- encoded_max, ctz,
use_min_max_value);
- if (need_filter) {
- break;
+ if (!ignore_min_max_stats) {
+ for (auto& filter : filters) {
+ need_filter |=
+ _filter_by_min_max(range, filter,
col_schema, encoded_min,
+ encoded_max, ctz,
use_min_max_value);
+ if (need_filter) {
+ break;
+ }
}
}
},
diff --git a/be/src/vec/exec/format/parquet/vparquet_page_index.cpp
b/be/src/vec/exec/format/parquet/vparquet_page_index.cpp
index 35cf076318e..53fb1579c8e 100644
--- a/be/src/vec/exec/format/parquet/vparquet_page_index.cpp
+++ b/be/src/vec/exec/format/parquet/vparquet_page_index.cpp
@@ -68,7 +68,7 @@ Status
PageIndex::collect_skipped_page_range(tparquet::ColumnIndex* column_index
const int num_of_pages = column_index->null_pages.size();
for (int page_id = 0; page_id < num_of_pages; page_id++) {
bool is_all_null = column_index->null_pages[page_id];
- if (ParquetPredicate::filter_by_stats(col_val_range, col_schema,
!is_all_null,
+ if (ParquetPredicate::filter_by_stats(col_val_range, col_schema, false,
encoded_min_vals[page_id],
encoded_max_vals[page_id],
is_all_null, ctz)) {
skipped_ranges.emplace_back(page_id);
@@ -125,4 +125,4 @@ Status PageIndex::parse_offset_index(const
tparquet::ColumnChunk& chunk, const u
return Status::OK();
}
-} // namespace doris::vectorized
\ No newline at end of file
+} // namespace doris::vectorized
diff --git a/be/src/vec/exec/format/parquet/vparquet_reader.cpp
b/be/src/vec/exec/format/parquet/vparquet_reader.cpp
index 57396c349dd..84c572a3a2f 100644
--- a/be/src/vec/exec/format/parquet/vparquet_reader.cpp
+++ b/be/src/vec/exec/format/parquet/vparquet_reader.cpp
@@ -938,15 +938,53 @@ Status ParquetReader::_process_column_stat_filter(const
std::vector<tparquet::Co
continue;
}
const FieldSchema* col_schema = schema_desc.get_column(col_name);
+ bool ignore_min_max_stats = false;
// Min-max of statistic is plain-encoded value
- if (statistic.__isset.min_value) {
+ if (statistic.__isset.min_value && statistic.__isset.max_value) {
+ ColumnOrderName column_order =
+ col_schema->physical_type == tparquet::Type::INT96 ||
+
col_schema->parquet_schema.logicalType.__isset.UNKNOWN
+ ? ColumnOrderName::UNDEFINED
+ : ColumnOrderName::TYPE_DEFINED_ORDER;
+ if ((statistic.min_value != statistic.max_value) &&
+ (column_order != ColumnOrderName::TYPE_DEFINED_ORDER)) {
+ ignore_min_max_stats = true;
+ }
*filter_group = ParquetPredicate::filter_by_stats(
- slot_iter->second, col_schema, is_set_min_max,
statistic.min_value,
+ slot_iter->second, col_schema, ignore_min_max_stats,
statistic.min_value,
statistic.max_value, is_all_null, *_ctz, true);
} else {
+ if (statistic.__isset.min && statistic.__isset.max) {
+ bool max_equals_min = statistic.min == statistic.max;
+
+ SortOrder sort_order =
_determine_sort_order(col_schema->parquet_schema);
+ bool sort_orders_match = SortOrder::SIGNED == sort_order;
+ if (!sort_orders_match && !max_equals_min) {
+ ignore_min_max_stats = true;
+ }
+ bool should_ignore_corrupted_stats = false;
+ if (_ignored_stats.count(col_schema->physical_type) == 0) {
+ if
(CorruptStatistics::should_ignore_statistics(_t_metadata->created_by,
+
col_schema->physical_type)) {
+ _ignored_stats[col_schema->physical_type] = true;
+ should_ignore_corrupted_stats = true;
+ } else {
+ _ignored_stats[col_schema->physical_type] = false;
+ }
+ } else if (_ignored_stats[col_schema->physical_type]) {
+ should_ignore_corrupted_stats = true;
+ }
+ if (should_ignore_corrupted_stats) {
+ ignore_min_max_stats = true;
+ } else if (!sort_orders_match && !max_equals_min) {
+ ignore_min_max_stats = true;
+ }
+ } else {
+ ignore_min_max_stats = true;
+ }
*filter_group = ParquetPredicate::filter_by_stats(
- slot_iter->second, col_schema, is_set_min_max,
statistic.min, statistic.max,
- is_all_null, *_ctz, false);
+ slot_iter->second, col_schema, ignore_min_max_stats,
statistic.min,
+ statistic.max, is_all_null, *_ctz, false);
}
if (*filter_group) {
break;
@@ -1021,4 +1059,61 @@ void ParquetReader::_collect_profile_before_close() {
_collect_profile();
}
+// Maps a parquet schema element to the sort order its min/max statistics are
+// assumed to use. The logical type decides when one is set; otherwise the
+// physical type does.
+SortOrder ParquetReader::_determine_sort_order(const tparquet::SchemaElement& parquet_schema) {
+    const tparquet::Type::type physical_type = parquet_schema.type;
+    const tparquet::LogicalType& logical_type = parquet_schema.logicalType;
+    const auto& is_set = logical_type.__isset;
+
+    // Assume string type is SortOrder::SIGNED, use
+    // ParquetPredicate::_try_read_old_utf8_stats() to handle it.
+    const bool is_binary_physical = physical_type == tparquet::Type::BYTE_ARRAY ||
+                                    physical_type == tparquet::Type::FIXED_LEN_BYTE_ARRAY;
+    if (is_set.STRING && is_binary_physical) {
+        return SortOrder::SIGNED;
+    }
+
+    // The checks below keep the precedence of the logical types.
+    if (is_set.INTEGER) {
+        return logical_type.INTEGER.isSigned ? SortOrder::SIGNED : SortOrder::UNSIGNED;
+    }
+    if (is_set.DATE) {
+        return SortOrder::SIGNED;
+    }
+    if (is_set.ENUM || is_set.BSON || is_set.JSON || is_set.STRING) {
+        return SortOrder::UNSIGNED;
+    }
+    if (is_set.DECIMAL || is_set.MAP || is_set.LIST) {
+        return SortOrder::UNKNOWN;
+    }
+    if (is_set.TIME || is_set.TIMESTAMP) {
+        return SortOrder::SIGNED;
+    }
+    if (is_set.UNKNOWN) {
+        return SortOrder::UNKNOWN;
+    }
+
+    // No logical type set: fall back to the physical type.
+    switch (physical_type) {
+    case tparquet::Type::BOOLEAN:
+    case tparquet::Type::INT32:
+    case tparquet::Type::INT64:
+    case tparquet::Type::FLOAT:
+    case tparquet::Type::DOUBLE:
+        return SortOrder::SIGNED;
+    case tparquet::Type::BYTE_ARRAY:
+    case tparquet::Type::FIXED_LEN_BYTE_ARRAY:
+        return SortOrder::UNSIGNED;
+    case tparquet::Type::INT96:
+    default:
+        return SortOrder::UNKNOWN;
+    }
+}
+
} // namespace doris::vectorized
diff --git a/be/src/vec/exec/format/parquet/vparquet_reader.h
b/be/src/vec/exec/format/parquet/vparquet_reader.h
index 3cc262e14e6..9691e596b78 100644
--- a/be/src/vec/exec/format/parquet/vparquet_reader.h
+++ b/be/src/vec/exec/format/parquet/vparquet_reader.h
@@ -220,6 +220,8 @@ private:
const RowGroupReader::RowGroupIndex& group, size_t* avg_io_size);
void _collect_profile();
+ static SortOrder _determine_sort_order(const tparquet::SchemaElement&
parquet_schema);
+
private:
RuntimeProfile* _profile = nullptr;
const TFileScanRangeParams& _scan_params;
@@ -284,5 +286,6 @@ private:
const VExprContextSPtrs* _not_single_slot_filter_conjuncts = nullptr;
const std::unordered_map<int, VExprContextSPtrs>*
_slot_id_to_filter_conjuncts = nullptr;
bool _hive_use_column_names = false;
+ std::unordered_map<tparquet::Type::type, bool> _ignored_stats;
};
} // namespace doris::vectorized
diff --git a/be/test/vec/exec/parquet/parquet_corrupt_statistics_test.cpp
b/be/test/vec/exec/parquet/parquet_corrupt_statistics_test.cpp
new file mode 100644
index 00000000000..bad95614f00
--- /dev/null
+++ b/be/test/vec/exec/parquet/parquet_corrupt_statistics_test.cpp
@@ -0,0 +1,134 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
+
+#include <regex>
+
+#include "vec/exec/format/parquet/parquet_common.h"
+
+namespace doris {
+namespace vectorized {
+class ParquetCorruptStatisticsTest : public testing::Test { // gtest fixture for the CorruptStatistics::should_ignore_statistics cases below; it declares no members.
+public:
+ ParquetCorruptStatisticsTest() = default;
+};
+
+TEST_F(ParquetCorruptStatisticsTest, test_only_applies_to_binary) { // Same parquet-mr 1.6.0 writer string for all three calls: expects statistics to be ignored for BYTE_ARRAY and FIXED_LEN_BYTE_ARRAY but not for DOUBLE.
+ EXPECT_TRUE(CorruptStatistics::should_ignore_statistics("parquet-mr
version 1.6.0 (build abcd)",
+
tparquet::Type::BYTE_ARRAY));
+ EXPECT_TRUE(CorruptStatistics::should_ignore_statistics("parquet-mr
version 1.6.0 (build abcd)",
+
tparquet::Type::FIXED_LEN_BYTE_ARRAY));
+ EXPECT_FALSE(CorruptStatistics::should_ignore_statistics(
+ "parquet-mr version 1.6.0 (build abcd)", tparquet::Type::DOUBLE));
+}
+
+TEST_F(ParquetCorruptStatisticsTest, test_corrupt_statistics) {
+ EXPECT_TRUE(CorruptStatistics::should_ignore_statistics("parquet-mr
version 1.6.0 (build abcd)",
+
tparquet::Type::BYTE_ARRAY));
+ EXPECT_TRUE(CorruptStatistics::should_ignore_statistics("parquet-mr
version 1.4.2 (build abcd)",
+
tparquet::Type::BYTE_ARRAY));
+ EXPECT_TRUE(CorruptStatistics::should_ignore_statistics(
+ "parquet-mr version 1.6.100 (build abcd)",
tparquet::Type::BYTE_ARRAY));
+ EXPECT_TRUE(CorruptStatistics::should_ignore_statistics(
+ "parquet-mr version 1.7.999 (build abcd)",
tparquet::Type::BYTE_ARRAY));
+ EXPECT_TRUE(CorruptStatistics::should_ignore_statistics(
+ "parquet-mr version 1.6.22rc99 (build abcd)",
tparquet::Type::BYTE_ARRAY));
+ EXPECT_TRUE(CorruptStatistics::should_ignore_statistics(
+ "parquet-mr version 1.6.22rc99-SNAPSHOT (build abcd)",
tparquet::Type::BYTE_ARRAY));
+ EXPECT_TRUE(CorruptStatistics::should_ignore_statistics(
+ "parquet-mr version 1.6.1-SNAPSHOT (build abcd)",
tparquet::Type::BYTE_ARRAY));
+ EXPECT_TRUE(CorruptStatistics::should_ignore_statistics(
+ "parquet-mr version 1.6.0t-01-abcdefg (build abcd)",
tparquet::Type::BYTE_ARRAY));
+ EXPECT_TRUE(CorruptStatistics::should_ignore_statistics("unparseable
string",
+
tparquet::Type::BYTE_ARRAY));
+
+ // missing semver
+ EXPECT_TRUE(CorruptStatistics::should_ignore_statistics("parquet-mr
version (build abcd)",
+
tparquet::Type::BYTE_ARRAY));
+ EXPECT_TRUE(CorruptStatistics::should_ignore_statistics("parquet-mr
version (build abcd)",
+
tparquet::Type::BYTE_ARRAY));
+
+ // missing build hash
+ EXPECT_TRUE(CorruptStatistics::should_ignore_statistics("parquet-mr
version 1.6.0 (build )",
+
tparquet::Type::BYTE_ARRAY));
+ EXPECT_TRUE(CorruptStatistics::should_ignore_statistics("parquet-mr
version 1.6.0 (build)",
+
tparquet::Type::BYTE_ARRAY));
+ EXPECT_TRUE(CorruptStatistics::should_ignore_statistics("parquet-mr
version (build)",
+
tparquet::Type::BYTE_ARRAY));
+
+ EXPECT_FALSE(CorruptStatistics::should_ignore_statistics("imapla version
1.6.0 (build abcd)",
+
tparquet::Type::BYTE_ARRAY));
+ EXPECT_FALSE(CorruptStatistics::should_ignore_statistics("imapla version
1.10.0 (build abcd)",
+
tparquet::Type::BYTE_ARRAY));
+ EXPECT_FALSE(CorruptStatistics::should_ignore_statistics(
+ "parquet-mr version 1.8.0 (build abcd)",
tparquet::Type::BYTE_ARRAY));
+ EXPECT_FALSE(CorruptStatistics::should_ignore_statistics(
+ "parquet-mr version 1.8.1 (build abcd)",
tparquet::Type::BYTE_ARRAY));
+ EXPECT_FALSE(CorruptStatistics::should_ignore_statistics(
+ "parquet-mr version 1.8.1rc3 (build abcd)",
tparquet::Type::BYTE_ARRAY));
+ EXPECT_FALSE(CorruptStatistics::should_ignore_statistics(
+ "parquet-mr version 1.8.1rc3-SNAPSHOT (build abcd)",
tparquet::Type::BYTE_ARRAY));
+ EXPECT_FALSE(CorruptStatistics::should_ignore_statistics(
+ "parquet-mr version 1.9.0 (build abcd)",
tparquet::Type::BYTE_ARRAY));
+ EXPECT_FALSE(CorruptStatistics::should_ignore_statistics(
+ "parquet-mr version 2.0.0 (build abcd)",
tparquet::Type::BYTE_ARRAY));
+ EXPECT_FALSE(CorruptStatistics::should_ignore_statistics(
+ "parquet-mr version 1.9.0t-01-abcdefg (build abcd)",
tparquet::Type::BYTE_ARRAY));
+
+ // missing semver
+ EXPECT_FALSE(CorruptStatistics::should_ignore_statistics("impala version
(build abcd)",
+
tparquet::Type::BYTE_ARRAY));
+ EXPECT_FALSE(CorruptStatistics::should_ignore_statistics("impala version
(build abcd)",
+
tparquet::Type::BYTE_ARRAY));
+
+ // missing build hash
+ EXPECT_FALSE(CorruptStatistics::should_ignore_statistics("impala version
1.6.0 (build )",
+
tparquet::Type::BYTE_ARRAY));
+ EXPECT_FALSE(CorruptStatistics::should_ignore_statistics("impala version
1.6.0 (build)",
+
tparquet::Type::BYTE_ARRAY));
+ EXPECT_FALSE(CorruptStatistics::should_ignore_statistics("impala version
(build)",
+
tparquet::Type::BYTE_ARRAY));
+}
+
+TEST_F(ParquetCorruptStatisticsTest, test_distribution_corrupt_statistics) { // CDH-suffixed parquet-mr builds: expected to be ignored at cdh5.4.999 and kept from cdh5.5.0 on; plain 1.4.10 through 1.7.0 are expected to be ignored.
+ EXPECT_TRUE(CorruptStatistics::should_ignore_statistics(
+ "parquet-mr version 1.5.0-cdh5.4.999 (build abcd)",
tparquet::Type::BYTE_ARRAY));
+ EXPECT_FALSE(CorruptStatistics::should_ignore_statistics(
+ "parquet-mr version 1.5.0-cdh5.5.0-SNAPSHOT (build "
+ "956ed6c14c611b4c4eaaa1d6e5b9a9c6d4dfa336)",
+ tparquet::Type::BYTE_ARRAY)); // the cdh5.5.0 SNAPSHOT build is expected to be kept as well
+ EXPECT_FALSE(CorruptStatistics::should_ignore_statistics(
+ "parquet-mr version 1.5.0-cdh5.5.0 (build abcd)",
tparquet::Type::BYTE_ARRAY));
+ EXPECT_FALSE(CorruptStatistics::should_ignore_statistics(
+ "parquet-mr version 1.5.0-cdh5.5.1 (build abcd)",
tparquet::Type::BYTE_ARRAY));
+ EXPECT_FALSE(CorruptStatistics::should_ignore_statistics(
+ "parquet-mr version 1.5.0-cdh5.6.0 (build abcd)",
tparquet::Type::BYTE_ARRAY));
+ EXPECT_TRUE(CorruptStatistics::should_ignore_statistics(
+ "parquet-mr version 1.4.10 (build abcd)",
tparquet::Type::BYTE_ARRAY));
+ EXPECT_TRUE(CorruptStatistics::should_ignore_statistics("parquet-mr
version 1.5.0 (build abcd)",
+
tparquet::Type::BYTE_ARRAY));
+ EXPECT_TRUE(CorruptStatistics::should_ignore_statistics("parquet-mr
version 1.5.1 (build abcd)",
+
tparquet::Type::BYTE_ARRAY));
+ EXPECT_TRUE(CorruptStatistics::should_ignore_statistics("parquet-mr
version 1.6.0 (build abcd)",
+
tparquet::Type::BYTE_ARRAY));
+ EXPECT_TRUE(CorruptStatistics::should_ignore_statistics("parquet-mr
version 1.7.0 (build abcd)",
+
tparquet::Type::BYTE_ARRAY));
+}
+
+} // namespace vectorized
+} // namespace doris
diff --git a/be/test/vec/exec/parquet/parquet_statistics_test.cpp
b/be/test/vec/exec/parquet/parquet_statistics_test.cpp
new file mode 100644
index 00000000000..cd8d3068fe1
--- /dev/null
+++ b/be/test/vec/exec/parquet/parquet_statistics_test.cpp
@@ -0,0 +1,155 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
+
+#include <regex>
+
+#include "vec/exec/format/parquet/parquet_pred_cmp.h"
+
+namespace doris {
+namespace vectorized {
+class ParquetStatisticsTest : public testing::Test { // gtest fixture for the ParquetPredicate::_try_read_old_utf8_stats cases below; it declares no members.
+public:
+ ParquetStatisticsTest() = default;
+};
+
+// Exercises ParquetPredicate::_try_read_old_utf8_stats on a series of (min, max) string pairs.
+// Each case comment has the form "[min, max]: description". Where the call is expected to return
+// true, the test also checks the values left in encoding_min / encoding_max afterwards; where it
+// is expected to return false, the strings are not inspected.
+TEST_F(ParquetStatisticsTest, test_try_read_old_utf8_stats) {
+    // [, bcé]: min is empty, max starts with ASCII
+    {
+        std::string encoding_min("");
+        std::string encoding_max("bcé");
+        EXPECT_FALSE(ParquetPredicate::_try_read_old_utf8_stats(encoding_min, encoding_max));
+    }
+
+    // [, ébc]: min is empty, max starts with non-ASCII
+    {
+        std::string encoding_min("");
+        std::string encoding_max("ébc");
+        EXPECT_FALSE(ParquetPredicate::_try_read_old_utf8_stats(encoding_min, encoding_max));
+    }
+
+    // [aa, bé]: no common prefix, first different are both ASCII, min is all ASCII
+    {
+        std::string encoding_min("aa");
+        std::string encoding_max("bé");
+        EXPECT_TRUE(ParquetPredicate::_try_read_old_utf8_stats(encoding_min, encoding_max));
+        EXPECT_EQ(encoding_min, "aa");
+        EXPECT_EQ(encoding_max, "c");
+    }
+
+    // [abcd, abcdN]: common prefix, not only ASCII, one prefix of the other, last common ASCII
+    {
+        std::string encoding_min("abcd");
+        std::string encoding_max("abcdN");
+        EXPECT_TRUE(ParquetPredicate::_try_read_old_utf8_stats(encoding_min, encoding_max));
+        EXPECT_EQ(encoding_min, "abcd");
+        EXPECT_EQ(encoding_max, "abce");
+    }
+
+    // [abcé, abcéN]: common prefix, not only ASCII, one prefix of the other, last common non ASCII
+    {
+        std::string encoding_min("abcé");
+        std::string encoding_max("abcéN");
+        EXPECT_TRUE(ParquetPredicate::_try_read_old_utf8_stats(encoding_min, encoding_max));
+        EXPECT_EQ(encoding_min, "abcé");
+        EXPECT_EQ(encoding_max, "abd");
+    }
+
+    // [abcéM, abcéN]: common prefix, not only ASCII, first different are both ASCII
+    {
+        std::string encoding_min("abcéM");
+        std::string encoding_max("abcéN");
+        EXPECT_TRUE(ParquetPredicate::_try_read_old_utf8_stats(encoding_min, encoding_max));
+        EXPECT_EQ(encoding_min, "abcéM");
+        EXPECT_EQ(encoding_max, "abcéO");
+    }
+
+    // [abcéMab, abcéNxy]: common prefix, not only ASCII, first different are both ASCII,
+    // more characters afterwards
+    {
+        std::string encoding_min("abcéMab");
+        std::string encoding_max("abcéNxy");
+        EXPECT_TRUE(ParquetPredicate::_try_read_old_utf8_stats(encoding_min, encoding_max));
+        EXPECT_EQ(encoding_min, "abcéMab");
+        EXPECT_EQ(encoding_max, "abcéO");
+    }
+
+    // [abcéM, abcé\u00f7]: common prefix, not only ASCII, first different are both ASCII,
+    // but need to be chopped off (127)
+    {
+        std::string encoding_min("abcéM");
+        std::string encoding_max("abcé\u00f7");
+        EXPECT_TRUE(ParquetPredicate::_try_read_old_utf8_stats(encoding_min, encoding_max));
+        EXPECT_EQ(encoding_min, "abcéM");
+        EXPECT_EQ(encoding_max, "abd");
+    }
+
+    // [abc\u007fé, bcd\u007fé]: no common prefix, first different are both ASCII
+    {
+        std::string encoding_min("abc\u007fé");
+        std::string encoding_max("bcd\u007fé");
+        EXPECT_TRUE(ParquetPredicate::_try_read_old_utf8_stats(encoding_min, encoding_max));
+        EXPECT_EQ(encoding_min, "abc\u007f");
+        EXPECT_EQ(encoding_max, "c");
+    }
+
+    // [é, a]: no common prefix, first different are not both ASCII
+    {
+        std::string encoding_min("é");
+        std::string encoding_max("a");
+        EXPECT_FALSE(ParquetPredicate::_try_read_old_utf8_stats(encoding_min, encoding_max));
+    }
+
+    // [é, ê]: no common prefix, first different are both not ASCII
+    {
+        std::string encoding_min("é");
+        std::string encoding_max("ê");
+        EXPECT_FALSE(ParquetPredicate::_try_read_old_utf8_stats(encoding_min, encoding_max));
+    }
+
+    // [aé, aé]: min = max (common prefix, first different are both not ASCII)
+    {
+        std::string encoding_min("aé");
+        std::string encoding_max("aé");
+        EXPECT_TRUE(ParquetPredicate::_try_read_old_utf8_stats(encoding_min, encoding_max));
+        EXPECT_EQ(encoding_min, "aé");
+        EXPECT_EQ(encoding_max, "aé");
+    }
+
+    // [aé, bé]: no common prefix, first different are both ASCII
+    {
+        std::string encoding_min("aé");
+        std::string encoding_max("bé");
+        EXPECT_TRUE(ParquetPredicate::_try_read_old_utf8_stats(encoding_min, encoding_max));
+        EXPECT_EQ(encoding_min, "a");
+        EXPECT_EQ(encoding_max, "c");
+    }
+}
+
+} // namespace vectorized
+} // namespace doris
diff --git a/be/test/vec/exec/parquet/parquet_version_test.cpp
b/be/test/vec/exec/parquet/parquet_version_test.cpp
new file mode 100644
index 00000000000..10d17e27790
--- /dev/null
+++ b/be/test/vec/exec/parquet/parquet_version_test.cpp
@@ -0,0 +1,221 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
+
+#include <regex>
+
+#include "vec/exec/format/parquet/parquet_common.h"
+
+namespace doris {
+namespace vectorized {
+class ParquetVersionTest : public testing::Test { // gtest fixture for the VersionParser / SemanticVersion tests below; it declares no members.
+public:
+ ParquetVersionTest() = default;
+};
+
+TEST_F(ParquetVersionTest, test_version_parser) { // Feeds writer version strings to VersionParser::parse and compares the result with ParsedVersion(application, version, build hash); absent parts are expected as std::nullopt.
+ std::unique_ptr<ParsedVersion> parsed_version;
+
+ Status status = VersionParser::parse("parquet-mr version 1.6.0 (build
abcd)", &parsed_version);
+ EXPECT_TRUE(status.ok());
+ EXPECT_EQ(ParsedVersion("parquet-mr", "1.6.0", "abcd"), *parsed_version);
+
+ status = VersionParser::parse("parquet-mr version 1.6.22rc99-SNAPSHOT
(build abcd)",
+ &parsed_version);
+ EXPECT_TRUE(status.ok());
+ EXPECT_EQ(ParsedVersion("parquet-mr", "1.6.22rc99-SNAPSHOT", "abcd"),
*parsed_version);
+
+ status = VersionParser::parse("unparseable string", &parsed_version);
+ EXPECT_FALSE(status.ok()); // the only input in this test that is expected to be rejected
+
+ // missing semver: the version is expected to come back as std::nullopt
+ status = VersionParser::parse("parquet-mr version (build abcd)",
&parsed_version);
+ EXPECT_TRUE(status.ok());
+ EXPECT_EQ(ParsedVersion("parquet-mr", std::nullopt, "abcd"),
*parsed_version);
+
+ status = VersionParser::parse("parquet-mr version (build abcd)",
&parsed_version);
+ EXPECT_TRUE(status.ok());
+ EXPECT_EQ(ParsedVersion("parquet-mr", std::nullopt, "abcd"),
*parsed_version);
+
+ // missing build hash: the build hash is expected to come back as std::nullopt
+ status = VersionParser::parse("parquet-mr version 1.6.0 (build )",
&parsed_version);
+ EXPECT_TRUE(status.ok());
+ EXPECT_EQ(ParsedVersion("parquet-mr", "1.6.0", std::nullopt),
*parsed_version);
+
+ status = VersionParser::parse("parquet-mr version 1.6.0 (build)",
&parsed_version);
+ EXPECT_TRUE(status.ok());
+ EXPECT_EQ(ParsedVersion("parquet-mr", "1.6.0", std::nullopt),
*parsed_version);
+
+ status = VersionParser::parse("parquet-mr version (build)",
&parsed_version);
+ EXPECT_TRUE(status.ok());
+ EXPECT_EQ(ParsedVersion("parquet-mr", std::nullopt, std::nullopt),
*parsed_version);
+
+ status = VersionParser::parse("parquet-mr version (build )",
&parsed_version);
+ EXPECT_TRUE(status.ok());
+ EXPECT_EQ(ParsedVersion("parquet-mr", std::nullopt, std::nullopt),
*parsed_version);
+
+ // Missing entire build section: still expected to parse, with no build hash
+ status = VersionParser::parse("parquet-mr version 1.6.0", &parsed_version);
+ EXPECT_TRUE(status.ok());
+ EXPECT_EQ(ParsedVersion("parquet-mr", "1.6.0", std::nullopt),
*parsed_version);
+
+ status = VersionParser::parse("parquet-mr version 1.8.0rc4",
&parsed_version);
+ EXPECT_TRUE(status.ok());
+ EXPECT_EQ(ParsedVersion("parquet-mr", "1.8.0rc4", std::nullopt),
*parsed_version);
+
+ status = VersionParser::parse("parquet-mr version 1.8.0rc4-SNAPSHOT",
&parsed_version);
+ EXPECT_TRUE(status.ok());
+ EXPECT_EQ(ParsedVersion("parquet-mr", "1.8.0rc4-SNAPSHOT", std::nullopt),
*parsed_version);
+
+ status = VersionParser::parse("parquet-mr version", &parsed_version);
+ EXPECT_TRUE(status.ok());
+ EXPECT_EQ(ParsedVersion("parquet-mr", std::nullopt, std::nullopt),
*parsed_version);
+
+ // Various spaces. NOTE(review): several inputs below read the same as earlier cases; presumably the whitespace variations were lost in transit -- confirm against the original file.
+ status = VersionParser::parse("parquet-mr version 1.6.0",
&parsed_version);
+ EXPECT_TRUE(status.ok());
+ EXPECT_EQ(ParsedVersion("parquet-mr", "1.6.0", std::nullopt),
*parsed_version);
+
+ status = VersionParser::parse("parquet-mr version 1.8.0rc4",
&parsed_version);
+ EXPECT_TRUE(status.ok());
+ EXPECT_EQ(ParsedVersion("parquet-mr", "1.8.0rc4", std::nullopt),
*parsed_version);
+
+ status =
+ VersionParser::parse("parquet-mr version 1.8.0rc4-SNAPSHOT
", &parsed_version);
+ EXPECT_TRUE(status.ok());
+ EXPECT_EQ(ParsedVersion("parquet-mr", "1.8.0rc4-SNAPSHOT", std::nullopt),
*parsed_version);
+
+ status = VersionParser::parse("parquet-mr version", &parsed_version);
+ EXPECT_TRUE(status.ok());
+ EXPECT_EQ(ParsedVersion("parquet-mr", std::nullopt, std::nullopt),
*parsed_version);
+
+ status = VersionParser::parse("parquet-mr version 1.6.0 ( build )",
&parsed_version);
+ EXPECT_TRUE(status.ok());
+ EXPECT_EQ(ParsedVersion("parquet-mr", "1.6.0", std::nullopt),
*parsed_version);
+
+ status = VersionParser::parse("parquet-mr version 1.6.0 ( build)",
&parsed_version);
+ EXPECT_TRUE(status.ok());
+ EXPECT_EQ(ParsedVersion("parquet-mr", "1.6.0", std::nullopt),
*parsed_version);
+
+ status = VersionParser::parse("parquet-mr version ( build)",
&parsed_version);
+ EXPECT_TRUE(status.ok());
+ EXPECT_EQ(ParsedVersion("parquet-mr", std::nullopt, std::nullopt),
*parsed_version);
+
+ status = VersionParser::parse("parquet-mr version (build )",
&parsed_version);
+ EXPECT_TRUE(status.ok());
+ EXPECT_EQ(ParsedVersion("parquet-mr", std::nullopt, std::nullopt),
*parsed_version);
+}
+
+// Checks that version string `a` orders strictly before `b`, exercising compare_to in both
+// directions. A failed parse is fatal for this helper (ASSERT_*), so the version pointers are
+// only dereferenced after SemanticVersion::parse has reported success and filled them in.
+void assertLessThan(const std::string& a, const std::string& b) {
+    std::unique_ptr<SemanticVersion> version_a;
+    Status status = SemanticVersion::parse(a, &version_a);
+    ASSERT_TRUE(status.ok()) << "failed to parse " << a;
+    ASSERT_TRUE(version_a != nullptr) << "no version produced for " << a;
+    std::unique_ptr<SemanticVersion> version_b;
+    status = SemanticVersion::parse(b, &version_b);
+    ASSERT_TRUE(status.ok()) << "failed to parse " << b;
+    ASSERT_TRUE(version_b != nullptr) << "no version produced for " << b;
+    EXPECT_LT(version_a->compare_to(*version_b), 0) << a << " should be < " << b;
+    EXPECT_GT(version_b->compare_to(*version_a), 0) << b << " should be > " << a;
+}
+
+// Checks that version strings `a` and `b` compare as equal via compare_to. A failed parse is
+// fatal for this helper (ASSERT_*), so the version pointers are only dereferenced after
+// SemanticVersion::parse has reported success and filled them in.
+void assertEqualTo(const std::string& a, const std::string& b) {
+    std::unique_ptr<SemanticVersion> version_a;
+    Status status = SemanticVersion::parse(a, &version_a);
+    ASSERT_TRUE(status.ok()) << "failed to parse " << a;
+    ASSERT_TRUE(version_a != nullptr) << "no version produced for " << a;
+    std::unique_ptr<SemanticVersion> version_b;
+    status = SemanticVersion::parse(b, &version_b);
+    ASSERT_TRUE(status.ok()) << "failed to parse " << b;
+    ASSERT_TRUE(version_b != nullptr) << "no version produced for " << b;
+    EXPECT_EQ(version_a->compare_to(*version_b), 0) << a << " should equal " << b;
+}
+
+// Checks the sign of SemanticVersion::compare_to for directly constructed versions: negative when
+// the left side is lower, zero when equal, positive when higher.
+TEST_F(ParquetVersionTest, test_compare) {
+    // Differences in the third (patch) component.
+    EXPECT_EQ(SemanticVersion(1, 8, 1).compare_to(SemanticVersion(1, 8, 1)), 0);
+    EXPECT_LT(SemanticVersion(1, 8, 0).compare_to(SemanticVersion(1, 8, 1)), 0);
+    EXPECT_GT(SemanticVersion(1, 8, 2).compare_to(SemanticVersion(1, 8, 1)), 0);
+
+    // Differences in the second (minor) component.
+    EXPECT_LT(SemanticVersion(1, 7, 0).compare_to(SemanticVersion(1, 8, 0)), 0);
+    EXPECT_GT(SemanticVersion(1, 9, 0).compare_to(SemanticVersion(1, 8, 0)), 0);
+
+    // Differences in the first (major) component.
+    EXPECT_LT(SemanticVersion(0, 0, 0).compare_to(SemanticVersion(1, 0, 0)), 0);
+    EXPECT_GT(SemanticVersion(2, 0, 0).compare_to(SemanticVersion(1, 0, 0)), 0);
+
+    // A larger third component does not outweigh a smaller second component.
+    EXPECT_LT(SemanticVersion(1, 8, 100).compare_to(SemanticVersion(1, 9, 0)), 0);
+
+    // A version built with the fourth argument `true` is expected to order before the same
+    // numeric version built without it, and to equal another version built the same way.
+    EXPECT_GT(SemanticVersion(1, 8, 0).compare_to(SemanticVersion(1, 8, 0, true)), 0);
+    EXPECT_EQ(SemanticVersion(1, 8, 0, true).compare_to(SemanticVersion(1, 8, 0, true)), 0);
+    EXPECT_LT(SemanticVersion(1, 8, 0, true).compare_to(SemanticVersion(1, 8, 0)), 0);
+}
+
+// Walks a list of version strings given in ascending order: every entry must compare below its
+// successor and equal to itself.
+TEST_F(ParquetVersionTest, test_semver_prerelease_examples) {
+    const std::vector<std::string> ordered_versions = {
+            "1.0.0-alpha", "1.0.0-alpha.1", "1.0.0-alpha.beta", "1.0.0-beta",
+            "1.0.0-beta.2", "1.0.0-beta.11", "1.0.0-rc.1", "1.0.0"};
+    for (size_t next = 1; next < ordered_versions.size(); ++next) {
+        const std::string& lower = ordered_versions[next - 1];
+        assertLessThan(lower, ordered_versions[next]);
+        assertEqualTo(lower, lower);
+    }
+    // The loop never self-compares the final entry, so do it here.
+    const std::string& highest = ordered_versions.back();
+    assertEqualTo(highest, highest);
+}
+
+// Each pair below is expected to compare as equal: the second entry either repeats the first or
+// adds a "+..." build-info suffix to it.
+TEST_F(ParquetVersionTest, test_semver_build_info_examples) {
+    static const char* const equal_pairs[][2] = {
+            {"1.0.0-alpha+001", "1.0.0-alpha+001"},
+            {"1.0.0-alpha", "1.0.0-alpha+001"},
+            {"1.0.0+20130313144700", "1.0.0+20130313144700"},
+            {"1.0.0", "1.0.0+20130313144700"},
+            {"1.0.0-beta+exp.sha.5114f85", "1.0.0-beta+exp.sha.5114f85"},
+            {"1.0.0-beta", "1.0.0-beta+exp.sha.5114f85"},
+    };
+    for (const auto& pair : equal_pairs) {
+        assertEqualTo(pair[0], pair[1]);
+    }
+}
+
+TEST_F(ParquetVersionTest, test_unknown_comparisons) { // Expects "1.0.0rc0-alpha+001" to order strictly before "1.0.0-alpha".
+ assertLessThan("1.0.0rc0-alpha+001", "1.0.0-alpha");
+}
+
+TEST_F(ParquetVersionTest, test_distribution_versions) { // Ordering of "-cdh" suffixed versions relative to each other and to plain 1.5.0.
+ assertEqualTo("1.5.0-cdh5.5.0", "1.5.0-cdh5.5.0");
+ assertLessThan("1.5.0-cdh5.5.0", "1.5.0-cdh5.5.1");
+ assertLessThan("1.5.0-cdh5.5.0", "1.5.0-cdh5.5.1-SNAPSHOT");
+ assertLessThan("1.5.0-cdh5.5.0", "1.5.0-cdh5.6.0");
+ assertLessThan("1.5.0-cdh5.5.0", "1.5.0-cdh6.0.0");
+ assertLessThan("1.5.0-cdh5.5.0", "1.5.0"); // the suffixed build is expected to sort below the plain release
+ assertLessThan("1.5.0-cdh5.5.0", "1.5.0-cdh5.5.0-SNAPSHOT");
+}
+
+// Checks that SemanticVersion::parse yields the expected SemanticVersion for a plain version, an
+// "rc"-suffixed version, and versions with a "-..." pre-release part. Each parse must succeed
+// (ASSERT_TRUE) before the result is dereferenced, so a failed parse cannot lead to comparing a
+// stale or unset pointer.
+TEST_F(ParquetVersionTest, test_parse) {
+    std::unique_ptr<SemanticVersion> semantic_version;
+
+    Status status = SemanticVersion::parse("1.8.0", &semantic_version);
+    ASSERT_TRUE(status.ok());
+    EXPECT_EQ(*semantic_version, SemanticVersion(1, 8, 0));
+
+    status = SemanticVersion::parse("1.8.0rc3", &semantic_version);
+    ASSERT_TRUE(status.ok());
+    EXPECT_EQ(*semantic_version, SemanticVersion(1, 8, 0, true));
+
+    status = SemanticVersion::parse("1.8.0rc3-SNAPSHOT", &semantic_version);
+    ASSERT_TRUE(status.ok());
+    EXPECT_EQ(*semantic_version, SemanticVersion(1, 8, 0, "rc3", "SNAPSHOT", std::nullopt));
+
+    status = SemanticVersion::parse("1.8.0-SNAPSHOT", &semantic_version);
+    ASSERT_TRUE(status.ok());
+    EXPECT_EQ(*semantic_version, SemanticVersion(1, 8, 0, std::nullopt, "SNAPSHOT", std::nullopt));
+
+    status = SemanticVersion::parse("1.5.0-cdh5.5.0", &semantic_version);
+    ASSERT_TRUE(status.ok());
+    EXPECT_EQ(*semantic_version, SemanticVersion(1, 5, 0, std::nullopt, "cdh5.5.0", std::nullopt));
+}
+
+} // namespace vectorized
+} // namespace doris
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]