(doris) branch branch-2.1 updated: [Fix](parquet-reader) Fix and optimize parquet min-max filtering. (#39375)

yiguolei Wed, 14 Aug 2024 23:13:08 -0700

This is an automated email from the ASF dual-hosted git repository.

yiguolei pushed a commit to branch branch-2.1
in repository https://gitbox.apache.org/repos/asf/doris.git



The following commit(s) were added to refs/heads/branch-2.1 by this push:
     new a44a2745634 [Fix](parquet-reader) Fix and optimize parquet min-max filtering.  (#39375)
a44a2745634 is described below

commit a44a2745634a089581c99d394f1e333e3ebcb400
Author: Qi Chen <[email protected]>
AuthorDate: Thu Aug 15 14:12:54 2024 +0800

    [Fix](parquet-reader) Fix and optimize parquet min-max filtering.  (#39375)
    
    Backport #38277.
---
 be/src/vec/exec/format/parquet/parquet_common.cpp  | 340 +++++++++++++++++++++
 be/src/vec/exec/format/parquet/parquet_common.h    | 131 +++++++-
 be/src/vec/exec/format/parquet/parquet_pred_cmp.h  | 142 +++++++--
 .../exec/format/parquet/vparquet_page_index.cpp    |   4 +-
 be/src/vec/exec/format/parquet/vparquet_reader.cpp | 103 ++++++-
 be/src/vec/exec/format/parquet/vparquet_reader.h   |   3 +
 .../parquet/parquet_corrupt_statistics_test.cpp    | 134 ++++++++
 .../vec/exec/parquet/parquet_statistics_test.cpp   | 155 ++++++++++
 be/test/vec/exec/parquet/parquet_version_test.cpp  | 221 ++++++++++++++
 9 files changed, 1207 insertions(+), 26 deletions(-)

diff --git a/be/src/vec/exec/format/parquet/parquet_common.cpp b/be/src/vec/exec/format/parquet/parquet_common.cpp
index 33e9f11242b..59e12fcc71a 100644
--- a/be/src/vec/exec/format/parquet/parquet_common.cpp
+++ b/be/src/vec/exec/format/parquet/parquet_common.cpp
@@ -162,4 +162,344 @@ bool ColumnSelectVector::can_filter_all(size_t remaining_num_values) {
 void ColumnSelectVector::skip(size_t num_values) {
     _filter_map_index += num_values;
 }
+
+ParsedVersion::ParsedVersion(std::string application, 
std::optional<std::string> version,
+                             std::optional<std::string> app_build_hash)
+        : _application(std::move(application)),
+          _version(std::move(version)),
+          _app_build_hash(std::move(app_build_hash)) {}
+
+// Equal when application, version and build hash all compare equal.
+bool ParsedVersion::operator==(const ParsedVersion& other) const {
+    if (_application != other._application) {
+        return false;
+    }
+    if (_version != other._version) {
+        return false;
+    }
+    return _app_build_hash == other._app_build_hash;
+}
+
+// Logical negation of operator==.
+bool ParsedVersion::operator!=(const ParsedVersion& other) const {
+    const bool same = (*this == other);
+    return !same;
+}
+
+// XOR-combines the string hashes of the application and of whichever optional parts are
+// present; an absent part contributes nothing.
+size_t ParsedVersion::hash() const {
+    std::hash<std::string> hash_of;
+    size_t combined = hash_of(_application);
+    if (_version.has_value()) {
+        combined ^= hash_of(*_version);
+    }
+    if (_app_build_hash.has_value()) {
+        combined ^= hash_of(*_app_build_hash);
+    }
+    return combined;
+}
+
+// Human-readable dump of all three fields; absent optionals are printed as "null".
+std::string ParsedVersion::to_string() const {
+    std::string text = "ParsedVersion(application=";
+    text += _application;
+    text += ", semver=";
+    text += _version.value_or("null");
+    text += ", app_build_hash=";
+    text += _app_build_hash.value_or("null");
+    text += ")";
+    return text;
+}
+
+// Matches `created_by` against "<application> version <semver> (build <hash>)" and stores the
+// captured pieces in `*parsed_version`.
+//
+// Returns InternalError when the string does not match the pattern or when the application
+// part is empty. An empty version or build-hash capture is stored as std::nullopt.
+Status VersionParser::parse(const std::string& created_by,
+                            std::unique_ptr<ParsedVersion>* parsed_version) {
+    static const std::string FORMAT =
+            "(.*?)\\s+version\\s*(?:([^(]*?)\\s*(?:\\(\\s*build\\s*([^)]*?)\\s*\\))?)?";
+    static const std::regex PATTERN(FORMAT);
+
+    std::smatch groups;
+    const bool matched = std::regex_match(created_by, groups, PATTERN);
+    if (!matched) {
+        return Status::InternalError(fmt::format("Could not parse created_by: {}, using format: {}",
+                                                 created_by, FORMAT));
+    }
+
+    std::string application = groups[1].str();
+    if (application.empty()) {
+        return Status::InternalError("application cannot be null or empty");
+    }
+
+    // Turns an empty capture group into "absent" instead of an empty string.
+    auto optional_group = [&groups](size_t index) {
+        std::string text = groups[index].str();
+        return text.empty() ? std::nullopt : std::optional<std::string>(std::move(text));
+    };
+
+    *parsed_version =
+            std::make_unique<ParsedVersion>(application, optional_group(2), optional_group(3));
+    return Status::OK();
+}
+
+SemanticVersion::SemanticVersion(int major, int minor, int patch)
+        : _major(major),
+          _minor(minor),
+          _patch(patch),
+          _prerelease(false),
+          _unknown(std::nullopt),
+          _pre(std::nullopt),
+          _build_info(std::nullopt) {}
+
+#ifdef BE_TEST
+SemanticVersion::SemanticVersion(int major, int minor, int patch, bool 
has_unknown)
+        : _major(major),
+          _minor(minor),
+          _patch(patch),
+          _prerelease(has_unknown),
+          _unknown(std::nullopt),
+          _pre(std::nullopt),
+          _build_info(std::nullopt) {}
+#endif
+
+// Full constructor. The version counts as a pre-release when `unknown` holds a non-empty
+// string; `pre`, when present, is split into its dot-separated identifiers by Prerelease.
+SemanticVersion::SemanticVersion(int major, int minor, int patch,
+                                 std::optional<std::string> unknown, std::optional<std::string> pre,
+                                 std::optional<std::string> build_info)
+        : _major(major),
+          _minor(minor),
+          _patch(patch),
+          // Evaluated before `unknown` is moved from: _prerelease is declared ahead of _unknown.
+          _prerelease(unknown && !unknown->empty()),
+          _unknown(std::move(unknown)),
+          _build_info(std::move(build_info)) {
+    // _pre starts out empty and is only filled in when a pre-release string was supplied.
+    if (pre.has_value()) {
+        _pre.emplace(std::move(*pre));
+    }
+}
+
+// Parses "<major>.<minor>.<patch>[<unknown>][-<pre-release>][+<build info>]" into
+// `*semantic_version`.
+//
+// Returns InternalError when `version` does not match that shape, or when one of its numeric
+// parts does not fit into an int. The input comes from file metadata, so an over-long digit
+// run must produce an error Status rather than an exception escaping from std::stoi.
+Status SemanticVersion::parse(const std::string& version,
+                              std::unique_ptr<SemanticVersion>* semantic_version) {
+    static const std::regex pattern(R"(^(\d+)\.(\d+)\.(\d+)([^-+]*)?(?:-([^+]*))?(?:\+(.*))?$)");
+    std::smatch match;
+
+    if (!std::regex_match(version, match, pattern)) {
+        return Status::InternalError(version + " does not match format");
+    }
+
+    // Turns an empty capture group into "absent" instead of an empty string.
+    auto optional_group = [&match](size_t index) {
+        std::string text = match[index].str();
+        return text.empty() ? std::nullopt : std::optional<std::string>(std::move(text));
+    };
+
+    try {
+        // The pattern guarantees pure digit strings, so the values cannot be negative and the
+        // only possible std::stoi failure is std::out_of_range.
+        const int major = std::stoi(match[1].str());
+        const int minor = std::stoi(match[2].str());
+        const int patch = std::stoi(match[3].str());
+        // Construction also converts numeric pre-release identifiers, hence inside the try.
+        *semantic_version = std::make_unique<SemanticVersion>(
+                major, minor, patch, optional_group(4), optional_group(5), optional_group(6));
+    } catch (const std::out_of_range&) {
+        return Status::InternalError("{} contains a number that does not fit into an int",
+                                     version);
+    }
+    return Status::OK();
+}
+
+// Three-way comparison: negative, zero or positive when *this orders before, equal to or
+// after `other`.
+//
+// Major, minor and patch are compared first. On a tie, the version whose pre-release flag is
+// set orders first. On a further tie, a version with pre-release identifiers orders before
+// one without; two sets of identifiers are compared with Prerelease::compare_to.
+int SemanticVersion::compare_to(const SemanticVersion& other) const {
+    int result = _compare_integers(_major, other._major);
+    if (result == 0) {
+        result = _compare_integers(_minor, other._minor);
+    }
+    if (result == 0) {
+        result = _compare_integers(_patch, other._patch);
+    }
+    if (result == 0) {
+        // Arguments are swapped on purpose so that a set flag yields a negative result.
+        result = _compare_booleans(other._prerelease, _prerelease);
+    }
+    if (result != 0) {
+        return result;
+    }
+
+    const bool lhs_has_pre = _pre.has_value();
+    const bool rhs_has_pre = other._pre.has_value();
+    if (lhs_has_pre && rhs_has_pre) {
+        return _pre->compare_to(*other._pre);
+    }
+    if (lhs_has_pre) {
+        return -1;
+    }
+    if (rhs_has_pre) {
+        return 1;
+    }
+    return 0;
+}
+
+// Equality is defined through compare_to, so build info does not take part in it.
+bool SemanticVersion::operator==(const SemanticVersion& other) const {
+    const int order = compare_to(other);
+    return order == 0;
+}
+
+// Logical negation of operator==.
+bool SemanticVersion::operator!=(const SemanticVersion& other) const {
+    return compare_to(other) != 0;
+}
+
+// Renders "major.minor.patch" followed, without separators, by the unknown suffix (only when
+// the pre-release flag is set), the pre-release string and the build info, when present.
+std::string SemanticVersion::to_string() const {
+    std::string text = std::to_string(_major);
+    text += ".";
+    text += std::to_string(_minor);
+    text += ".";
+    text += std::to_string(_patch);
+    if (_prerelease && _unknown.has_value()) {
+        text += *_unknown;
+    }
+    if (_pre.has_value()) {
+        text += _pre->to_string();
+    }
+    if (_build_info.has_value()) {
+        text += *_build_info;
+    }
+    return text;
+}
+
+// Classifies one pre-release identifier as numeric (digits only) or as a plain string.
+//
+// A digit run too large for an int (a timestamp-like identifier, for example) is kept as a
+// string identifier instead of letting std::stoi throw out of the constructor. compare_to
+// orders string identifiers after numeric ones, so such a value still sorts after every
+// numeric identifier that fits.
+SemanticVersion::NumberOrString::NumberOrString(const std::string& value_string)
+        : _original(value_string) {
+    const static std::regex NUMERIC("\\d+");
+    _is_numeric = std::regex_match(_original, NUMERIC);
+    _number = -1;
+    if (_is_numeric) {
+        try {
+            _number = std::stoi(_original);
+        } catch (const std::out_of_range&) {
+            _is_numeric = false;
+        }
+    }
+}
+
+// Member-wise copy of the original text, the numeric flag and the parsed number.
+SemanticVersion::NumberOrString::NumberOrString(const NumberOrString& other) = default;
+
+// Three-way comparison of two identifiers: two numbers compare by value, two strings compare
+// with std::string::compare, and a number always orders before a string.
+int SemanticVersion::NumberOrString::compare_to(const SemanticVersion::NumberOrString& that) const {
+    const bool lhs_numeric = this->_is_numeric;
+    const bool rhs_numeric = that._is_numeric;
+
+    if (lhs_numeric && rhs_numeric) {
+        return this->_number - that._number;
+    }
+    if (!lhs_numeric && !rhs_numeric) {
+        return this->_original.compare(that._original);
+    }
+    return lhs_numeric ? -1 : 1;
+}
+
+// Returns the identifier exactly as it was given to the constructor.
+std::string SemanticVersion::NumberOrString::to_string() const {
+    return _original;
+}
+
+// All relational operators below are expressed through the sign of compare_to().
+
+bool SemanticVersion::NumberOrString::operator<(const SemanticVersion::NumberOrString& that) const {
+    return 0 > compare_to(that);
+}
+
+bool SemanticVersion::NumberOrString::operator==(
+        const SemanticVersion::NumberOrString& that) const {
+    return 0 == compare_to(that);
+}
+
+bool SemanticVersion::NumberOrString::operator!=(
+        const SemanticVersion::NumberOrString& that) const {
+    return 0 != compare_to(that);
+}
+
+bool SemanticVersion::NumberOrString::operator>(const SemanticVersion::NumberOrString& that) const {
+    return 0 < compare_to(that);
+}
+
+bool SemanticVersion::NumberOrString::operator<=(
+        const SemanticVersion::NumberOrString& that) const {
+    return compare_to(that) <= 0;
+}
+
+bool SemanticVersion::NumberOrString::operator>=(
+        const SemanticVersion::NumberOrString& that) const {
+    return compare_to(that) >= 0;
+}
+
+// Returns -1, 0 or 1 when x is less than, equal to or greater than y.
+int SemanticVersion::_compare_integers(int x, int y) {
+    if (x < y) {
+        return -1;
+    }
+    if (x > y) {
+        return 1;
+    }
+    return 0;
+}
+
+// Orders false before true: 0 when equal, 1 when only x is set, -1 when only y is set.
+int SemanticVersion::_compare_booleans(bool x, bool y) {
+    return static_cast<int>(x) - static_cast<int>(y);
+}
+
+// Splits `s` at every match of `delimiter` and returns the pieces between the matches.
+std::vector<std::string> SemanticVersion::Prerelease::_split(const std::string& s,
+                                                             const std::regex& delimiter) {
+    std::vector<std::string> pieces;
+    const std::sregex_token_iterator last;
+    // Sub-match selector -1 makes the iterator yield the text between delimiter matches.
+    for (std::sregex_token_iterator it(s.begin(), s.end(), delimiter, -1); it != last; ++it) {
+        pieces.push_back(it->str());
+    }
+    return pieces;
+}
+
+// Keeps the original pre-release string and splits it at every '.' into identifiers, each of
+// which is classified as a number or a string by NumberOrString.
+SemanticVersion::Prerelease::Prerelease(std::string original) : _original(std::move(original)) {
+    static const std::regex DOT("\\.");
+    for (const std::string& piece : _split(_original, DOT)) {
+        _identifiers.emplace_back(piece);
+    }
+}
+
+// Compares identifier by identifier; the first difference decides. When one list is a prefix
+// of the other, the shorter list orders first.
+int SemanticVersion::Prerelease::compare_to(const Prerelease& that) const {
+    const int lhs_count = static_cast<int>(this->_identifiers.size());
+    const int rhs_count = static_cast<int>(that._identifiers.size());
+    const int shared = std::min(lhs_count, rhs_count);
+
+    for (int idx = 0; idx != shared; ++idx) {
+        const int order = this->_identifiers[idx].compare_to(that._identifiers[idx]);
+        if (order != 0) {
+            return order;
+        }
+    }
+    return lhs_count - rhs_count;
+}
+
+// Returns the pre-release string exactly as it was given to the constructor.
+std::string SemanticVersion::Prerelease::to_string() const {
+    return _original;
+}
+
+// All relational operators below are expressed through the sign of compare_to().
+
+bool SemanticVersion::Prerelease::operator<(const Prerelease& that) const {
+    return 0 > compare_to(that);
+}
+
+bool SemanticVersion::Prerelease::operator==(const Prerelease& that) const {
+    return 0 == compare_to(that);
+}
+
+bool SemanticVersion::Prerelease::operator!=(const Prerelease& that) const {
+    return 0 != compare_to(that);
+}
+
+bool SemanticVersion::Prerelease::operator>(const Prerelease& that) const {
+    return 0 < compare_to(that);
+}
+
+bool SemanticVersion::Prerelease::operator<=(const Prerelease& that) const {
+    return compare_to(that) <= 0;
+}
+
+bool SemanticVersion::Prerelease::operator>=(const Prerelease& that) const {
+    return compare_to(that) >= 0;
+}
+
+// Version thresholds used by should_ignore_statistics(): statistics are distrusted for
+// versions below PARQUET_251_FIXED_VERSION, unless the version lies in the half-open range
+// [CDH_5_PARQUET_251_FIXED_START, CDH_5_PARQUET_251_FIXED_END).
+const SemanticVersion CorruptStatistics::PARQUET_251_FIXED_VERSION {1, 8, 0};
+const SemanticVersion CorruptStatistics::CDH_5_PARQUET_251_FIXED_START {
+        1, 5, 0, std::nullopt, "cdh5.5.0", std::nullopt};
+const SemanticVersion CorruptStatistics::CDH_5_PARQUET_251_FIXED_END {1, 5, 0};
+
+// Decides whether the min/max statistics written by `created_by` must be distrusted for a
+// column of `physical_type` (see PARQUET-251).
+//
+// Returns false for every physical type other than BYTE_ARRAY / FIXED_LEN_BYTE_ARRAY and for
+// writers other than "parquet-mr". Returns true when `created_by` is empty, cannot be parsed,
+// carries no version, or names a version older than the fix that is not inside the CDH range.
+// Any exception raised while parsing (regex or number conversion) is also treated as
+// "cannot be parsed" instead of being allowed to escape.
+bool CorruptStatistics::should_ignore_statistics(const std::string& created_by,
+                                                 tparquet::Type::type physical_type) {
+    if (physical_type != tparquet::Type::BYTE_ARRAY &&
+        physical_type != tparquet::Type::FIXED_LEN_BYTE_ARRAY) {
+        // The bug only applies to binary columns
+        return false;
+    }
+
+    if (created_by.empty()) {
+        // created_by is not populated
+        VLOG_DEBUG
+                << "Ignoring statistics because created_by is null or empty! See PARQUET-251 and "
+                   "PARQUET-297";
+        return true;
+    }
+
+    try {
+        std::unique_ptr<ParsedVersion> parsed_version;
+        Status status = VersionParser::parse(created_by, &parsed_version);
+        if (!status.ok()) {
+            VLOG_DEBUG << "Ignoring statistics because created_by could not be parsed (see "
+                          "PARQUET-251)."
+                          " CreatedBy: "
+                       << created_by << ", msg: " << status.msg();
+            return true;
+        }
+
+        if (parsed_version->application() != "parquet-mr") {
+            // Assume other applications don't have this bug
+            return false;
+        }
+
+        if ((!parsed_version->version().has_value()) ||
+            parsed_version->version().value().empty()) {
+            VLOG_DEBUG << "Ignoring statistics because created_by did not contain a semver (see "
+                          "PARQUET-251): "
+                       << created_by;
+            return true;
+        }
+
+        std::unique_ptr<SemanticVersion> semantic_version;
+        status = SemanticVersion::parse(parsed_version->version().value(), &semantic_version);
+        if (!status.ok()) {
+            VLOG_DEBUG << "Ignoring statistics because created_by could not be parsed (see "
+                          "PARQUET-251)."
+                          " CreatedBy: "
+                       << created_by << ", msg: " << status.msg();
+            return true;
+        }
+        if (semantic_version->compare_to(PARQUET_251_FIXED_VERSION) < 0 &&
+            !(semantic_version->compare_to(CDH_5_PARQUET_251_FIXED_START) >= 0 &&
+              semantic_version->compare_to(CDH_5_PARQUET_251_FIXED_END) < 0)) {
+            VLOG_DEBUG
+                    << "Ignoring statistics because this file was created prior to the fixed "
+                       "version, see PARQUET-251";
+            return true;
+        }
+    } catch (const std::exception& e) {
+        // created_by comes from file metadata; a failure to interpret it must not abort the
+        // read, it only means the statistics cannot be trusted.
+        VLOG_DEBUG << "Ignoring statistics because parsing created_by raised an exception (see "
+                      "PARQUET-251). CreatedBy: "
+                   << created_by << ", msg: " << e.what();
+        return true;
+    }
+
+    // This file was created after the fix
+    return false;
+}
+
 } // namespace doris::vectorized
diff --git a/be/src/vec/exec/format/parquet/parquet_common.h b/be/src/vec/exec/format/parquet/parquet_common.h
index 2cf745882ee..da374d5fe79 100644
--- a/be/src/vec/exec/format/parquet/parquet_common.h
+++ b/be/src/vec/exec/format/parquet/parquet_common.h
@@ -17,10 +17,12 @@
 
 #pragma once
 
+#include <gen_cpp/parquet_types.h>
 #include <stddef.h>
 
 #include <cstdint>
+#include <exception>
+#include <memory>
+#include <optional>
 #include <ostream>
+#include <regex>
+#include <stdexcept>
 #include <string>
 #include <vector>
 
@@ -156,4 +158,131 @@ private:
     size_t _num_filtered;
     size_t _read_index;
 };
-} // namespace doris::vectorized
\ No newline at end of file
+
+// Whether the ordering used for a column's min/max statistics is defined by its type.
+enum class ColumnOrderName {
+    UNDEFINED,
+    TYPE_DEFINED_ORDER,
+};
+
+// Byte-comparison order that a column's statistics are assumed to follow.
+enum class SortOrder {
+    SIGNED,
+    UNSIGNED,
+    UNKNOWN,
+};
+
+// The pieces of a Parquet `created_by` string as extracted by VersionParser::parse():
+// the writing application plus an optional version string and an optional build hash.
+class ParsedVersion {
+public:
+    // Arguments are taken by value and moved into the members.
+    ParsedVersion(std::string application, std::optional<std::string> version,
+                  std::optional<std::string> app_build_hash);
+
+    // Read-only access to the stored parts.
+    const std::string& application() const { return _application; }
+    const std::optional<std::string>& version() const { return _version; }
+    const std::optional<std::string>& app_build_hash() const { return _app_build_hash; }
+
+    // Field-wise (in)equality over all three parts.
+    bool operator==(const ParsedVersion& other) const;
+    bool operator!=(const ParsedVersion& other) const;
+
+    // Hash combining all three parts.
+    size_t hash() const;
+
+    // Debug representation; absent parts are printed as "null".
+    std::string to_string() const;
+
+private:
+    std::string _application;
+    std::optional<std::string> _version;
+    std::optional<std::string> _app_build_hash;
+};
+
+// Stateless helper that turns a Parquet `created_by` string into a ParsedVersion.
+class VersionParser {
+public:
+    // On success stores the result in `*parsed_version`; returns an error Status when
+    // `created_by` does not have the expected shape.
+    static Status parse(const std::string& created_by,
+                        std::unique_ptr<ParsedVersion>* parsed_version);
+};
+
+// A "major.minor.patch" version with an optional free-form suffix, optional pre-release
+// identifiers and optional build info, together with a three-way ordering.
+class SemanticVersion {
+public:
+    // Plain release version without any optional part.
+    SemanticVersion(int major, int minor, int patch);
+
+#ifdef BE_TEST
+    // Test-only: sets the pre-release flag directly.
+    SemanticVersion(int major, int minor, int patch, bool has_unknown);
+#endif
+
+    // Full constructor; a non-empty `unknown` marks the version as a pre-release.
+    SemanticVersion(int major, int minor, int patch, std::optional<std::string> unknown,
+                    std::optional<std::string> pre, std::optional<std::string> build_info);
+
+    // Parses `version` into `*semantic_version`; returns an error Status on malformed input.
+    static Status parse(const std::string& version,
+                        std::unique_ptr<SemanticVersion>* semantic_version);
+
+    // Negative, zero or positive when *this orders before, equal to or after `other`.
+    int compare_to(const SemanticVersion& other) const;
+
+    // Defined through compare_to().
+    bool operator==(const SemanticVersion& other) const;
+    bool operator!=(const SemanticVersion& other) const;
+
+    std::string to_string() const;
+
+private:
+    // One dot-separated pre-release identifier, classified as a number or a string.
+    class NumberOrString {
+    public:
+        explicit NumberOrString(const std::string& value_string);
+
+        NumberOrString(const NumberOrString& other);
+
+        std::string to_string() const;
+
+        // Numbers order before strings; like kinds compare by value.
+        int compare_to(const NumberOrString& that) const;
+
+        bool operator==(const NumberOrString& that) const;
+        bool operator!=(const NumberOrString& that) const;
+        bool operator<(const NumberOrString& that) const;
+        bool operator>(const NumberOrString& that) const;
+        bool operator<=(const NumberOrString& that) const;
+        bool operator>=(const NumberOrString& that) const;
+
+    private:
+        std::string _original;
+        bool _is_numeric;
+        int _number;
+    };
+
+    // The pre-release part of a version, split at '.' into NumberOrString identifiers.
+    class Prerelease {
+    public:
+        explicit Prerelease(std::string original);
+
+        const std::string& original() const { return _original; }
+        std::string to_string() const;
+
+        // Identifier-wise comparison; a shorter list that is a prefix orders first.
+        int compare_to(const Prerelease& that) const;
+
+        bool operator==(const Prerelease& that) const;
+        bool operator!=(const Prerelease& that) const;
+        bool operator<(const Prerelease& that) const;
+        bool operator>(const Prerelease& that) const;
+        bool operator<=(const Prerelease& that) const;
+        bool operator>=(const Prerelease& that) const;
+
+    private:
+        static std::vector<std::string> _split(const std::string& s, const std::regex& delimiter);
+
+        std::string _original;
+        std::vector<NumberOrString> _identifiers;
+    };
+
+    // Three-way helpers returning -1, 0 or 1.
+    static int _compare_integers(int x, int y);
+    static int _compare_booleans(bool x, bool y);
+
+    // NOTE: declaration order is relied upon by the constructors' initializer lists.
+    int _major;
+    int _minor;
+    int _patch;
+    bool _prerelease;
+    std::optional<std::string> _unknown;
+    std::optional<Prerelease> _pre;
+    std::optional<std::string> _build_info;
+};
+
+// Detects files whose binary-column min/max statistics cannot be trusted because of the
+// writer version recorded in `created_by` (PARQUET-251).
+class CorruptStatistics {
+public:
+    // True when statistics for a column of `physical_type` written by `created_by` should be
+    // ignored.
+    static bool should_ignore_statistics(const std::string& created_by,
+                                         tparquet::Type::type physical_type);
+
+private:
+    // Version thresholds consulted by should_ignore_statistics().
+    static const SemanticVersion PARQUET_251_FIXED_VERSION;
+    static const SemanticVersion CDH_5_PARQUET_251_FIXED_START;
+    static const SemanticVersion CDH_5_PARQUET_251_FIXED_END;
+};
+
+} // namespace doris::vectorized
diff --git a/be/src/vec/exec/format/parquet/parquet_pred_cmp.h b/be/src/vec/exec/format/parquet/parquet_pred_cmp.h
index 916f3f64ee6..316cbc5d716 100644
--- a/be/src/vec/exec/format/parquet/parquet_pred_cmp.h
+++ b/be/src/vec/exec/format/parquet/parquet_pred_cmp.h
@@ -17,6 +17,7 @@
 
 #pragma once
 
+#include <cmath>
 #include <cstring>
 #include <vector>
 
@@ -38,9 +39,7 @@ class ParquetPredicate {
     M(TYPE_TINYINT, tparquet::Type::INT32)   \
     M(TYPE_SMALLINT, tparquet::Type::INT32)  \
     M(TYPE_INT, tparquet::Type::INT32)       \
-    M(TYPE_BIGINT, tparquet::Type::INT64)    \
-    M(TYPE_FLOAT, tparquet::Type::FLOAT)     \
-    M(TYPE_DOUBLE, tparquet::Type::DOUBLE)
+    M(TYPE_BIGINT, tparquet::Type::INT64)
 
 private:
     struct ScanPredicate {
@@ -132,6 +131,8 @@ private:
 
         CppType min_value;
         CppType max_value;
+        std::unique_ptr<std::string> encoded_min_copy;
+        std::unique_ptr<std::string> encoded_max_copy;
         tparquet::Type::type physical_type = col_schema->physical_type;
         switch (col_val_range.type()) {
 #define DISPATCH(REINTERPRET_TYPE, PARQUET_TYPE)                           \
@@ -142,24 +143,69 @@ private:
         break;
             FOR_REINTERPRET_TYPES(DISPATCH)
 #undef DISPATCH
+        case TYPE_FLOAT:
+            if constexpr (std::is_same_v<CppType, float>) {
+                if (col_schema->physical_type != tparquet::Type::FLOAT) {
+                    return false;
+                }
+                // Never read past the end of a truncated statistics value.
+                if (encoded_min.size() < sizeof(CppType) || encoded_max.size() < sizeof(CppType)) {
+                    return false;
+                }
+                min_value = *reinterpret_cast<const CppType*>(encoded_min.data());
+                max_value = *reinterpret_cast<const CppType*>(encoded_max.data());
+                // NaN bounds cannot be used to prune anything.
+                if (std::isnan(min_value) || std::isnan(max_value)) {
+                    return false;
+                }
+                // Updating min to -0.0 and max to +0.0 to ensure that no 0.0 values would be skipped
+                if (std::signbit(min_value) == 0 && min_value == 0.0F) {
+                    min_value = -0.0F;
+                }
+                if (std::signbit(max_value) != 0 && max_value == -0.0F) {
+                    max_value = 0.0F;
+                }
+                break;
+            } else {
+                return false;
+            }
+        case TYPE_DOUBLE:
+            // Must test for double: testing for float here made every DOUBLE column take the
+            // `return false` branch, so min/max filtering never applied to them.
+            if constexpr (std::is_same_v<CppType, double>) {
+                if (col_schema->physical_type != tparquet::Type::DOUBLE) {
+                    return false;
+                }
+                // Never read past the end of a truncated statistics value.
+                if (encoded_min.size() < sizeof(CppType) || encoded_max.size() < sizeof(CppType)) {
+                    return false;
+                }
+                min_value = *reinterpret_cast<const CppType*>(encoded_min.data());
+                max_value = *reinterpret_cast<const CppType*>(encoded_max.data());
+                // NaN bounds cannot be used to prune anything.
+                if (std::isnan(min_value) || std::isnan(max_value)) {
+                    return false;
+                }
+                // Updating min to -0.0 and max to +0.0 to ensure that no 0.0 values would be skipped
+                if (std::signbit(min_value) == 0 && min_value == 0.0) {
+                    min_value = -0.0;
+                }
+                if (std::signbit(max_value) != 0 && max_value == -0.0) {
+                    max_value = 0.0;
+                }
+                break;
+            } else {
+                return false;
+            }
         case TYPE_VARCHAR:
             [[fallthrough]];
         case TYPE_CHAR:
             [[fallthrough]];
         case TYPE_STRING:
-            // TODO: In parquet, min and max statistics may not be able to 
handle UTF8 correctly.
-            // Current processing method is using min_value and max_value 
statistics introduced by PARQUET-1025 if they are used.
-            // If not, current processing method is temporarily ignored. A 
better way is try to read min and max statistics
-            // if it contains only ASCII characters.
-            if (!use_min_max_value) {
-                return false;
-            }
             if constexpr (std::is_same_v<CppType, StringRef>) {
-                min_value = StringRef(encoded_min);
-                max_value = StringRef(encoded_max);
+                if (!use_min_max_value) {
+                    encoded_min_copy = 
std::make_unique<std::string>(encoded_min);
+                    encoded_max_copy = 
std::make_unique<std::string>(encoded_max);
+                    if (!_try_read_old_utf8_stats(*encoded_min_copy, 
*encoded_max_copy)) {
+                        return false;
+                    }
+                    min_value = StringRef(*encoded_min_copy);
+                    max_value = StringRef(*encoded_max_copy);
+                } else {
+                    min_value = StringRef(encoded_min);
+                    max_value = StringRef(encoded_max);
+                }
             } else {
                 return false;
-            };
+            }
             break;
         case TYPE_DECIMALV2:
             if constexpr (std::is_same_v<CppType, DecimalV2Value>) {
@@ -397,9 +443,64 @@ private:
         return predicates;
     }
 
+    // True when `byte` is a 7-bit ASCII code unit.
+    static inline bool _is_ascii(uint8_t byte) { return byte < 128; }
+
+    // Number of leading bytes that `encoding_min` and `encoding_max` have in common.
+    static int _common_prefix(const std::string& encoding_min, const std::string& encoding_max) {
+        const size_t limit = std::min(encoding_min.size(), encoding_max.size());
+        size_t common_length = 0;
+        while (common_length < limit &&
+               encoding_min[common_length] == encoding_max[common_length]) {
+            ++common_length;
+        }
+        return static_cast<int>(common_length);
+    }
+
+    // Rewrites min/max statistics in place into bounds that only rely on ASCII bytes.
+    //
+    // `encoding_min` is truncated after its common prefix with `encoding_max` plus the ASCII
+    // bytes that follow. `encoding_max` is truncated to the common prefix plus at most one
+    // differing ASCII byte, and its last remaining byte is incremented by one.
+    // Returns false, leaving both strings untouched, when no usable upper bound is left.
+    static bool _try_read_old_utf8_stats(std::string& encoding_min, std::string& encoding_max) {
+        if (encoding_min == encoding_max) {
+            // If min = max, then there is a single value only; both bounds are used as they are.
+            return true;
+        }
+
+        const auto common_prefix_length =
+                static_cast<size_t>(_common_prefix(encoding_min, encoding_max));
+
+        // For min we can retain all-ASCII, because this produces a strictly lower value.
+        size_t min_good_length = common_prefix_length;
+        while (min_good_length < encoding_min.size() &&
+               _is_ascii(static_cast<uint8_t>(encoding_min[min_good_length]))) {
+            ++min_good_length;
+        }
+
+        // For max we can be sure only of the part matching the min. When they differ, we can
+        // consider only one next, and only if both are ASCII
+        size_t max_good_length = common_prefix_length;
+        if (max_good_length < encoding_max.size() && max_good_length < encoding_min.size() &&
+            _is_ascii(static_cast<uint8_t>(encoding_min[max_good_length])) &&
+            _is_ascii(static_cast<uint8_t>(encoding_max[max_good_length]))) {
+            ++max_good_length;
+        }
+        // Incrementing 127 would overflow. Incrementing within non-ASCII can have side-effects.
+        while (max_good_length > 0) {
+            const auto last_byte = static_cast<uint8_t>(encoding_max[max_good_length - 1]);
+            if (last_byte != 127 && _is_ascii(last_byte)) {
+                break;
+            }
+            --max_good_length;
+        }
+        if (max_good_length == 0) {
+            // We can return just min bound, but code downstream likely expects both are present
+            // or both are absent.
+            return false;
+        }
+
+        encoding_min.resize(min_good_length);
+        encoding_max.resize(max_good_length);
+        // The last kept byte is ASCII and below 127 here, so the increment cannot overflow.
+        ++encoding_max[max_good_length - 1];
+        return true;
+    }
+
 public:
     static bool filter_by_stats(const ColumnValueRangeType& col_val_range,
-                                const FieldSchema* col_schema, bool 
is_set_min_max,
+                                const FieldSchema* col_schema, bool 
ignore_min_max_stats,
                                 const std::string& encoded_min, const 
std::string& encoded_max,
                                 bool is_all_null, const cctz::time_zone& ctz,
                                 bool use_min_max_value = false) {
@@ -416,11 +517,14 @@ public:
                             return;
                         }
                     }
-                    for (auto& filter : filters) {
-                        need_filter |= _filter_by_min_max(range, filter, 
col_schema, encoded_min,
-                                                          encoded_max, ctz, 
use_min_max_value);
-                        if (need_filter) {
-                            break;
+                    if (!ignore_min_max_stats) {
+                        for (auto& filter : filters) {
+                            need_filter |=
+                                    _filter_by_min_max(range, filter, 
col_schema, encoded_min,
+                                                       encoded_max, ctz, 
use_min_max_value);
+                            if (need_filter) {
+                                break;
+                            }
                         }
                     }
                 },
diff --git a/be/src/vec/exec/format/parquet/vparquet_page_index.cpp b/be/src/vec/exec/format/parquet/vparquet_page_index.cpp
index 35cf076318e..53fb1579c8e 100644
--- a/be/src/vec/exec/format/parquet/vparquet_page_index.cpp
+++ b/be/src/vec/exec/format/parquet/vparquet_page_index.cpp
@@ -68,7 +68,7 @@ Status PageIndex::collect_skipped_page_range(tparquet::ColumnIndex* column_index
     const int num_of_pages = column_index->null_pages.size();
     for (int page_id = 0; page_id < num_of_pages; page_id++) {
         bool is_all_null = column_index->null_pages[page_id];
-        if (ParquetPredicate::filter_by_stats(col_val_range, col_schema, 
!is_all_null,
+        if (ParquetPredicate::filter_by_stats(col_val_range, col_schema, false,
                                               encoded_min_vals[page_id], 
encoded_max_vals[page_id],
                                               is_all_null, ctz)) {
             skipped_ranges.emplace_back(page_id);
@@ -125,4 +125,4 @@ Status PageIndex::parse_offset_index(const tparquet::ColumnChunk& chunk, const u
     return Status::OK();
 }
 
-} // namespace doris::vectorized
\ No newline at end of file
+} // namespace doris::vectorized
diff --git a/be/src/vec/exec/format/parquet/vparquet_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_reader.cpp
index 57396c349dd..84c572a3a2f 100644
--- a/be/src/vec/exec/format/parquet/vparquet_reader.cpp
+++ b/be/src/vec/exec/format/parquet/vparquet_reader.cpp
@@ -938,15 +938,53 @@ Status ParquetReader::_process_column_stat_filter(const std::vector<tparquet::Co
             continue;
         }
         const FieldSchema* col_schema = schema_desc.get_column(col_name);
+        bool ignore_min_max_stats = false;
         // Min-max of statistic is plain-encoded value
-        if (statistic.__isset.min_value) {
+        if (statistic.__isset.min_value && statistic.__isset.max_value) {
+            ColumnOrderName column_order =
+                    col_schema->physical_type == tparquet::Type::INT96 ||
+                                    
col_schema->parquet_schema.logicalType.__isset.UNKNOWN
+                            ? ColumnOrderName::UNDEFINED
+                            : ColumnOrderName::TYPE_DEFINED_ORDER;
+            if ((statistic.min_value != statistic.max_value) &&
+                (column_order != ColumnOrderName::TYPE_DEFINED_ORDER)) {
+                ignore_min_max_stats = true;
+            }
             *filter_group = ParquetPredicate::filter_by_stats(
-                    slot_iter->second, col_schema, is_set_min_max, 
statistic.min_value,
+                    slot_iter->second, col_schema, ignore_min_max_stats, 
statistic.min_value,
                     statistic.max_value, is_all_null, *_ctz, true);
         } else {
+            if (statistic.__isset.min && statistic.__isset.max) {
+                bool max_equals_min = statistic.min == statistic.max;
+
+                SortOrder sort_order = 
_determine_sort_order(col_schema->parquet_schema);
+                bool sort_orders_match = SortOrder::SIGNED == sort_order;
+                if (!sort_orders_match && !max_equals_min) {
+                    ignore_min_max_stats = true;
+                }
+                bool should_ignore_corrupted_stats = false;
+                if (_ignored_stats.count(col_schema->physical_type) == 0) {
+                    if 
(CorruptStatistics::should_ignore_statistics(_t_metadata->created_by,
+                                                                    
col_schema->physical_type)) {
+                        _ignored_stats[col_schema->physical_type] = true;
+                        should_ignore_corrupted_stats = true;
+                    } else {
+                        _ignored_stats[col_schema->physical_type] = false;
+                    }
+                } else if (_ignored_stats[col_schema->physical_type]) {
+                    should_ignore_corrupted_stats = true;
+                }
+                if (should_ignore_corrupted_stats) {
+                    ignore_min_max_stats = true;
+                } else if (!sort_orders_match && !max_equals_min) {
+                    ignore_min_max_stats = true;
+                }
+            } else {
+                ignore_min_max_stats = true;
+            }
             *filter_group = ParquetPredicate::filter_by_stats(
-                    slot_iter->second, col_schema, is_set_min_max, 
statistic.min, statistic.max,
-                    is_all_null, *_ctz, false);
+                    slot_iter->second, col_schema, ignore_min_max_stats, 
statistic.min,
+                    statistic.max, is_all_null, *_ctz, false);
         }
         if (*filter_group) {
             break;
@@ -1021,4 +1059,61 @@ void ParquetReader::_collect_profile_before_close() {
     _collect_profile();
 }
 
+SortOrder ParquetReader::_determine_sort_order(const tparquet::SchemaElement& 
parquet_schema) {
+    tparquet::Type::type physical_type = parquet_schema.type;
+    const tparquet::LogicalType& logical_type = parquet_schema.logicalType;
+
+    // Assume string type is SortOrder::SIGNED; ParquetPredicate::_try_read_old_utf8_stats() is used to handle it.
+    if (logical_type.__isset.STRING && (physical_type == 
tparquet::Type::BYTE_ARRAY ||
+                                        physical_type == 
tparquet::Type::FIXED_LEN_BYTE_ARRAY)) {
+        return SortOrder::SIGNED;
+    }
+
+    if (logical_type.__isset.INTEGER) {
+        if (logical_type.INTEGER.isSigned) {
+            return SortOrder::SIGNED;
+        } else {
+            return SortOrder::UNSIGNED;
+        }
+    } else if (logical_type.__isset.DATE) {
+        return SortOrder::SIGNED;
+    } else if (logical_type.__isset.ENUM) {
+        return SortOrder::UNSIGNED;
+    } else if (logical_type.__isset.BSON) {
+        return SortOrder::UNSIGNED;
+    } else if (logical_type.__isset.JSON) {
+        return SortOrder::UNSIGNED;
+    } else if (logical_type.__isset.STRING) {
+        return SortOrder::UNSIGNED;
+    } else if (logical_type.__isset.DECIMAL) {
+        return SortOrder::UNKNOWN;
+    } else if (logical_type.__isset.MAP) {
+        return SortOrder::UNKNOWN;
+    } else if (logical_type.__isset.LIST) {
+        return SortOrder::UNKNOWN;
+    } else if (logical_type.__isset.TIME) {
+        return SortOrder::SIGNED;
+    } else if (logical_type.__isset.TIMESTAMP) {
+        return SortOrder::SIGNED;
+    } else if (logical_type.__isset.UNKNOWN) {
+        return SortOrder::UNKNOWN;
+    } else {
+        switch (physical_type) {
+        case tparquet::Type::BOOLEAN:
+        case tparquet::Type::INT32:
+        case tparquet::Type::INT64:
+        case tparquet::Type::FLOAT:
+        case tparquet::Type::DOUBLE:
+            return SortOrder::SIGNED;
+        case tparquet::Type::BYTE_ARRAY:
+        case tparquet::Type::FIXED_LEN_BYTE_ARRAY:
+            return SortOrder::UNSIGNED;
+        case tparquet::Type::INT96:
+            return SortOrder::UNKNOWN;
+        default:
+            return SortOrder::UNKNOWN;
+        }
+    }
+}
+
 } // namespace doris::vectorized
diff --git a/be/src/vec/exec/format/parquet/vparquet_reader.h 
b/be/src/vec/exec/format/parquet/vparquet_reader.h
index 3cc262e14e6..9691e596b78 100644
--- a/be/src/vec/exec/format/parquet/vparquet_reader.h
+++ b/be/src/vec/exec/format/parquet/vparquet_reader.h
@@ -220,6 +220,8 @@ private:
             const RowGroupReader::RowGroupIndex& group, size_t* avg_io_size);
     void _collect_profile();
 
+    static SortOrder _determine_sort_order(const tparquet::SchemaElement& 
parquet_schema);
+
 private:
     RuntimeProfile* _profile = nullptr;
     const TFileScanRangeParams& _scan_params;
@@ -284,5 +286,6 @@ private:
     const VExprContextSPtrs* _not_single_slot_filter_conjuncts = nullptr;
     const std::unordered_map<int, VExprContextSPtrs>* 
_slot_id_to_filter_conjuncts = nullptr;
     bool _hive_use_column_names = false;
+    std::unordered_map<tparquet::Type::type, bool> _ignored_stats;
 };
 } // namespace doris::vectorized
diff --git a/be/test/vec/exec/parquet/parquet_corrupt_statistics_test.cpp 
b/be/test/vec/exec/parquet/parquet_corrupt_statistics_test.cpp
new file mode 100644
index 00000000000..bad95614f00
--- /dev/null
+++ b/be/test/vec/exec/parquet/parquet_corrupt_statistics_test.cpp
@@ -0,0 +1,134 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
+
+#include <regex>
+
+#include "vec/exec/format/parquet/parquet_common.h"
+
+namespace doris {
+namespace vectorized {
+class ParquetCorruptStatisticsTest : public testing::Test {
+public:
+    ParquetCorruptStatisticsTest() = default;
+};
+
+TEST_F(ParquetCorruptStatisticsTest, test_only_applies_to_binary) {
+    EXPECT_TRUE(CorruptStatistics::should_ignore_statistics("parquet-mr 
version 1.6.0 (build abcd)",
+                                                            
tparquet::Type::BYTE_ARRAY));
+    EXPECT_TRUE(CorruptStatistics::should_ignore_statistics("parquet-mr 
version 1.6.0 (build abcd)",
+                                                            
tparquet::Type::FIXED_LEN_BYTE_ARRAY));
+    EXPECT_FALSE(CorruptStatistics::should_ignore_statistics(
+            "parquet-mr version 1.6.0 (build abcd)", tparquet::Type::DOUBLE));
+}
+
+TEST_F(ParquetCorruptStatisticsTest, test_corrupt_statistics) {
+    EXPECT_TRUE(CorruptStatistics::should_ignore_statistics("parquet-mr 
version 1.6.0 (build abcd)",
+                                                            
tparquet::Type::BYTE_ARRAY));
+    EXPECT_TRUE(CorruptStatistics::should_ignore_statistics("parquet-mr 
version 1.4.2 (build abcd)",
+                                                            
tparquet::Type::BYTE_ARRAY));
+    EXPECT_TRUE(CorruptStatistics::should_ignore_statistics(
+            "parquet-mr version 1.6.100 (build abcd)", 
tparquet::Type::BYTE_ARRAY));
+    EXPECT_TRUE(CorruptStatistics::should_ignore_statistics(
+            "parquet-mr version 1.7.999 (build abcd)", 
tparquet::Type::BYTE_ARRAY));
+    EXPECT_TRUE(CorruptStatistics::should_ignore_statistics(
+            "parquet-mr version 1.6.22rc99 (build abcd)", 
tparquet::Type::BYTE_ARRAY));
+    EXPECT_TRUE(CorruptStatistics::should_ignore_statistics(
+            "parquet-mr version 1.6.22rc99-SNAPSHOT (build abcd)", 
tparquet::Type::BYTE_ARRAY));
+    EXPECT_TRUE(CorruptStatistics::should_ignore_statistics(
+            "parquet-mr version 1.6.1-SNAPSHOT (build abcd)", 
tparquet::Type::BYTE_ARRAY));
+    EXPECT_TRUE(CorruptStatistics::should_ignore_statistics(
+            "parquet-mr version 1.6.0t-01-abcdefg (build abcd)", 
tparquet::Type::BYTE_ARRAY));
+    EXPECT_TRUE(CorruptStatistics::should_ignore_statistics("unparseable 
string",
+                                                            
tparquet::Type::BYTE_ARRAY));
+
+    // missing semver
+    EXPECT_TRUE(CorruptStatistics::should_ignore_statistics("parquet-mr 
version (build abcd)",
+                                                            
tparquet::Type::BYTE_ARRAY));
+    EXPECT_TRUE(CorruptStatistics::should_ignore_statistics("parquet-mr 
version  (build abcd)",
+                                                            
tparquet::Type::BYTE_ARRAY));
+
+    // missing build hash
+    EXPECT_TRUE(CorruptStatistics::should_ignore_statistics("parquet-mr 
version 1.6.0 (build )",
+                                                            
tparquet::Type::BYTE_ARRAY));
+    EXPECT_TRUE(CorruptStatistics::should_ignore_statistics("parquet-mr 
version 1.6.0 (build)",
+                                                            
tparquet::Type::BYTE_ARRAY));
+    EXPECT_TRUE(CorruptStatistics::should_ignore_statistics("parquet-mr 
version (build)",
+                                                            
tparquet::Type::BYTE_ARRAY));
+
+    EXPECT_FALSE(CorruptStatistics::should_ignore_statistics("imapla version 
1.6.0 (build abcd)",
+                                                             
tparquet::Type::BYTE_ARRAY));
+    EXPECT_FALSE(CorruptStatistics::should_ignore_statistics("imapla version 
1.10.0 (build abcd)",
+                                                             
tparquet::Type::BYTE_ARRAY));
+    EXPECT_FALSE(CorruptStatistics::should_ignore_statistics(
+            "parquet-mr version 1.8.0 (build abcd)", 
tparquet::Type::BYTE_ARRAY));
+    EXPECT_FALSE(CorruptStatistics::should_ignore_statistics(
+            "parquet-mr version 1.8.1 (build abcd)", 
tparquet::Type::BYTE_ARRAY));
+    EXPECT_FALSE(CorruptStatistics::should_ignore_statistics(
+            "parquet-mr version 1.8.1rc3 (build abcd)", 
tparquet::Type::BYTE_ARRAY));
+    EXPECT_FALSE(CorruptStatistics::should_ignore_statistics(
+            "parquet-mr version 1.8.1rc3-SNAPSHOT (build abcd)", 
tparquet::Type::BYTE_ARRAY));
+    EXPECT_FALSE(CorruptStatistics::should_ignore_statistics(
+            "parquet-mr version 1.9.0 (build abcd)", 
tparquet::Type::BYTE_ARRAY));
+    EXPECT_FALSE(CorruptStatistics::should_ignore_statistics(
+            "parquet-mr version 2.0.0 (build abcd)", 
tparquet::Type::BYTE_ARRAY));
+    EXPECT_FALSE(CorruptStatistics::should_ignore_statistics(
+            "parquet-mr version 1.9.0t-01-abcdefg (build abcd)", 
tparquet::Type::BYTE_ARRAY));
+
+    // missing semver
+    EXPECT_FALSE(CorruptStatistics::should_ignore_statistics("impala version 
(build abcd)",
+                                                             
tparquet::Type::BYTE_ARRAY));
+    EXPECT_FALSE(CorruptStatistics::should_ignore_statistics("impala version  
(build abcd)",
+                                                             
tparquet::Type::BYTE_ARRAY));
+
+    // missing build hash
+    EXPECT_FALSE(CorruptStatistics::should_ignore_statistics("impala version 
1.6.0 (build )",
+                                                             
tparquet::Type::BYTE_ARRAY));
+    EXPECT_FALSE(CorruptStatistics::should_ignore_statistics("impala version 
1.6.0 (build)",
+                                                             
tparquet::Type::BYTE_ARRAY));
+    EXPECT_FALSE(CorruptStatistics::should_ignore_statistics("impala version 
(build)",
+                                                             
tparquet::Type::BYTE_ARRAY));
+}
+
+TEST_F(ParquetCorruptStatisticsTest, test_distribution_corrupt_statistics) {
+    EXPECT_TRUE(CorruptStatistics::should_ignore_statistics(
+            "parquet-mr version 1.5.0-cdh5.4.999 (build abcd)", 
tparquet::Type::BYTE_ARRAY));
+    EXPECT_FALSE(CorruptStatistics::should_ignore_statistics(
+            "parquet-mr version 1.5.0-cdh5.5.0-SNAPSHOT (build "
+            "956ed6c14c611b4c4eaaa1d6e5b9a9c6d4dfa336)",
+            tparquet::Type::BYTE_ARRAY));
+    EXPECT_FALSE(CorruptStatistics::should_ignore_statistics(
+            "parquet-mr version 1.5.0-cdh5.5.0 (build abcd)", 
tparquet::Type::BYTE_ARRAY));
+    EXPECT_FALSE(CorruptStatistics::should_ignore_statistics(
+            "parquet-mr version 1.5.0-cdh5.5.1 (build abcd)", 
tparquet::Type::BYTE_ARRAY));
+    EXPECT_FALSE(CorruptStatistics::should_ignore_statistics(
+            "parquet-mr version 1.5.0-cdh5.6.0 (build abcd)", 
tparquet::Type::BYTE_ARRAY));
+    EXPECT_TRUE(CorruptStatistics::should_ignore_statistics(
+            "parquet-mr version 1.4.10 (build abcd)", 
tparquet::Type::BYTE_ARRAY));
+    EXPECT_TRUE(CorruptStatistics::should_ignore_statistics("parquet-mr 
version 1.5.0 (build abcd)",
+                                                            
tparquet::Type::BYTE_ARRAY));
+    EXPECT_TRUE(CorruptStatistics::should_ignore_statistics("parquet-mr 
version 1.5.1 (build abcd)",
+                                                            
tparquet::Type::BYTE_ARRAY));
+    EXPECT_TRUE(CorruptStatistics::should_ignore_statistics("parquet-mr 
version 1.6.0 (build abcd)",
+                                                            
tparquet::Type::BYTE_ARRAY));
+    EXPECT_TRUE(CorruptStatistics::should_ignore_statistics("parquet-mr 
version 1.7.0 (build abcd)",
+                                                            
tparquet::Type::BYTE_ARRAY));
+}
+
+} // namespace vectorized
+} // namespace doris
diff --git a/be/test/vec/exec/parquet/parquet_statistics_test.cpp 
b/be/test/vec/exec/parquet/parquet_statistics_test.cpp
new file mode 100644
index 00000000000..cd8d3068fe1
--- /dev/null
+++ b/be/test/vec/exec/parquet/parquet_statistics_test.cpp
@@ -0,0 +1,155 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
+
+#include <regex>
+
+#include "vec/exec/format/parquet/parquet_pred_cmp.h"
+
+namespace doris {
+namespace vectorized {
+class ParquetStatisticsTest : public testing::Test {
+public:
+    ParquetStatisticsTest() = default;
+};
+
+TEST_F(ParquetStatisticsTest, test_try_read_old_utf8_stats) {
+    // [, bcé]: min is empty, max starts with ASCII
+    {
+        std::string encoding_min("");
+        std::string encoding_max("bcé");
+        EXPECT_FALSE(ParquetPredicate::_try_read_old_utf8_stats(encoding_min, 
encoding_max));
+        ;
+    }
+
+    // [, ébc]: min is empty, max starts with non-ASCII
+    {
+        std::string encoding_min("");
+        std::string encoding_max("ébc");
+        EXPECT_FALSE(ParquetPredicate::_try_read_old_utf8_stats(encoding_min, 
encoding_max));
+        ;
+    }
+
+    // [aa, bé]: no common prefix, first different are both ASCII, min is all 
ASCII
+    {
+        std::string encoding_min("aa");
+        std::string encoding_max("bé");
+        EXPECT_TRUE(ParquetPredicate::_try_read_old_utf8_stats(encoding_min, 
encoding_max));
+        ;
+        EXPECT_EQ(encoding_min, "aa");
+        EXPECT_EQ(encoding_max, "c");
+    }
+
+    // [abcd, abcdN]: common prefix, only ASCII, one prefix of the other, last common ASCII
+    {
+        std::string encoding_min("abcd");
+        std::string encoding_max("abcdN");
+        EXPECT_TRUE(ParquetPredicate::_try_read_old_utf8_stats(encoding_min, 
encoding_max));
+        ;
+        EXPECT_EQ(encoding_min, "abcd");
+        EXPECT_EQ(encoding_max, "abce");
+    }
+
+    // [abcé, abcéN]: common prefix, not only ASCII, one prefix of the other, 
last common non ASCII
+    {
+        std::string encoding_min("abcé");
+        std::string encoding_max("abcéN");
+        EXPECT_TRUE(ParquetPredicate::_try_read_old_utf8_stats(encoding_min, 
encoding_max));
+        ;
+        EXPECT_EQ(encoding_min, "abcé");
+        EXPECT_EQ(encoding_max, "abd");
+    }
+
+    // [abcéM, abcéN]: common prefix, not only ASCII, first different are both 
ASCII
+    {
+        std::string encoding_min("abcéM");
+        std::string encoding_max("abcéN");
+        EXPECT_TRUE(ParquetPredicate::_try_read_old_utf8_stats(encoding_min, 
encoding_max));
+        ;
+        EXPECT_EQ(encoding_min, "abcéM");
+        EXPECT_EQ(encoding_max, "abcéO");
+    }
+
+    // [abcéMab, abcéNxy]: common prefix, not only ASCII, first different are 
both ASCII, more characters afterwards
+    {
+        std::string encoding_min("abcéMab");
+        std::string encoding_max("abcéNxy");
+        EXPECT_TRUE(ParquetPredicate::_try_read_old_utf8_stats(encoding_min, 
encoding_max));
+        ;
+        EXPECT_EQ(encoding_min, "abcéMab");
+        EXPECT_EQ(encoding_max, "abcéO");
+    }
+
+    // [abcéM, abcé\u00f7]: common prefix, not only ASCII, first different are 
both ASCII, but need to be chopped off (127)
+    {
+        std::string encoding_min("abcéM");
+        std::string encoding_max("abcé\u00f7");
+        EXPECT_TRUE(ParquetPredicate::_try_read_old_utf8_stats(encoding_min, 
encoding_max));
+        EXPECT_EQ(encoding_min, "abcéM");
+        EXPECT_EQ(encoding_max, "abd");
+    }
+
+    // [abc\u007fé, bcd\u007fé]: no common prefix, first different are both 
ASCII
+    {
+        std::string encoding_min("abc\u007fé");
+        std::string encoding_max("bcd\u007fé");
+        EXPECT_TRUE(ParquetPredicate::_try_read_old_utf8_stats(encoding_min, 
encoding_max));
+        ;
+        EXPECT_EQ(encoding_min, "abc\u007f");
+        EXPECT_EQ(encoding_max, "c");
+    }
+
+    // [é, a]: no common prefix, first different are not both ASCII
+    {
+        std::string encoding_min("é");
+        std::string encoding_max("a");
+        EXPECT_FALSE(ParquetPredicate::_try_read_old_utf8_stats(encoding_min, 
encoding_max));
+        ;
+    }
+
+    // [é, ê]: no common prefix, first different are both not ASCII
+    {
+        std::string encoding_min("é");
+        std::string encoding_max("ê");
+        EXPECT_FALSE(ParquetPredicate::_try_read_old_utf8_stats(encoding_min, 
encoding_max));
+        ;
+    }
+
+    // [aé, aé]: min = max (common prefix, first different are both not ASCII)
+    {
+        std::string encoding_min("aé");
+        std::string encoding_max("aé");
+        EXPECT_TRUE(ParquetPredicate::_try_read_old_utf8_stats(encoding_min, 
encoding_max));
+        ;
+        EXPECT_EQ(encoding_min, "aé");
+        EXPECT_EQ(encoding_max, "aé");
+    }
+
+    // [aé, bé]: no common prefix, first different are both ASCII
+    {
+        std::string encoding_min("aé");
+        std::string encoding_max("bé");
+        EXPECT_TRUE(ParquetPredicate::_try_read_old_utf8_stats(encoding_min, 
encoding_max));
+        ;
+        EXPECT_EQ(encoding_min, "a");
+        EXPECT_EQ(encoding_max, "c");
+    }
+}
+
+} // namespace vectorized
+} // namespace doris
diff --git a/be/test/vec/exec/parquet/parquet_version_test.cpp 
b/be/test/vec/exec/parquet/parquet_version_test.cpp
new file mode 100644
index 00000000000..10d17e27790
--- /dev/null
+++ b/be/test/vec/exec/parquet/parquet_version_test.cpp
@@ -0,0 +1,221 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
+
+#include <regex>
+
+#include "vec/exec/format/parquet/parquet_common.h"
+
+namespace doris {
+namespace vectorized {
+class ParquetVersionTest : public testing::Test {
+public:
+    ParquetVersionTest() = default;
+};
+
+TEST_F(ParquetVersionTest, test_version_parser) {
+    std::unique_ptr<ParsedVersion> parsed_version;
+
+    Status status = VersionParser::parse("parquet-mr version 1.6.0 (build 
abcd)", &parsed_version);
+    EXPECT_TRUE(status.ok());
+    EXPECT_EQ(ParsedVersion("parquet-mr", "1.6.0", "abcd"), *parsed_version);
+
+    status = VersionParser::parse("parquet-mr version 1.6.22rc99-SNAPSHOT 
(build abcd)",
+                                  &parsed_version);
+    EXPECT_TRUE(status.ok());
+    EXPECT_EQ(ParsedVersion("parquet-mr", "1.6.22rc99-SNAPSHOT", "abcd"), 
*parsed_version);
+
+    status = VersionParser::parse("unparseable string", &parsed_version);
+    EXPECT_FALSE(status.ok());
+
+    // missing semver
+    status = VersionParser::parse("parquet-mr version (build abcd)", 
&parsed_version);
+    EXPECT_TRUE(status.ok());
+    EXPECT_EQ(ParsedVersion("parquet-mr", std::nullopt, "abcd"), 
*parsed_version);
+
+    status = VersionParser::parse("parquet-mr version  (build abcd)", 
&parsed_version);
+    EXPECT_TRUE(status.ok());
+    EXPECT_EQ(ParsedVersion("parquet-mr", std::nullopt, "abcd"), 
*parsed_version);
+
+    // missing build hash
+    status = VersionParser::parse("parquet-mr version 1.6.0 (build )", 
&parsed_version);
+    EXPECT_TRUE(status.ok());
+    EXPECT_EQ(ParsedVersion("parquet-mr", "1.6.0", std::nullopt), 
*parsed_version);
+
+    status = VersionParser::parse("parquet-mr version 1.6.0 (build)", 
&parsed_version);
+    EXPECT_TRUE(status.ok());
+    EXPECT_EQ(ParsedVersion("parquet-mr", "1.6.0", std::nullopt), 
*parsed_version);
+
+    status = VersionParser::parse("parquet-mr version (build)", 
&parsed_version);
+    EXPECT_TRUE(status.ok());
+    EXPECT_EQ(ParsedVersion("parquet-mr", std::nullopt, std::nullopt), 
*parsed_version);
+
+    status = VersionParser::parse("parquet-mr version (build )", 
&parsed_version);
+    EXPECT_TRUE(status.ok());
+    EXPECT_EQ(ParsedVersion("parquet-mr", std::nullopt, std::nullopt), 
*parsed_version);
+
+    // Missing entire build section
+    status = VersionParser::parse("parquet-mr version 1.6.0", &parsed_version);
+    EXPECT_TRUE(status.ok());
+    EXPECT_EQ(ParsedVersion("parquet-mr", "1.6.0", std::nullopt), 
*parsed_version);
+
+    status = VersionParser::parse("parquet-mr version 1.8.0rc4", 
&parsed_version);
+    EXPECT_TRUE(status.ok());
+    EXPECT_EQ(ParsedVersion("parquet-mr", "1.8.0rc4", std::nullopt), 
*parsed_version);
+
+    status = VersionParser::parse("parquet-mr version 1.8.0rc4-SNAPSHOT", 
&parsed_version);
+    EXPECT_TRUE(status.ok());
+    EXPECT_EQ(ParsedVersion("parquet-mr", "1.8.0rc4-SNAPSHOT", std::nullopt), 
*parsed_version);
+
+    status = VersionParser::parse("parquet-mr version", &parsed_version);
+    EXPECT_TRUE(status.ok());
+    EXPECT_EQ(ParsedVersion("parquet-mr", std::nullopt, std::nullopt), 
*parsed_version);
+
+    // Various spaces
+    status = VersionParser::parse("parquet-mr     version    1.6.0", 
&parsed_version);
+    EXPECT_TRUE(status.ok());
+    EXPECT_EQ(ParsedVersion("parquet-mr", "1.6.0", std::nullopt), 
*parsed_version);
+
+    status = VersionParser::parse("parquet-mr     version    1.8.0rc4", 
&parsed_version);
+    EXPECT_TRUE(status.ok());
+    EXPECT_EQ(ParsedVersion("parquet-mr", "1.8.0rc4", std::nullopt), 
*parsed_version);
+
+    status =
+            VersionParser::parse("parquet-mr      version    1.8.0rc4-SNAPSHOT 
 ", &parsed_version);
+    EXPECT_TRUE(status.ok());
+    EXPECT_EQ(ParsedVersion("parquet-mr", "1.8.0rc4-SNAPSHOT", std::nullopt), 
*parsed_version);
+
+    status = VersionParser::parse("parquet-mr      version", &parsed_version);
+    EXPECT_TRUE(status.ok());
+    EXPECT_EQ(ParsedVersion("parquet-mr", std::nullopt, std::nullopt), 
*parsed_version);
+
+    status = VersionParser::parse("parquet-mr version 1.6.0 (  build )", 
&parsed_version);
+    EXPECT_TRUE(status.ok());
+    EXPECT_EQ(ParsedVersion("parquet-mr", "1.6.0", std::nullopt), 
*parsed_version);
+
+    status = VersionParser::parse("parquet-mr     version 1.6.0 (    build)", 
&parsed_version);
+    EXPECT_TRUE(status.ok());
+    EXPECT_EQ(ParsedVersion("parquet-mr", "1.6.0", std::nullopt), 
*parsed_version);
+
+    status = VersionParser::parse("parquet-mr     version (    build)", 
&parsed_version);
+    EXPECT_TRUE(status.ok());
+    EXPECT_EQ(ParsedVersion("parquet-mr", std::nullopt, std::nullopt), 
*parsed_version);
+
+    status = VersionParser::parse("parquet-mr    version    (build    )", 
&parsed_version);
+    EXPECT_TRUE(status.ok());
+    EXPECT_EQ(ParsedVersion("parquet-mr", std::nullopt, std::nullopt), 
*parsed_version);
+}
+
+void assertLessThan(const std::string& a, const std::string& b) {
+    std::unique_ptr<SemanticVersion> version_a;
+    Status status = SemanticVersion::parse(a, &version_a);
+    EXPECT_TRUE(status.ok());
+    std::unique_ptr<SemanticVersion> version_b;
+    status = SemanticVersion::parse(b, &version_b);
+    EXPECT_TRUE(status.ok());
+    EXPECT_LT(version_a->compare_to(*version_b), 0) << a << " should be < " << 
b;
+    EXPECT_GT(version_b->compare_to(*version_a), 0) << b << " should be > " << 
a;
+}
+
+void assertEqualTo(const std::string& a, const std::string& b) {
+    std::unique_ptr<SemanticVersion> version_a;
+    Status status = SemanticVersion::parse(a, &version_a);
+    EXPECT_TRUE(status.ok());
+    std::unique_ptr<SemanticVersion> version_b;
+    status = SemanticVersion::parse(b, &version_b);
+    EXPECT_TRUE(status.ok());
+    EXPECT_EQ(version_a->compare_to(*version_b), 0) << a << " should equal " 
<< b;
+}
+
+TEST_F(ParquetVersionTest, test_compare) {
+    EXPECT_EQ(SemanticVersion(1, 8, 1).compare_to(SemanticVersion(1, 8, 1)), 
0);
+    EXPECT_LT(SemanticVersion(1, 8, 0).compare_to(SemanticVersion(1, 8, 1)), 
0);
+    EXPECT_GT(SemanticVersion(1, 8, 2).compare_to(SemanticVersion(1, 8, 1)), 
0);
+
+    EXPECT_EQ(SemanticVersion(1, 8, 1).compare_to(SemanticVersion(1, 8, 1)), 
0);
+    EXPECT_LT(SemanticVersion(1, 8, 0).compare_to(SemanticVersion(1, 8, 1)), 
0);
+    EXPECT_GT(SemanticVersion(1, 8, 2).compare_to(SemanticVersion(1, 8, 1)), 
0);
+
+    EXPECT_LT(SemanticVersion(1, 7, 0).compare_to(SemanticVersion(1, 8, 0)), 
0);
+    EXPECT_GT(SemanticVersion(1, 9, 0).compare_to(SemanticVersion(1, 8, 0)), 
0);
+
+    EXPECT_LT(SemanticVersion(0, 0, 0).compare_to(SemanticVersion(1, 0, 0)), 
0);
+    EXPECT_GT(SemanticVersion(2, 0, 0).compare_to(SemanticVersion(1, 0, 0)), 
0);
+
+    EXPECT_LT(SemanticVersion(1, 8, 100).compare_to(SemanticVersion(1, 9, 0)), 
0);
+
+    EXPECT_GT(SemanticVersion(1, 8, 0).compare_to(SemanticVersion(1, 8, 0, 
true)), 0);
+    EXPECT_EQ(SemanticVersion(1, 8, 0, true).compare_to(SemanticVersion(1, 8, 
0, true)), 0);
+    EXPECT_LT(SemanticVersion(1, 8, 0, true).compare_to(SemanticVersion(1, 8, 
0)), 0);
+}
+
+TEST_F(ParquetVersionTest, test_semver_prerelease_examples) {
+    std::vector<std::string> examples = {"1.0.0-alpha", "1.0.0-alpha.1", 
"1.0.0-alpha.beta",
+                                         "1.0.0-beta",  "1.0.0-beta.2",  
"1.0.0-beta.11",
+                                         "1.0.0-rc.1",  "1.0.0"};
+    for (size_t i = 0; i < examples.size() - 1; ++i) {
+        assertLessThan(examples[i], examples[i + 1]);
+        assertEqualTo(examples[i], examples[i]);
+    }
+    assertEqualTo(examples.back(), examples.back());
+}
+
+TEST_F(ParquetVersionTest, test_semver_build_info_examples) {
+    assertEqualTo("1.0.0-alpha+001", "1.0.0-alpha+001");
+    assertEqualTo("1.0.0-alpha", "1.0.0-alpha+001");
+    assertEqualTo("1.0.0+20130313144700", "1.0.0+20130313144700");
+    assertEqualTo("1.0.0", "1.0.0+20130313144700");
+    assertEqualTo("1.0.0-beta+exp.sha.5114f85", "1.0.0-beta+exp.sha.5114f85");
+    assertEqualTo("1.0.0-beta", "1.0.0-beta+exp.sha.5114f85");
+}
+
+TEST_F(ParquetVersionTest, test_unknown_comparisons) {
+    assertLessThan("1.0.0rc0-alpha+001", "1.0.0-alpha");
+}
+
+TEST_F(ParquetVersionTest, test_distribution_versions) {
+    assertEqualTo("1.5.0-cdh5.5.0", "1.5.0-cdh5.5.0");
+    assertLessThan("1.5.0-cdh5.5.0", "1.5.0-cdh5.5.1");
+    assertLessThan("1.5.0-cdh5.5.0", "1.5.0-cdh5.5.1-SNAPSHOT");
+    assertLessThan("1.5.0-cdh5.5.0", "1.5.0-cdh5.6.0");
+    assertLessThan("1.5.0-cdh5.5.0", "1.5.0-cdh6.0.0");
+    assertLessThan("1.5.0-cdh5.5.0", "1.5.0");
+    assertLessThan("1.5.0-cdh5.5.0", "1.5.0-cdh5.5.0-SNAPSHOT");
+}
+
+TEST_F(ParquetVersionTest, test_parse) {
+    std::unique_ptr<SemanticVersion> semantic_version;
+    Status status = SemanticVersion::parse("1.8.0", &semantic_version);
+    EXPECT_TRUE(status.ok());
+    EXPECT_EQ(*semantic_version, SemanticVersion(1, 8, 0));
+    status = SemanticVersion::parse("1.8.0rc3", &semantic_version);
+    EXPECT_TRUE(status.ok());
+    EXPECT_EQ(*semantic_version, SemanticVersion(1, 8, 0, true));
+    status = SemanticVersion::parse("1.8.0rc3-SNAPSHOT", &semantic_version);
+    EXPECT_TRUE(status.ok());
+    EXPECT_EQ(*semantic_version, SemanticVersion(1, 8, 0, "rc3", "SNAPSHOT", 
std::nullopt));
+    status = SemanticVersion::parse("1.8.0-SNAPSHOT", &semantic_version);
+    EXPECT_TRUE(status.ok());
+    EXPECT_EQ(*semantic_version, SemanticVersion(1, 8, 0, std::nullopt, 
"SNAPSHOT", std::nullopt));
+    status = SemanticVersion::parse("1.5.0-cdh5.5.0", &semantic_version);
+    EXPECT_TRUE(status.ok());
+    EXPECT_EQ(*semantic_version, SemanticVersion(1, 5, 0, std::nullopt, 
"cdh5.5.0", std::nullopt));
+}
+
+} // namespace vectorized
+} // namespace doris


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(doris) branch branch-2.1 updated: [Fix](parquet-reader) Fix and optimize parquet min-max filtering. (#39375)

Reply via email to