[GitHub] [doris] github-actions[bot] commented on a diff in pull request #11579: [Feature](NGram BloomFilter Index) add new ngram bloom filter index to speed up like query

GitBox Fri, 16 Dec 2022 03:50:35 -0800


github-actions[bot] commented on code in PR #11579:
URL: https://github.com/apache/doris/pull/11579#discussion_r1050666869



##########
be/src/olap/itoken_extractor.cpp:
##########
@@ -0,0 +1,75 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "itoken_extractor.h"
+
+#include "util/simd/vstring_function.h"
+
+namespace doris {
+
+bool NgramTokenExtractor::nextInString(const char* data, size_t length, 
size_t* __restrict pos,
+                                       size_t* __restrict token_start,
+                                       size_t* __restrict token_length) const {
+    *token_start = *pos;
+    *token_length = 0;
+    size_t code_points = 0;
+    for (; code_points < n && *token_start + *token_length < length; 
++code_points) {
+        size_t sz = 
get_utf8_byte_length(static_cast<uint8_t>(data[*token_start + *token_length]));
+        *token_length += sz;
+    }
+    *pos += get_utf8_byte_length(static_cast<uint8_t>(data[*pos]));
+    return code_points == n;
+}
+
+bool NgramTokenExtractor::nextInStringLike(const char* data, size_t length, 
size_t* pos,
+                                           std::string& token) const {
+    token.clear();
+
+    size_t code_points = 0;
+    bool escaped = false;
+    for (size_t i = *pos; i < length;) {
+        if (escaped && (data[i] == '%' || data[i] == '_' || data[i] == '\\')) {
+            token += data[i];
+            ++code_points;
+            escaped = false;
+            ++i;
+        } else if (!escaped && (data[i] == '%' || data[i] == '_')) {
+            /// This token is too small, go to the next.
+            token.clear();
+            code_points = 0;
+            escaped = false;
+            *pos = ++i;
+        } else if (!escaped && data[i] == '\\') {
+            escaped = true;
+            ++i;
+        } else {
+            const size_t sz = 
get_utf8_byte_length(static_cast<uint8_t>(data[i]));
+            for (size_t j = 0; j < sz; ++j) token += data[i + j];

Review Comment:
   warning: statement should be inside braces 
[readability-braces-around-statements]
   
   ```suggestion
               for (size_t j = 0; j < sz; ++j) { token += data[i + j];
   }
   ```
   



##########
be/src/olap/itoken_extractor.h:
##########
@@ -0,0 +1,96 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef DORIS_ITOKEN_EXTRACTOR_H
+#define DORIS_ITOKEN_EXTRACTOR_H
+
+#include <stddef.h>
+
+#include <string>
+
+#include "olap/rowset/segment_v2/bloom_filter.h"
+
+namespace doris {
+
+/// Interface for string parsers.
+struct ITokenExtractor {
+    virtual ~ITokenExtractor() = default;
+
+    /// Fast inplace implementation for regular use.
+    /// Gets string (data ptr and len) and start position for extracting next 
token (state of extractor).
+    /// Returns false if parsing is finished, otherwise returns true.
+    virtual bool nextInString(const char* data, size_t length, size_t* 
__restrict pos,
+                              size_t* __restrict token_start,
+                              size_t* __restrict token_length) const = 0;
+
+    /// Special implementation for creating bloom filter for LIKE function.
+    /// It skips unescaped `%` and `_` and supports escaping symbols, but it 
is less lightweight.
+    virtual bool nextInStringLike(const char* data, size_t length, size_t* pos,
+                                  std::string& out) const = 0;
+
+    virtual void stringToBloomFilter(const char* data, size_t length,
+                                     segment_v2::BloomFilter& bloom_filter) 
const = 0;
+
+    virtual bool stringLikeToBloomFilter(const char* data, size_t length,
+                                         segment_v2::BloomFilter& 
bloom_filter) const = 0;
+};
+
+template <typename Derived>
+class ITokenExtractorHelper : public ITokenExtractor {
+public:
+    void stringToBloomFilter(const char* data, size_t length,
+                             segment_v2::BloomFilter& bloom_filter) const 
override {
+        size_t cur = 0;
+        size_t token_start = 0;
+        size_t token_len = 0;
+
+        while (cur < length && static_cast<const Derived*>(this)->nextInString(
+                                       data, length, &cur, &token_start, 
&token_len))
+            bloom_filter.add_bytes(data + token_start, token_len);
+    }
+
+    bool stringLikeToBloomFilter(const char* data, size_t length,
+                                 segment_v2::BloomFilter& bloom_filter) const 
override {
+        size_t cur = 0;
+        bool added = false;
+        std::string token;
+        while (cur < length &&
+               static_cast<const Derived*>(this)->nextInStringLike(data, 
length, &cur, token)) {
+            bloom_filter.add_bytes(token.data(), token.size());
+            added = true;
+        }
+
+        return added;
+    }
+};
+
+/// Parser extracting all ngrams from string.
+struct NgramTokenExtractor final : public 
ITokenExtractorHelper<NgramTokenExtractor> {
+public:
+    explicit NgramTokenExtractor(size_t n_) : n(n_) {}
+
+    bool nextInString(const char* data, size_t length, size_t* __restrict pos,
+                      size_t* __restrict token_start, size_t* __restrict 
token_length) const;
+
+    bool nextInStringLike(const char* data, size_t length, size_t* pos, 
std::string& token) const;

Review Comment:
   warning: annotate this function with 'override' or (rarely) 'final' 
[modernize-use-override]
   
   ```suggestion
       bool nextInStringLike(const char* data, size_t length, size_t* pos, 
std::string& token) const override;
   ```
   



##########
be/src/olap/rowset/segment_v2/ngram_bloom_filter.cpp:
##########
@@ -0,0 +1,72 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "olap/rowset/segment_v2/ngram_bloom_filter.h"
+
+#include "util/cityhash102/city.h"
+#include "util/debug_util.h"
+
+namespace doris {
+namespace segment_v2 {
+
+static constexpr uint64_t SEED_GEN = 217728422;
+
+NGramBloomFilter::NGramBloomFilter(size_t size)
+        : _size(size),
+          words((size + sizeof(UnderType) - 1) / sizeof(UnderType)),
+          filter(words, 0) {}
+
+// for read
+Status NGramBloomFilter::init(const char* buf, uint32_t size, HashStrategyPB 
strategy) {
+    if (size == 0) {
+        return Status::InvalidArgument(strings::Substitute("invalid size:$0", 
size));
+    }
+    DCHECK(_size == size);
+
+    if (strategy != CITY_HASH_64) {
+        return Status::InvalidArgument(strings::Substitute("invalid 
strategy:$0", strategy));
+    }
+    words = (_size + sizeof(UnderType) - 1) / sizeof(UnderType);
+    filter.reserve(words);
+    const UnderType* from = reinterpret_cast<const UnderType*>(buf);
+    for (size_t i = 0; i < words; ++i) {
+        filter[i] = from[i];
+    }
+
+    return Status::OK();
+}
+
+void NGramBloomFilter::add_bytes(const char* data, uint32_t len) {
+    size_t hash1 = CityHash_v1_0_2::CityHash64WithSeed(data, len, 0);
+    size_t hash2 = CityHash_v1_0_2::CityHash64WithSeed(data, len, SEED_GEN);
+
+    for (size_t i = 0; i < HASH_FUNCTIONS; ++i) {
+        size_t pos = (hash1 + i * hash2 + i * i) % (8 * _size);
+        filter[pos / (8 * sizeof(UnderType))] |= (1ULL << (pos % (8 * 
sizeof(UnderType))));
+    }
+}
+
+bool NGramBloomFilter::contains(const BloomFilter& bf_) const {
+    const NGramBloomFilter& bf = static_cast<const NGramBloomFilter&>(bf_);
+    for (size_t i = 0; i < words; ++i) {
+        if ((filter[i] & bf.filter[i]) != bf.filter[i]) return false;

Review Comment:
   warning: statement should be inside braces 
[readability-braces-around-statements]
   
   ```suggestion
           if ((filter[i] & bf.filter[i]) != bf.filter[i]) { return false;
   }
   ```
   



##########
be/src/olap/tablet_schema.h:
##########
@@ -129,6 +132,18 @@ class TabletIndex {
     const IndexType index_type() const { return _index_type; }
     const vector<int32_t>& col_unique_ids() const { return _col_unique_ids; }
     const std::map<string, string>& properties() const { return _properties; }
+    int32_t get_gram_size() const {
+        if (_properties.count("gram_size"))

Review Comment:
   warning: statement should be inside braces 
[readability-braces-around-statements]
   
   ```suggestion
           if (_properties.count("gram_size")) {
   ```
   
   be/src/olap/tablet_schema.h:137:
   ```diff
   -         else
   +         } else
   ```
   



##########
be/src/olap/tablet_schema.h:
##########
@@ -129,6 +132,18 @@
     const IndexType index_type() const { return _index_type; }
     const vector<int32_t>& col_unique_ids() const { return _col_unique_ids; }
     const std::map<string, string>& properties() const { return _properties; }
+    int32_t get_gram_size() const {
+        if (_properties.count("gram_size"))
+            return std::stoi(_properties.at("gram_size"));
+        else
+            return 0;
+    }
+    int32_t get_gram_bf_size() const {
+        if (_properties.count("bf_size"))
+            return std::stoi(_properties.at("bf_size"));
+        else
+            return 0;

Review Comment:
   warning: statement should be inside braces 
[readability-braces-around-statements]
   
   ```suggestion
           else {
               return 0;
   }
   ```
   



##########
be/src/olap/itoken_extractor.h:
##########
@@ -0,0 +1,96 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef DORIS_ITOKEN_EXTRACTOR_H
+#define DORIS_ITOKEN_EXTRACTOR_H
+
+#include <stddef.h>
+
+#include <string>
+
+#include "olap/rowset/segment_v2/bloom_filter.h"
+
+namespace doris {
+
+/// Interface for string parsers.
+struct ITokenExtractor {
+    virtual ~ITokenExtractor() = default;
+
+    /// Fast inplace implementation for regular use.
+    /// Gets string (data ptr and len) and start position for extracting next 
token (state of extractor).
+    /// Returns false if parsing is finished, otherwise returns true.
+    virtual bool nextInString(const char* data, size_t length, size_t* 
__restrict pos,
+                              size_t* __restrict token_start,
+                              size_t* __restrict token_length) const = 0;
+
+    /// Special implementation for creating bloom filter for LIKE function.
+    /// It skips unescaped `%` and `_` and supports escaping symbols, but it 
is less lightweight.
+    virtual bool nextInStringLike(const char* data, size_t length, size_t* pos,
+                                  std::string& out) const = 0;
+
+    virtual void stringToBloomFilter(const char* data, size_t length,
+                                     segment_v2::BloomFilter& bloom_filter) 
const = 0;
+
+    virtual bool stringLikeToBloomFilter(const char* data, size_t length,
+                                         segment_v2::BloomFilter& 
bloom_filter) const = 0;
+};
+
+template <typename Derived>
+class ITokenExtractorHelper : public ITokenExtractor {
+public:
+    void stringToBloomFilter(const char* data, size_t length,
+                             segment_v2::BloomFilter& bloom_filter) const 
override {
+        size_t cur = 0;
+        size_t token_start = 0;
+        size_t token_len = 0;
+
+        while (cur < length && static_cast<const Derived*>(this)->nextInString(
+                                       data, length, &cur, &token_start, 
&token_len))
+            bloom_filter.add_bytes(data + token_start, token_len);
+    }
+
+    bool stringLikeToBloomFilter(const char* data, size_t length,
+                                 segment_v2::BloomFilter& bloom_filter) const 
override {
+        size_t cur = 0;
+        bool added = false;
+        std::string token;
+        while (cur < length &&
+               static_cast<const Derived*>(this)->nextInStringLike(data, 
length, &cur, token)) {
+            bloom_filter.add_bytes(token.data(), token.size());
+            added = true;
+        }
+
+        return added;
+    }
+};
+
+/// Parser extracting all ngrams from string.
+struct NgramTokenExtractor final : public 
ITokenExtractorHelper<NgramTokenExtractor> {
+public:
+    explicit NgramTokenExtractor(size_t n_) : n(n_) {}
+
+    bool nextInString(const char* data, size_t length, size_t* __restrict pos,

Review Comment:
   warning: annotate this function with 'override' or (rarely) 'final' 
[modernize-use-override]
   
   be/src/olap/itoken_extractor.h:86:
   ```diff
   -                       size_t* __restrict token_start, size_t* __restrict 
token_length) const;
   +                       size_t* __restrict token_start, size_t* __restrict 
token_length) const override;
   ```
   



##########
be/src/olap/like_column_predicate.h:
##########
@@ -53,6 +53,20 @@ class LikeColumnPredicate : public ColumnPredicate {
     void evaluate_and_vec(const vectorized::IColumn& column, uint16_t size,
                           bool* flags) const override;
 
+    std::string get_search_str() const override {
+        return std::string(reinterpret_cast<char*>(pattern.ptr), pattern.len);
+    }
+    bool is_opposite() const { return _opposite; }
+
+    void set_page_ng_bf(std::unique_ptr<segment_v2::BloomFilter> src) override 
{
+        _page_ng_bf = std::move(src);
+    }
+    bool evaluate_and(const BloomFilter* bf) const override {
+        if (_page_ng_bf) return bf->contains(*_page_ng_bf);

Review Comment:
   warning: statement should be inside braces 
[readability-braces-around-statements]
   
   ```suggestion
           if (_page_ng_bf) { return bf->contains(*_page_ng_bf);
   }
   ```
   



##########
be/src/olap/itoken_extractor.h:
##########
@@ -0,0 +1,96 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef DORIS_ITOKEN_EXTRACTOR_H
+#define DORIS_ITOKEN_EXTRACTOR_H
+
+#include <stddef.h>
+
+#include <string>
+
+#include "olap/rowset/segment_v2/bloom_filter.h"
+
+namespace doris {
+
+/// Interface for string parsers.
+struct ITokenExtractor {
+    virtual ~ITokenExtractor() = default;
+
+    /// Fast inplace implementation for regular use.
+    /// Gets string (data ptr and len) and start position for extracting next 
token (state of extractor).
+    /// Returns false if parsing is finished, otherwise returns true.
+    virtual bool nextInString(const char* data, size_t length, size_t* 
__restrict pos,
+                              size_t* __restrict token_start,
+                              size_t* __restrict token_length) const = 0;
+
+    /// Special implementation for creating bloom filter for LIKE function.
+    /// It skips unescaped `%` and `_` and supports escaping symbols, but it 
is less lightweight.
+    virtual bool nextInStringLike(const char* data, size_t length, size_t* pos,
+                                  std::string& out) const = 0;
+
+    virtual void stringToBloomFilter(const char* data, size_t length,
+                                     segment_v2::BloomFilter& bloom_filter) 
const = 0;
+
+    virtual bool stringLikeToBloomFilter(const char* data, size_t length,
+                                         segment_v2::BloomFilter& 
bloom_filter) const = 0;
+};
+
+template <typename Derived>
+class ITokenExtractorHelper : public ITokenExtractor {
+public:
+    void stringToBloomFilter(const char* data, size_t length,
+                             segment_v2::BloomFilter& bloom_filter) const 
override {
+        size_t cur = 0;
+        size_t token_start = 0;
+        size_t token_len = 0;
+
+        while (cur < length && static_cast<const Derived*>(this)->nextInString(
+                                       data, length, &cur, &token_start, 
&token_len))
+            bloom_filter.add_bytes(data + token_start, token_len);

Review Comment:
   warning: statement should be inside braces 
[readability-braces-around-statements]
   
   ```suggestion
                                          data, length, &cur, &token_start, 
&token_len)) {
               bloom_filter.add_bytes(data + token_start, token_len);
   }
   ```
   



##########
be/src/olap/reader.cpp:
##########
@@ -463,8 +469,33 @@
     }
 
     // Function filter push down to storage engine
+    auto is_like_predicate = [](ColumnPredicate* _pred) {
+        if (dynamic_cast<LikeColumnPredicate<false>*>(_pred) ||
+            dynamic_cast<LikeColumnPredicate<true>*>(_pred))
+            return true;
+
+        return false;
+    };
+
     for (const auto& filter : read_params.function_filters) {
         _col_predicates.emplace_back(_parse_to_predicate(filter));
+        auto* pred = _col_predicates.back();
+        const auto& col = _tablet->tablet_schema()->column(pred->column_id());
+        auto is_like = is_like_predicate(pred);
+        auto* tablet_index = 
_tablet->tablet_schema()->get_ngram_bf_index(col.unique_id());
+
+        if (is_like && tablet_index && config::enable_query_like_bloom_filter) 
{
+            std::unique_ptr<segment_v2::BloomFilter> ng_bf;
+            std::string pattern = pred->get_search_str();
+            auto gram_bf_size = tablet_index->get_gram_bf_size();
+            auto gram_size = tablet_index->get_gram_size();
+
+            segment_v2::BloomFilter::create(segment_v2::NGRAM_BLOOM_FILTER, 
&ng_bf, gram_bf_size);
+            NgramTokenExtractor _token_extractor(gram_size);
+
+            if (_token_extractor.stringLikeToBloomFilter(pattern.data(), 
pattern.length(), *ng_bf))
+                pred->set_page_ng_bf(std::move(ng_bf));

Review Comment:
   warning: statement should be inside braces 
[readability-braces-around-statements]
   
   ```suggestion
               if (_token_extractor.stringLikeToBloomFilter(pattern.data(), 
pattern.length(), *ng_bf)) {
                   pred->set_page_ng_bf(std::move(ng_bf));
   }
   ```
   



##########
be/src/olap/rowset/segment_v2/bloom_filter.h:
##########
@@ -51,17 +51,24 @@ class BloomFilter {
     static const uint32_t MAXIMUM_BYTES = 128 * 1024 * 1024;
 
     // Factory function for BloomFilter
-    static Status create(BloomFilterAlgorithmPB algorithm, 
std::unique_ptr<BloomFilter>* bf);
+    static Status create(BloomFilterAlgorithmPB algorithm, 
std::unique_ptr<BloomFilter>* bf,
+                         size_t bf_size = 0);
 
     BloomFilter() : _data(nullptr), _num_bytes(0), _size(0), 
_has_null(nullptr) {}
 
-    virtual ~BloomFilter() { delete[] _data; }
+    virtual ~BloomFilter() {
+        if (_data) delete[] _data;

Review Comment:
   warning: statement should be inside braces 
[readability-braces-around-statements]
   
   ```suggestion
           if (_data) { delete[] _data;
   }
   ```
   



##########
be/src/olap/rowset/segment_v2/bloom_filter_index_writer.cpp:
##########
@@ -170,6 +170,62 @@ class BloomFilterIndexWriterImpl : public 
BloomFilterIndexWriter {
 
 } // namespace
 
+NGramBloomFilterIndexWriterImpl::NGramBloomFilterIndexWriterImpl(
+        const BloomFilterOptions& bf_options, uint8_t gram_size, uint16_t 
bf_size)
+        : _bf_options(bf_options),
+          _gram_size(gram_size),
+          _bf_size(bf_size),
+          _pool(),

Review Comment:
   warning: initializer for member '_pool' is redundant 
[readability-redundant-member-init]
   
   ```suggestion
             ,
   ```
   



##########
be/src/olap/rowset/segment_v2/bloom_filter_index_writer.h:
##########
@@ -58,5 +60,32 @@ class BloomFilterIndexWriter {
     DISALLOW_COPY_AND_ASSIGN(BloomFilterIndexWriter);
 };
 
+class NGramBloomFilterIndexWriterImpl : public BloomFilterIndexWriter {
+public:
+    static Status create(const BloomFilterOptions& bf_options, const TypeInfo* 
typeinfo,
+                         uint8_t gram_size, uint16_t gram_bf_size,
+                         std::unique_ptr<BloomFilterIndexWriter>* res);
+
+    NGramBloomFilterIndexWriterImpl(const BloomFilterOptions& bf_options, 
uint8_t gram_size,
+                                    uint16_t bf_size);
+    void add_values(const void* values, size_t count) override;
+    void add_nulls(uint32_t) override {}
+    Status flush() override;
+    Status finish(io::FileWriter* file_writer, ColumnIndexMetaPB* index_meta) 
override;
+    uint64_t size() override;
+
+    ~NGramBloomFilterIndexWriterImpl() = default;

Review Comment:
   warning: annotate this function with 'override' or (rarely) 'final' 
[modernize-use-override]
   
   ```suggestion
       ~NGramBloomFilterIndexWriterImpl() override = default;
   ```
   



##########
be/src/olap/reader.cpp:
##########
@@ -463,8 +469,33 @@ void TabletReader::_init_conditions_param(const 
ReaderParams& read_params) {
     }
 
     // Function filter push down to storage engine
+    auto is_like_predicate = [](ColumnPredicate* _pred) {
+        if (dynamic_cast<LikeColumnPredicate<false>*>(_pred) ||
+            dynamic_cast<LikeColumnPredicate<true>*>(_pred))
+            return true;

Review Comment:
   warning: statement should be inside braces 
[readability-braces-around-statements]
   
   ```suggestion
               dynamic_cast<LikeColumnPredicate<true>*>(_pred)) {
               return true;
   }
   ```
   



##########
be/src/olap/rowset/segment_v2/bloom_filter_index_writer.cpp:
##########
@@ -170,6 +170,62 @@
 
 } // namespace
 
+NGramBloomFilterIndexWriterImpl::NGramBloomFilterIndexWriterImpl(
+        const BloomFilterOptions& bf_options, uint8_t gram_size, uint16_t 
bf_size)
+        : _bf_options(bf_options),
+          _gram_size(gram_size),
+          _bf_size(bf_size),
+          _pool(),
+          _bf_buffer_size(0),
+          _token_extractor(gram_size) {
+    BloomFilter::create(NGRAM_BLOOM_FILTER, &_bf, bf_size);
+}
+
+void NGramBloomFilterIndexWriterImpl::add_values(const void* values, size_t 
count) {
+    const Slice* src = reinterpret_cast<const Slice*>(values);
+    for (int i = 0; i < count; ++i, ++src) {
+        if (src->size < _gram_size) continue;

Review Comment:
   warning: statement should be inside braces 
[readability-braces-around-statements]
   
   ```suggestion
           if (src->size < _gram_size) { continue;
   }
   ```
   



##########
be/src/olap/rowset/segment_v2/column_writer.cpp:
##########
@@ -296,9 +296,15 @@ Status ScalarColumnWriter::init() {
         RETURN_IF_ERROR(
                 BitmapIndexWriter::create(get_field()->type_info(), 
&_bitmap_index_builder));
     }
+
     if (_opts.need_bloom_filter) {
-        RETURN_IF_ERROR(BloomFilterIndexWriter::create(
-                BloomFilterOptions(), get_field()->type_info(), 
&_bloom_filter_index_builder));
+        if (_opts.is_ngram_bf_index)

Review Comment:
   warning: statement should be inside braces 
[readability-braces-around-statements]
   
   ```suggestion
           if (_opts.is_ngram_bf_index) {
   ```
   
   be/src/olap/rowset/segment_v2/column_writer.cpp:304:
   ```diff
   -         else
   +         } else
   ```
   



##########
be/src/olap/rowset/segment_v2/ngram_bloom_filter.h:
##########
@@ -0,0 +1,50 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "olap/rowset/segment_v2/bloom_filter.h"
+
+namespace doris {
+namespace segment_v2 {
+
+class NGramBloomFilter : public BloomFilter {
+public:
+    // Fixed hash function number
+    static const size_t HASH_FUNCTIONS = 2;
+    using UnderType = uint64_t;
+    NGramBloomFilter(size_t size);
+    void add_bytes(const char* data, uint32_t len) override;
+    bool contains(const BloomFilter& bf_) const override;
+    virtual Status init(const char* buf, uint32_t size, HashStrategyPB 
strategy) override;

Review Comment:
   warning: 'virtual' is redundant since the function is already declared 
'override' [modernize-use-override]
   
   ```suggestion
       Status init(const char* buf, uint32_t size, HashStrategyPB strategy) 
override;
   ```
   



##########
be/src/olap/rowset/segment_v2/column_writer.cpp:
##########
@@ -296,9 +296,15 @@
         RETURN_IF_ERROR(
                 BitmapIndexWriter::create(get_field()->type_info(), 
&_bitmap_index_builder));
     }
+
     if (_opts.need_bloom_filter) {
-        RETURN_IF_ERROR(BloomFilterIndexWriter::create(
-                BloomFilterOptions(), get_field()->type_info(), 
&_bloom_filter_index_builder));
+        if (_opts.is_ngram_bf_index)
+            RETURN_IF_ERROR(NGramBloomFilterIndexWriterImpl::create(
+                    BloomFilterOptions(), get_field()->type_info(), 
_opts.gram_size,
+                    _opts.gram_bf_size, &_bloom_filter_index_builder));
+        else

Review Comment:
   warning: statement should be inside braces 
[readability-braces-around-statements]
   
   ```suggestion
           else {
   ```
   
   be/src/olap/rowset/segment_v2/column_writer.cpp:306:
   ```diff
   -                     BloomFilterOptions(), get_field()->type_info(), 
&_bloom_filter_index_builder));
   +                     BloomFilterOptions(), get_field()->type_info(), 
&_bloom_filter_index_builder));
   + }
   ```
   



##########
be/src/util/cityhash102/city.h:
##########
@@ -0,0 +1,104 @@
+// Copyright (c) 2011 Google, Inc.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to 
deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+//
+// CityHash, by Geoff Pike and Jyrki Alakuijala
+//
+// This file provides a few functions for hashing strings. On x86-64
+// hardware in 2011, CityHash64() is faster than other high-quality
+// hash functions, such as Murmur.  This is largely due to higher
+// instruction-level parallelism.  CityHash64() and CityHash128() also perform
+// well on hash-quality tests.
+//
+// CityHash128() is optimized for relatively long strings and returns
+// a 128-bit hash.  For strings more than about 2000 bytes it can be
+// faster than CityHash64().
+//
+// Functions in the CityHash family are not suitable for cryptography.
+//
+// WARNING: This code has not been tested on big-endian platforms!
+// It is known to work well on little-endian platforms that have a small 
penalty
+// for unaligned reads, such as current Intel and AMD moderate-to-high-end 
CPUs.
+//
+// By the way, for some hash functions, given strings a and b, the hash
+// of a+b is easily derived from the hashes of a and b.  This property
+// doesn't hold for any hash functions in this file.
+
+#ifndef CITY_HASH_H_
+#define CITY_HASH_H_
+
+#include <stdlib.h>  // for size_t.
+#include <stdint.h>
+#include <utility>
+
+/** This is a version of CityHash that predates v1.0.3 algorithm change.
+  * Why we need exactly this version?
+  * Although hash values of CityHash are not recommended for storing 
persistently anywhere,
+  * it has already been used this way in ClickHouse:
+  * - for calculation of checksums of compressed chunks and for data parts;
+  * - this version of CityHash is exposed in cityHash64 function in ClickHouse 
SQL language;
+  * - and already used by many users for data ordering, sampling and sharding.
+  */
+namespace CityHash_v1_0_2
+{
+
+typedef uint8_t uint8;
+typedef uint32_t uint32;
+typedef uint64_t uint64;
+typedef std::pair<uint64, uint64> uint128;

Review Comment:
   warning: use 'using' instead of 'typedef' [modernize-use-using]
   
   ```suggestion
   using uint128 = std::pair<uint64, uint64>;
   ```
   



##########
be/src/util/cityhash102/city.h:
##########
@@ -0,0 +1,104 @@
+// Copyright (c) 2011 Google, Inc.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to 
deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+//
+// CityHash, by Geoff Pike and Jyrki Alakuijala
+//
+// This file provides a few functions for hashing strings. On x86-64
+// hardware in 2011, CityHash64() is faster than other high-quality
+// hash functions, such as Murmur.  This is largely due to higher
+// instruction-level parallelism.  CityHash64() and CityHash128() also perform
+// well on hash-quality tests.
+//
+// CityHash128() is optimized for relatively long strings and returns
+// a 128-bit hash.  For strings more than about 2000 bytes it can be
+// faster than CityHash64().
+//
+// Functions in the CityHash family are not suitable for cryptography.
+//
+// WARNING: This code has not been tested on big-endian platforms!
+// It is known to work well on little-endian platforms that have a small 
penalty
+// for unaligned reads, such as current Intel and AMD moderate-to-high-end 
CPUs.
+//
+// By the way, for some hash functions, given strings a and b, the hash
+// of a+b is easily derived from the hashes of a and b.  This property
+// doesn't hold for any hash functions in this file.
+
+#ifndef CITY_HASH_H_
+#define CITY_HASH_H_
+
+#include <stdlib.h>  // for size_t.
+#include <stdint.h>
+#include <utility>
+
+/** This is a version of CityHash that predates v1.0.3 algorithm change.
+  * Why we need exactly this version?
+  * Although hash values of CityHash are not recommended for storing 
persistently anywhere,
+  * it has already been used this way in ClickHouse:
+  * - for calculation of checksums of compressed chunks and for data parts;
+  * - this version of CityHash is exposed in cityHash64 function in ClickHouse 
SQL language;
+  * - and already used by many users for data ordering, sampling and sharding.
+  */
+namespace CityHash_v1_0_2
+{
+
+typedef uint8_t uint8;
+typedef uint32_t uint32;

Review Comment:
   warning: use 'using' instead of 'typedef' [modernize-use-using]
   
   ```suggestion
   using uint32 = uint32_t;
   ```
   



##########
be/src/olap/tablet_schema.h:
##########
@@ -129,6 +132,18 @@
     const IndexType index_type() const { return _index_type; }
     const vector<int32_t>& col_unique_ids() const { return _col_unique_ids; }
     const std::map<string, string>& properties() const { return _properties; }
+    int32_t get_gram_size() const {
+        if (_properties.count("gram_size"))
+            return std::stoi(_properties.at("gram_size"));
+        else
+            return 0;
+    }
+    int32_t get_gram_bf_size() const {
+        if (_properties.count("bf_size"))

Review Comment:
   warning: statement should be inside braces 
[readability-braces-around-statements]
   
   ```suggestion
           if (_properties.count("bf_size")) {
   ```
   
   be/src/olap/tablet_schema.h:143:
   ```diff
   -         else
   +         } else
   ```
   



##########
be/test/olap/itoken_extractor_test.cpp:
##########
@@ -0,0 +1,80 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "olap/itoken_extractor.h"
+
+#include <gtest/gtest.h>
+
+#include <string>
+
+#include "common/logging.h"
+#include "util/utf8_check.h"
+
+namespace doris {
+
+class TestITokenExtractor : public testing::Test {
+public:
+    virtual ~TestITokenExtractor() {}

Review Comment:
   warning: use '= default' to define a trivial destructor 
[modernize-use-equals-default]
   
   ```suggestion
       virtual ~TestITokenExtractor() = default;
   ```
   



##########
be/src/olap/tablet_schema.h:
##########
@@ -129,6 +132,18 @@
     const IndexType index_type() const { return _index_type; }
     const vector<int32_t>& col_unique_ids() const { return _col_unique_ids; }
     const std::map<string, string>& properties() const { return _properties; }
+    int32_t get_gram_size() const {
+        if (_properties.count("gram_size"))
+            return std::stoi(_properties.at("gram_size"));
+        else
+            return 0;

Review Comment:
   warning: statement should be inside braces 
[readability-braces-around-statements]
   
   ```suggestion
           else {
               return 0;
   }
   ```
   



##########
be/src/util/cityhash102/city.cc:
##########
@@ -0,0 +1,481 @@
+// Copyright (c) 2011 Google, Inc.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to 
deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+//
+// CityHash, by Geoff Pike and Jyrki Alakuijala
+//
+// This file provides CityHash64() and related functions.
+//
+// It's probably possible to create even faster hash functions by
+// writing a program that systematically explores some of the space of
+// possible hash functions, by using SIMD instructions, or by
+// compromising on hash quality.
+
+#include "config.h"
+#include "city.h"
+
+#include <algorithm>
+#include <string.h>  // for memcpy and memset
+
+using namespace std;
+
+
+#if !defined(WORDS_BIGENDIAN)
+
+#define uint32_in_expected_order(x) (x)
+#define uint64_in_expected_order(x) (x)
+
+#else
+
+#ifdef _MSC_VER
+#include <stdlib.h>
+#define bswap_32(x) _byteswap_ulong(x)
+#define bswap_64(x) _byteswap_uint64(x)
+
+#elif defined(__APPLE__)
+// Mac OS X / Darwin features
+#include <libkern/OSByteOrder.h>
+#define bswap_32(x) OSSwapInt32(x)
+#define bswap_64(x) OSSwapInt64(x)
+
+#else
+#include <byteswap.h>
+#endif
+
+#define uint32_in_expected_order(x) (bswap_32(x))
+#define uint64_in_expected_order(x) (bswap_64(x))
+
+#endif  // WORDS_BIGENDIAN
+
+#if !defined(LIKELY)
+#if HAVE_BUILTIN_EXPECT
+#define LIKELY(x) (__builtin_expect(!!(x), 1))
+#else
+#define LIKELY(x) (x)
+#endif
+#endif
+
+namespace CityHash_v1_0_2
+{
+
+static uint64 UNALIGNED_LOAD64(const char *p) {
+  uint64 result;
+  memcpy(&result, p, sizeof(result));
+  return result;
+}
+
+static uint32 UNALIGNED_LOAD32(const char *p) {
+  uint32 result;
+  memcpy(&result, p, sizeof(result));
+  return result;
+}
+
+static uint64 Fetch64(const char *p) {
+  return uint64_in_expected_order(UNALIGNED_LOAD64(p));
+}
+
+static uint32 Fetch32(const char *p) {
+  return uint32_in_expected_order(UNALIGNED_LOAD32(p));
+}
+
+// Some primes between 2^63 and 2^64 for various uses.
+static const uint64 k0 = 0xc3a5c85c97cb3127ULL;
+static const uint64 k1 = 0xb492b66fbe98f273ULL;
+static const uint64 k2 = 0x9ae16a3b2f90404fULL;
+static const uint64 k3 = 0xc949d7c7509e6557ULL;
+
+// Bitwise right rotate.  Normally this will compile to a single
+// instruction, especially if the shift is a manifest constant.
+static uint64 Rotate(uint64 val, int shift) {
+  // Avoid shifting by 64: doing so yields an undefined result.
+  return shift == 0 ? val : ((val >> shift) | (val << (64 - shift)));
+}
+
+// Equivalent to Rotate(), but requires the second arg to be non-zero.
+// On x86-64, and probably others, it's possible for this to compile
+// to a single instruction if both args are already in registers.
+static uint64 RotateByAtLeast1(uint64 val, int shift) {
+  return (val >> shift) | (val << (64 - shift));
+}
+
+static uint64 ShiftMix(uint64 val) {
+  return val ^ (val >> 47);
+}
+
+static uint64 HashLen16(uint64 u, uint64 v) {
+  return Hash128to64(uint128(u, v));
+}
+
+static uint64 HashLen0to16(const char *s, size_t len) {
+  if (len > 8) {
+    uint64 a = Fetch64(s);
+    uint64 b = Fetch64(s + len - 8);
+    return HashLen16(a, RotateByAtLeast1(b + len, len)) ^ b;
+  }
+  if (len >= 4) {
+    uint64 a = Fetch32(s);
+    return HashLen16(len + (a << 3), Fetch32(s + len - 4));
+  }
+  if (len > 0) {
+    uint8 a = s[0];
+    uint8 b = s[len >> 1];
+    uint8 c = s[len - 1];
+    uint32 y = static_cast<uint32>(a) + (static_cast<uint32>(b) << 8);
+    uint32 z = len + (static_cast<uint32>(c) << 2);
+    return ShiftMix(y * k2 ^ z * k3) * k2;
+  }
+  return k2;
+}
+
+// This probably works well for 16-byte strings as well, but it may be overkill
+// in that case.
+static uint64 HashLen17to32(const char *s, size_t len) {
+  uint64 a = Fetch64(s) * k1;
+  uint64 b = Fetch64(s + 8);
+  uint64 c = Fetch64(s + len - 8) * k2;
+  uint64 d = Fetch64(s + len - 16) * k0;
+  return HashLen16(Rotate(a - b, 43) + Rotate(c, 30) + d,
+                   a + Rotate(b ^ k3, 20) - c + len);
+}
+
+// Return a 16-byte hash for 48 bytes.  Quick and dirty.
+// Callers do best to use "random-looking" values for a and b.
+static pair<uint64, uint64> WeakHashLen32WithSeeds(
+    uint64 w, uint64 x, uint64 y, uint64 z, uint64 a, uint64 b) {
+  a += w;
+  b = Rotate(b + a + z, 21);
+  uint64 c = a;
+  a += x;
+  a += y;
+  b += Rotate(a, 44);
+  return make_pair(a + z, b + c);
+}
+
+// Return a 16-byte hash for s[0] ... s[31], a, and b.  Quick and dirty.
+static pair<uint64, uint64> WeakHashLen32WithSeeds(
+    const char* s, uint64 a, uint64 b) {
+  return WeakHashLen32WithSeeds(Fetch64(s),
+                                Fetch64(s + 8),
+                                Fetch64(s + 16),
+                                Fetch64(s + 24),
+                                a,
+                                b);
+}
+
+// Return an 8-byte hash for 33 to 64 bytes.
+static uint64 HashLen33to64(const char *s, size_t len) {
+  uint64 z = Fetch64(s + 24);
+  uint64 a = Fetch64(s) + (len + Fetch64(s + len - 16)) * k0;
+  uint64 b = Rotate(a + z, 52);
+  uint64 c = Rotate(a, 37);
+  a += Fetch64(s + 8);
+  c += Rotate(a, 7);
+  a += Fetch64(s + 16);
+  uint64 vf = a + z;
+  uint64 vs = b + Rotate(a, 31) + c;
+  a = Fetch64(s + 16) + Fetch64(s + len - 32);
+  z = Fetch64(s + len - 8);
+  b = Rotate(a + z, 52);
+  c = Rotate(a, 37);
+  a += Fetch64(s + len - 24);
+  c += Rotate(a, 7);
+  a += Fetch64(s + len - 16);
+  uint64 wf = a + z;
+  uint64 ws = b + Rotate(a, 31) + c;
+  uint64 r = ShiftMix((vf + ws) * k2 + (wf + vs) * k0);
+  return ShiftMix(r * k0 + vs) * k2;
+}
+
+uint64 CityHash64(const char *s, size_t len) {
+  if (len <= 32) {
+    if (len <= 16) {
+      return HashLen0to16(s, len);
+    } else {
+      return HashLen17to32(s, len);
+    }
+  } else if (len <= 64) {
+    return HashLen33to64(s, len);
+  }
+
+  // For strings over 64 bytes we hash the end first, and then as we
+  // loop we keep 56 bytes of state: v, w, x, y, and z.
+  uint64 x = Fetch64(s);
+  uint64 y = Fetch64(s + len - 16) ^ k1;
+  uint64 z = Fetch64(s + len - 56) ^ k0;
+  pair<uint64, uint64> v = WeakHashLen32WithSeeds(s + len - 64, len, y);
+  pair<uint64, uint64> w = WeakHashLen32WithSeeds(s + len - 32, len * k1, k0);
+  z += ShiftMix(v.second) * k1;
+  x = Rotate(z + x, 39) * k1;
+  y = Rotate(y, 33) * k1;
+
+  // Decrease len to the nearest multiple of 64, and operate on 64-byte chunks.
+  len = (len - 1) & ~static_cast<size_t>(63);
+  do {
+    x = Rotate(x + y + v.first + Fetch64(s + 16), 37) * k1;
+    y = Rotate(y + v.second + Fetch64(s + 48), 42) * k1;
+    x ^= w.second;
+    y ^= v.first;
+    z = Rotate(z ^ w.first, 33);
+    v = WeakHashLen32WithSeeds(s, v.second * k1, x + w.first);
+    w = WeakHashLen32WithSeeds(s + 32, z + w.second, y);
+    std::swap(z, x);
+    s += 64;
+    len -= 64;
+  } while (len != 0);
+  return HashLen16(HashLen16(v.first, w.first) + ShiftMix(y) * k1 + z,
+                   HashLen16(v.second, w.second) + x);
+}
+
+uint64 CityHash64WithSeed(const char *s, size_t len, uint64 seed) {
+  return CityHash64WithSeeds(s, len, k2, seed);
+}
+
+uint64 CityHash64WithSeeds(const char *s, size_t len,
+                           uint64 seed0, uint64 seed1) {
+  return HashLen16(CityHash64(s, len) - seed0, seed1);
+}
+
+// A subroutine for CityHash128().  Returns a decent 128-bit hash for strings
+// of any length representable in ssize_t.  Based on City and Murmur.
+static uint128 CityMurmur(const char *s, size_t len, uint128 seed) {
+  uint64 a = Uint128Low64(seed);
+  uint64 b = Uint128High64(seed);
+  uint64 c = 0;
+  uint64 d = 0;
+  ssize_t l = len - 16;
+  if (l <= 0) {  // len <= 16
+    a = ShiftMix(a * k1) * k1;
+    c = b * k1 + HashLen0to16(s, len);
+    d = ShiftMix(a + (len >= 8 ? Fetch64(s) : c));
+  } else {  // len > 16
+    c = HashLen16(Fetch64(s + len - 8) + k1, a);
+    d = HashLen16(b + len, c + Fetch64(s + len - 16));
+    a += d;
+    do {
+      a ^= ShiftMix(Fetch64(s) * k1) * k1;
+      a *= k1;
+      b ^= a;
+      c ^= ShiftMix(Fetch64(s + 8) * k1) * k1;
+      c *= k1;
+      d ^= c;
+      s += 16;
+      l -= 16;
+    } while (l > 0);
+  }
+  a = HashLen16(a, c);
+  b = HashLen16(d, b);
+  return uint128(a ^ b, HashLen16(b, a));
+}
+
+uint128 CityHash128WithSeed(const char *s, size_t len, uint128 seed) {
+  if (len < 128) {
+    return CityMurmur(s, len, seed);
+  }
+
+  // We expect len >= 128 to be the common case.  Keep 56 bytes of state:
+  // v, w, x, y, and z.
+  pair<uint64, uint64> v, w;
+  uint64 x = Uint128Low64(seed);
+  uint64 y = Uint128High64(seed);
+  uint64 z = len * k1;
+  v.first = Rotate(y ^ k1, 49) * k1 + Fetch64(s);
+  v.second = Rotate(v.first, 42) * k1 + Fetch64(s + 8);
+  w.first = Rotate(y + z, 35) * k1 + x;
+  w.second = Rotate(x + Fetch64(s + 88), 53) * k1;
+
+  // This is the same inner loop as CityHash64(), manually unrolled.
+  do {
+    x = Rotate(x + y + v.first + Fetch64(s + 16), 37) * k1;
+    y = Rotate(y + v.second + Fetch64(s + 48), 42) * k1;
+    x ^= w.second;
+    y ^= v.first;
+    z = Rotate(z ^ w.first, 33);
+    v = WeakHashLen32WithSeeds(s, v.second * k1, x + w.first);
+    w = WeakHashLen32WithSeeds(s + 32, z + w.second, y);
+    std::swap(z, x);
+    s += 64;
+    x = Rotate(x + y + v.first + Fetch64(s + 16), 37) * k1;
+    y = Rotate(y + v.second + Fetch64(s + 48), 42) * k1;
+    x ^= w.second;
+    y ^= v.first;
+    z = Rotate(z ^ w.first, 33);
+    v = WeakHashLen32WithSeeds(s, v.second * k1, x + w.first);
+    w = WeakHashLen32WithSeeds(s + 32, z + w.second, y);
+    std::swap(z, x);
+    s += 64;
+    len -= 128;
+  } while (LIKELY(len >= 128));
+  y += Rotate(w.first, 37) * k0 + z;
+  x += Rotate(v.first + z, 49) * k0;
+  // If 0 < len < 128, hash up to 4 chunks of 32 bytes each from the end of s.
+  for (size_t tail_done = 0; tail_done < len; ) {
+    tail_done += 32;
+    y = Rotate(y - x, 42) * k0 + v.second;
+    w.first += Fetch64(s + len - tail_done + 16);
+    x = Rotate(x, 49) * k0 + w.first;
+    w.first += v.first;
+    v = WeakHashLen32WithSeeds(s + len - tail_done, v.first, v.second);
+  }
+  // At this point our 48 bytes of state should contain more than
+  // enough information for a strong 128-bit hash.  We use two
+  // different 48-byte-to-8-byte hashes to get a 16-byte final result.
+  x = HashLen16(x, v.first);
+  y = HashLen16(y, w.first);
+  return uint128(HashLen16(x + v.second, w.second) + y,
+                 HashLen16(x + w.second, y + v.second));
+}
+
+uint128 CityHash128(const char *s, size_t len) {
+  if (len >= 16) {
+    return CityHash128WithSeed(s + 16,
+                               len - 16,
+                               uint128(Fetch64(s) ^ k3,
+                                       Fetch64(s + 8)));
+  } else if (len >= 8) {
+    return CityHash128WithSeed(NULL,

Review Comment:
   warning: use nullptr [modernize-use-nullptr]
   
   ```suggestion
       return CityHash128WithSeed(nullptr,
   ```
   



##########
be/src/util/cityhash102/city.h:
##########
@@ -0,0 +1,104 @@
+// Copyright (c) 2011 Google, Inc.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to 
deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+//
+// CityHash, by Geoff Pike and Jyrki Alakuijala
+//
+// This file provides a few functions for hashing strings. On x86-64
+// hardware in 2011, CityHash64() is faster than other high-quality
+// hash functions, such as Murmur.  This is largely due to higher
+// instruction-level parallelism.  CityHash64() and CityHash128() also perform
+// well on hash-quality tests.
+//
+// CityHash128() is optimized for relatively long strings and returns
+// a 128-bit hash.  For strings more than about 2000 bytes it can be
+// faster than CityHash64().
+//
+// Functions in the CityHash family are not suitable for cryptography.
+//
+// WARNING: This code has not been tested on big-endian platforms!
+// It is known to work well on little-endian platforms that have a small 
penalty
+// for unaligned reads, such as current Intel and AMD moderate-to-high-end 
CPUs.
+//
+// By the way, for some hash functions, given strings a and b, the hash
+// of a+b is easily derived from the hashes of a and b.  This property
+// doesn't hold for any hash functions in this file.
+
+#ifndef CITY_HASH_H_
+#define CITY_HASH_H_
+
+#include <stdlib.h>  // for size_t.
+#include <stdint.h>
+#include <utility>
+
+/** This is a version of CityHash that predates v1.0.3 algorithm change.
+  * Why we need exactly this version?
+  * Although hash values of CityHash are not recommended for storing 
persistently anywhere,
+  * it has already been used this way in ClickHouse:
+  * - for calculation of checksums of compressed chunks and for data parts;
+  * - this version of CityHash is exposed in cityHash64 function in ClickHouse 
SQL language;
+  * - and already used by many users for data ordering, sampling and sharding.
+  */
+namespace CityHash_v1_0_2
+{
+
+typedef uint8_t uint8;

Review Comment:
   warning: use 'using' instead of 'typedef' [modernize-use-using]
   
   ```suggestion
   using uint8 = uint8_t;
   ```
   



##########
be/src/util/cityhash102/city.h:
##########
@@ -0,0 +1,104 @@
+// Copyright (c) 2011 Google, Inc.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to 
deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+//
+// CityHash, by Geoff Pike and Jyrki Alakuijala
+//
+// This file provides a few functions for hashing strings. On x86-64
+// hardware in 2011, CityHash64() is faster than other high-quality
+// hash functions, such as Murmur.  This is largely due to higher
+// instruction-level parallelism.  CityHash64() and CityHash128() also perform
+// well on hash-quality tests.
+//
+// CityHash128() is optimized for relatively long strings and returns
+// a 128-bit hash.  For strings more than about 2000 bytes it can be
+// faster than CityHash64().
+//
+// Functions in the CityHash family are not suitable for cryptography.
+//
+// WARNING: This code has not been tested on big-endian platforms!
+// It is known to work well on little-endian platforms that have a small 
penalty
+// for unaligned reads, such as current Intel and AMD moderate-to-high-end 
CPUs.
+//
+// By the way, for some hash functions, given strings a and b, the hash
+// of a+b is easily derived from the hashes of a and b.  This property
+// doesn't hold for any hash functions in this file.
+
+#ifndef CITY_HASH_H_
+#define CITY_HASH_H_
+
+#include <stdlib.h>  // for size_t.
+#include <stdint.h>
+#include <utility>
+
+/** This is a version of CityHash that predates v1.0.3 algorithm change.
+  * Why we need exactly this version?
+  * Although hash values of CityHash are not recommended for storing 
persistently anywhere,
+  * it has already been used this way in ClickHouse:
+  * - for calculation of checksums of compressed chunks and for data parts;
+  * - this version of CityHash is exposed in cityHash64 function in ClickHouse 
SQL language;
+  * - and already used by many users for data ordering, sampling and sharding.
+  */
+namespace CityHash_v1_0_2
+{
+
+typedef uint8_t uint8;
+typedef uint32_t uint32;
+typedef uint64_t uint64;

Review Comment:
   warning: use 'using' instead of 'typedef' [modernize-use-using]
   
   ```suggestion
   using uint64 = uint64_t;
   ```
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[GitHub] [doris] github-actions[bot] commented on a diff in pull request #11579: [Feature](NGram BloomFilter Index) add new ngram bloom filter index to speed up like query

Reply via email to