This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 3dc3e81734c [Improvement](datatype) Update Parser for IPv4/v6 data 
types (#29044)
3dc3e81734c is described below

commit 3dc3e81734c830e4790485a4d88acd5a3e83aae2
Author: yangshijie <[email protected]>
AuthorDate: Thu Dec 28 11:00:38 2023 +0800

    [Improvement](datatype) Update Parser for IPv4/v6 data types (#29044)
    
    Switch from parsing std::string to parsing char* to accelerate the 
parsing of IPv4/v6 data types.
---
 be/src/util/types.h             |   5 -
 be/src/vec/common/format_ip.h   | 162 +++++++++++++++++++++++++++-
 be/src/vec/runtime/ipv4_value.h |  89 ++++------------
 be/src/vec/runtime/ipv6_value.h | 228 ++++------------------------------------
 4 files changed, 201 insertions(+), 283 deletions(-)

diff --git a/be/src/util/types.h b/be/src/util/types.h
index b04c0644f6f..7688dd60390 100644
--- a/be/src/util/types.h
+++ b/be/src/util/types.h
@@ -44,9 +44,4 @@ inline int128_t get_int128_from_unalign(const void* address) {
     return value;
 }
 
-inline uint128_t get_uint128_from_unalign(const void* address) {
-    uint128_t value = 0;
-    memcpy(&value, address, sizeof(uint128_t));
-    return value;
-}
 } // namespace doris
diff --git a/be/src/vec/common/format_ip.h b/be/src/vec/common/format_ip.h
index ab9ac4b6595..58322ecd9fc 100644
--- a/be/src/vec/common/format_ip.h
+++ b/be/src/vec/common/format_ip.h
@@ -20,6 +20,7 @@
 
 #pragma once
 
+#include <vec/common/hex.h>
 #include <vec/common/string_utils/string_utils.h>
 
 #include <algorithm>
@@ -34,7 +35,7 @@ constexpr size_t IPV4_MAX_TEXT_LENGTH = 15; /// Does not 
count tail zero byte.
 constexpr size_t IPV6_MAX_TEXT_LENGTH = 39;
 constexpr size_t IPV4_MIN_NUM_VALUE = 0;          //num value of '0.0.0.0'
 constexpr size_t IPV4_MAX_NUM_VALUE = 4294967295; //num value of 
'255.255.255.255'
-constexpr int IPV4_MAX_OCTET_VALUE = 255;         //max vulue of octet
+constexpr int IPV4_MAX_OCTET_VALUE = 255;         //max value of octet
 constexpr size_t IPV4_OCTET_BITS = 8;
 constexpr size_t DECIMAL_BASE = 10;
 constexpr size_t IPV6_BINARY_LENGTH = 16;
@@ -198,4 +199,163 @@ inline bool parseIPv4whole(const char* src, unsigned 
char* dst) {
   */
 void formatIPv6(const unsigned char* src, char*& dst, uint8_t 
zeroed_tail_bytes_count = 0);
 
+/** Unsafe (no bounds-checking for src nor dst), optimized version of parsing 
IPv6 string.
+*
+* Parses the input string `src` and stores the binary big-endian value into the buffer 
pointed to by `dst`,
+* which should be long enough. In case of failure, zeroes IPV6_BINARY_LENGTH 
bytes of the buffer pointed to by `dst`.
+*
+* WARNING - this function is adapted to work with ReadBuffer, where src is the 
position reference (ReadBuffer::position())
+*           and eof is the ReadBuffer::eof() - therefore algorithm below does 
not rely on buffer's continuity.
+*           To parse strings use overloads below.
+*
+* @param src         - iterator (reference to pointer) over input string - 
warning - continuity is not guaranteed.
+* @param eof         - function returning true if iterator reached the end - 
warning - can break iterator's continuity.
+* @param dst         - where to put output bytes, expected to be non-null and 
at least IPV6_BINARY_LENGTH bytes long.
+* @param first_block - preparsed first block
+* @return            - true if parsed successfully, false otherwise.
+*/
+template <typename T, typename EOFfunction>
+    requires(std::is_same<typename std::remove_cv<T>::type, char>::value)
+inline bool parseIPv6(T*& src, EOFfunction eof, unsigned char* dst, int32_t 
first_block = -1) {
+    const auto clear_dst = [dst]() {
+        std::memset(dst, '\0', IPV6_BINARY_LENGTH);
+        return false;
+    };
+
+    if (src == nullptr || eof()) return clear_dst();
+
+    int groups = 0;            /// number of parsed groups
+    unsigned char* iter = dst; /// iterator over dst buffer
+    unsigned char* zptr =
+            nullptr; /// pointer into dst buffer array where all-zeroes block 
("::") is started
+
+    std::memset(dst, '\0', IPV6_BINARY_LENGTH);
+
+    if (first_block >= 0) {
+        *iter++ = static_cast<unsigned char>((first_block >> 8) & 0xffu);
+        *iter++ = static_cast<unsigned char>(first_block & 0xffu);
+        if (*src == ':') {
+            zptr = iter;
+            ++src;
+        }
+        ++groups;
+    }
+
+    bool group_start = true;
+
+    while (!eof() && groups < 8) {
+        if (*src == ':') {
+            ++src;
+            if (eof()) /// trailing colon is not allowed
+                return clear_dst();
+
+            group_start = true;
+
+            if (*src == ':') {
+                if (zptr != nullptr) /// multiple all-zeroes blocks are not 
allowed
+                    return clear_dst();
+                zptr = iter;
+                ++src;
+                continue;
+            }
+            if (groups == 0) /// leading colon is not allowed
+                return clear_dst();
+        }
+
+        if (*src == '.') /// mixed IPv4 parsing
+        {
+            if (groups <= 1 && zptr == nullptr) /// IPv4 block can't be the 
first
+                return clear_dst();
+
+            if (group_start) /// first octet of IPv4 should be already parsed 
as an IPv6 group
+                return clear_dst();
+
+            ++src;
+            if (eof()) return clear_dst();
+
+            /// last parsed group should be reinterpreted as a decimal value - 
it's the first octet of IPv4
+            --groups;
+            iter -= 2;
+
+            UInt16 num = 0;
+            for (int i = 0; i < 2; ++i) {
+                unsigned char first = (iter[i] >> 4) & 0x0fu;
+                unsigned char second = iter[i] & 0x0fu;
+                if (first > 9 || second > 9) return clear_dst();
+                (num *= 100) += first * 10 + second;
+            }
+            if (num > 255) return clear_dst();
+
+            /// parse IPv4 with known first octet
+            if (!parseIPv4(src, eof, iter, num)) return clear_dst();
+
+            if constexpr (std::endian::native == std::endian::little)
+                std::reverse(iter, iter + IPV4_BINARY_LENGTH);
+
+            iter += 4;
+            groups += 2;
+            break; /// IPv4 block is the last - end of parsing
+        }
+
+        if (!group_start) /// end of parsing
+            break;
+        group_start = false;
+
+        UInt16 val = 0;  /// current decoded group
+        int xdigits = 0; /// number of decoded hex digits in current group
+
+        for (; !eof() && xdigits < 4; ++src, ++xdigits) {
+            UInt8 num = unhex(*src);
+            if (num == 0xFF) break;
+            (val <<= 4) |= num;
+        }
+
+        if (xdigits == 0) /// end of parsing
+            break;
+
+        *iter++ = static_cast<unsigned char>((val >> 8) & 0xffu);
+        *iter++ = static_cast<unsigned char>(val & 0xffu);
+        ++groups;
+    }
+
+    /// either all 8 groups or all-zeroes block should be present
+    if (groups < 8 && zptr == nullptr) return clear_dst();
+
+    if (zptr != nullptr) /// process all-zeroes block
+    {
+        size_t msize = iter - zptr;
+        std::memmove(dst + IPV6_BINARY_LENGTH - msize, zptr, msize);
+        std::memset(zptr, '\0', IPV6_BINARY_LENGTH - (iter - dst));
+    }
+
+    return true;
+}
+
+/// returns pointer to the position right after the parsed sequence, or null on failed 
parsing
+inline const char* parseIPv6(const char* src, const char* end, unsigned char* 
dst) {
+    if (parseIPv6(
+                src, [&src, end]() { return src == end; }, dst))
+        return src;
+    return nullptr;
+}
+
+/// returns true if whole buffer was parsed successfully
+inline bool parseIPv6whole(const char* src, const char* end, unsigned char* 
dst) {
+    return parseIPv6(src, end, dst) == end;
+}
+
+/// returns pointer to the position right after the parsed sequence, or null on failed 
parsing
+inline const char* parseIPv6(const char* src, unsigned char* dst) {
+    if (parseIPv6(
+                src, []() { return false; }, dst))
+        return src;
+    return nullptr;
+}
+
+/// returns true if whole null-terminated string was parsed successfully
+inline bool parseIPv6whole(const char* src, unsigned char* dst) {
+    const char* end = parseIPv6(src, dst);
+    return end != nullptr && *end == '\0';
+}
+
 } // namespace doris::vectorized
diff --git a/be/src/vec/runtime/ipv4_value.h b/be/src/vec/runtime/ipv4_value.h
index fb42ed66f9c..9304aa2c18a 100644
--- a/be/src/vec/runtime/ipv4_value.h
+++ b/be/src/vec/runtime/ipv4_value.h
@@ -17,14 +17,13 @@
 
 #pragma once
 
-#include <stdint.h>
-
 #include <algorithm>
 #include <regex>
 #include <sstream>
 #include <string>
 
 #include "util/string_parser.hpp"
+#include "vec/common/format_ip.h"
 
 namespace doris {
 
@@ -34,87 +33,41 @@ public:
 
     explicit IPv4Value(vectorized::IPv4 ipv4) { _value = ipv4; }
 
-    explicit IPv4Value(std::string ipv4) {}
-
-    [[nodiscard]] const vectorized::IPv4& value() const { return _value; }
+    const vectorized::IPv4& value() const { return _value; }
 
     vectorized::IPv4& value() { return _value; }
 
     void set_value(vectorized::IPv4 ipv4) { _value = ipv4; }
 
-    bool from_string(std::string ipv4) { return from_string(_value, ipv4); }
-
-    [[nodiscard]] std::string to_string() const { return to_string(_value); }
+    bool from_string(const std::string& ipv4_str) { return from_string(_value, 
ipv4_str); }
 
-    static bool from_string(vectorized::IPv4& value, std::string ipv4) {
-        remove_ipv4_space(ipv4);
+    std::string to_string() const { return to_string(_value); }
 
-        // shortest ipv4 string is `0.0.0.0` whose length is 7
-        if (ipv4.size() < 7 || !is_valid_string(ipv4)) {
+    static bool from_string(vectorized::IPv4& value, const std::string& 
ipv4_str) {
+        if (ipv4_str.empty()) {
             return false;
         }
-
-        vectorized::IPv4 octets[4] = {0};
-        std::istringstream iss(ipv4);
-        std::string octet;
-        uint8_t octet_index = 0;
-
-        while (getline(iss, octet, '.')) {
-            if (octet_index >= 4) {
-                return false;
-            }
-
-            StringParser::ParseResult result;
-            vectorized::IPv4 val = 
StringParser::string_to_unsigned_int<vectorized::IPv4>(
-                    octet.c_str(), octet.length(), &result);
-            if (result != StringParser::PARSE_SUCCESS || val > 255) {
-                return false;
-            }
-
-            octets[octet_index++] = val;
-        }
-
-        if (octet_index != 4) {
+        int64_t parse_value;
+        const char* src = ipv4_str.c_str();
+        const char* end = ipv4_str.c_str() + ipv4_str.size() - 1;
+        while (std::isspace(*src)) ++src;
+        while (std::isspace(*end)) --end;
+        if (!vectorized::parseIPv4whole(src, ++end,
+                                        reinterpret_cast<unsigned 
char*>(&parse_value))) {
             return false;
         }
-
-        value = (octets[0] << 24) | (octets[1] << 16) | (octets[2] << 8) | 
octets[3];
+        value = static_cast<vectorized::IPv4>(parse_value);
         return true;
     }
 
     static std::string to_string(vectorized::IPv4 value) {
-        std::stringstream ss;
-        ss << ((value >> 24) & 0xFF) << '.' << ((value >> 16) & 0xFF) << '.'
-           << ((value >> 8) & 0xFF) << '.' << (value & 0xFF);
-        return ss.str();
-    }
-
-    static void remove_ipv4_space(std::string& ipv4) {
-        if (ipv4.empty()) {
-            return;
-        }
-
-        std::string special_chars = "\r\n\t ";
-
-        size_t pos = ipv4.find_first_not_of(special_chars);
-        if (pos != std::string::npos) {
-            ipv4.erase(0, pos);
-        }
-
-        pos = ipv4.find_last_not_of(special_chars);
-        if (pos != std::string::npos) {
-            ipv4.erase(pos + 1);
-        }
-    }
-
-    static bool is_valid_string(std::string ipv4) {
-        static std::regex IPV4_STD_REGEX(
-                
"^((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-"
-                "9]?)$");
-        if (ipv4.size() > 15 || !std::regex_match(ipv4, IPV4_STD_REGEX)) {
-            return false;
-        }
-        return true;
+        char buf[IPV4_MAX_TEXT_LENGTH + 1];
+        char* start = buf;
+        char* end = buf;
+        const auto* src = reinterpret_cast<const unsigned char*>(&value);
+        vectorized::formatIPv4(src, end);
+        size_t len = end - start;
+        return {buf, len};
     }
 
 private:
diff --git a/be/src/vec/runtime/ipv6_value.h b/be/src/vec/runtime/ipv6_value.h
index 8839ab4e2b0..8aaa8a26b69 100644
--- a/be/src/vec/runtime/ipv6_value.h
+++ b/be/src/vec/runtime/ipv6_value.h
@@ -17,12 +17,11 @@
 
 #pragma once
 
-#include <stdint.h>
-
 #include <regex>
 #include <sstream>
 #include <string>
 
+#include "vec/common/format_ip.h"
 #include "vec/core/types.h"
 #include "vec/data_types/data_type.h"
 #include "vec/data_types/data_type_number_base.h"
@@ -35,224 +34,35 @@ public:
 
     explicit IPv6Value(vectorized::IPv6 ipv6) { _value = ipv6; }
 
-    [[nodiscard]] const vectorized::IPv6& value() const { return _value; }
+    const vectorized::IPv6& value() const { return _value; }
 
     vectorized::IPv6& value() { return _value; }
 
     void set_value(vectorized::IPv6 ipv6) { _value = ipv6; }
 
-    bool from_string(std::string ipv6) { return from_string(_value, ipv6); }
-
-    bool from_binary_string(std::string ipv6_binary) {
-        return from_binary_string(_value, ipv6_binary);
-    }
-
-    static bool from_string(vectorized::IPv6& x, std::string ipv6) {
-        remove_ipv6_space(ipv6);
-
-        if (ipv6.empty() || !is_valid_string(ipv6)) {
-            return false;
-        }
-
-        std::transform(ipv6.begin(), ipv6.end(), ipv6.begin(),
-                       [](unsigned char ch) { return std::tolower(ch); });
-        std::istringstream iss(ipv6);
-        std::string field;
-        uint16_t fields[8] = {0};
-        uint8_t zero_index = 0;
-        uint8_t num_field = 0;
-        uint8_t right_field_num = 0;
-
-        while (num_field < 8) {
-            if (!getline(iss, field, ':')) {
-                break;
-            }
-
-            if (field.empty()) {
-                zero_index = num_field;
-                fields[num_field++] = 0;
-            } else {
-                try {
-                    if (field.size() > 4 || field > "ffff") {
-                        return false;
-                    }
-
-                    fields[num_field++] = std::stoi(field, nullptr, 16);
-                } catch (const std::exception& /*e*/) {
-                    return false;
-                }
-            }
-        }
-
-        if (zero_index != 0) {
-            right_field_num = num_field - zero_index - 1;
-
-            for (uint8_t i = 7; i > 7 - right_field_num; --i) {
-                fields[i] = fields[zero_index + right_field_num + i - 7];
-                fields[zero_index + right_field_num + i - 7] = 0;
-            }
-        }
-
-        uint64_t high = (static_cast<uint64_t>(fields[0]) << 48) |
-                        (static_cast<uint64_t>(fields[1]) << 32) |
-                        (static_cast<uint64_t>(fields[2]) << 16) | 
static_cast<uint64_t>(fields[3]);
-        uint64_t low = (static_cast<uint64_t>(fields[4]) << 48) |
-                       (static_cast<uint64_t>(fields[5]) << 32) |
-                       (static_cast<uint64_t>(fields[6]) << 16) | 
static_cast<uint64_t>(fields[7]);
-
-        x = static_cast<vectorized::IPv6>(high) << 64 | low;
-        return true;
-    }
+    bool from_string(const std::string& ipv6_str) { return from_string(_value, 
ipv6_str); }
 
-    static bool from_binary_string(vectorized::IPv6& x, std::string 
ipv6_binary_str) {
-        // Accepts a FixedString(16) value containing the IPv6 address in 
binary format
-        if (ipv6_binary_str.size() != 16) {
+    static bool from_string(vectorized::IPv6& value, const std::string& 
ipv6_str) {
+        if (ipv6_str.empty()) {
             return false;
         }
-
-        uint64_t high = 0;
-        uint64_t low = 0;
-
-        const uint8_t* ipv6_binary = reinterpret_cast<const 
uint8_t*>(ipv6_binary_str.c_str());
-
-        for (int i = 0; i < 8; ++i) {
-            high |= (static_cast<uint64_t>(ipv6_binary[i]) << (56 - i * 8));
-        }
-
-        for (int i = 8; i < 16; ++i) {
-            low |= (static_cast<uint64_t>(ipv6_binary[i]) << (56 - (i - 8) * 
8));
-        }
-
-        x = static_cast<vectorized::IPv6>(high) << 64 | low;
-        return true;
-    }
-
-    [[nodiscard]] std::string to_string() const { return to_string(_value); }
-
-    static std::string to_string(vectorized::IPv6 x) {
-        // "0000:0000:0000:0000:0000:0000:0000:0000"
-        if (x == 0) {
-            return "::";
-        }
-
-        uint64_t low = static_cast<uint64_t>(x);
-        uint64_t high = static_cast<uint64_t>(x >> 64);
-
-        uint16_t fields[8] = {static_cast<uint16_t>((high >> 48) & 0xFFFF),
-                              static_cast<uint16_t>((high >> 32) & 0xFFFF),
-                              static_cast<uint16_t>((high >> 16) & 0xFFFF),
-                              static_cast<uint16_t>(high & 0xFFFF),
-                              static_cast<uint16_t>((low >> 48) & 0xFFFF),
-                              static_cast<uint16_t>((low >> 32) & 0xFFFF),
-                              static_cast<uint16_t>((low >> 16) & 0xFFFF),
-                              static_cast<uint16_t>(low & 0xFFFF)};
-
-        uint8_t zero_start = 0, zero_end = 0;
-
-        while (zero_start < 8 && zero_end < 8) {
-            if (fields[zero_start] != 0) {
-                zero_start++;
-                zero_end = zero_start;
-                continue;
-            }
-
-            while (zero_end < 7 && fields[zero_end + 1] == 0) {
-                zero_end++;
-            }
-
-            if (zero_end > zero_start) {
-                break;
-            }
-
-            zero_start++;
-            zero_end = zero_start;
-        }
-
-        std::stringstream ss;
-
-        if (zero_start == zero_end) {
-            for (uint8_t i = 0; i < 7; ++i) {
-                ss << std::hex << fields[i] << ":";
-            }
-            ss << std::hex << fields[7];
-        } else {
-            for (uint8_t i = 0; i < zero_start; ++i) {
-                ss << std::hex << fields[i] << ":";
-            }
-
-            if (zero_end == 7) {
-                ss << ":";
-            } else {
-                for (uint8_t j = zero_end + 1; j < 8; ++j) {
-                    ss << std::hex << ":" << fields[j];
-                }
-            }
-        }
-
-        return ss.str();
-    }
-
-    [[nodiscard]] std::string to_binary_string() const { return 
to_binary_string(_value); }
-
-    static std::string to_binary_string(vectorized::IPv6 x) {
-        uint64_t low = static_cast<uint64_t>(x);
-        uint64_t high = static_cast<uint64_t>(x >> 64);
-
-        uint8_t fields[16] = {static_cast<uint8_t>((high >> 56) & 0xFF),
-                              static_cast<uint8_t>((high >> 48) & 0xFF),
-                              static_cast<uint8_t>((high >> 40) & 0xFF),
-                              static_cast<uint8_t>((high >> 32) & 0xFF),
-                              static_cast<uint8_t>((high >> 24) & 0xFF),
-                              static_cast<uint8_t>((high >> 16) & 0xFF),
-                              static_cast<uint8_t>((high >> 8) & 0xFF),
-                              static_cast<uint8_t>(high & 0xFF),
-                              static_cast<uint8_t>((low >> 56) & 0xFF),
-                              static_cast<uint8_t>((low >> 48) & 0xFF),
-                              static_cast<uint8_t>((low >> 40) & 0xFF),
-                              static_cast<uint8_t>((low >> 32) & 0xFF),
-                              static_cast<uint8_t>((low >> 24) & 0xFF),
-                              static_cast<uint8_t>((low >> 16) & 0xFF),
-                              static_cast<uint8_t>((low >> 8) & 0xFF),
-                              static_cast<uint8_t>(low & 0xFF)};
-
-        std::stringstream ss;
-
-        for (int i = 0; i < 16; ++i) {
-            ss << (char)fields[i];
-        }
-
-        return ss.str();
+        const char* src = ipv6_str.c_str();
+        const char* end = ipv6_str.c_str() + ipv6_str.size() - 1;
+        while (std::isspace(*src)) ++src;
+        while (std::isspace(*end)) --end;
+        return vectorized::parseIPv6whole(src, ++end, 
reinterpret_cast<unsigned char*>(&value));
     }
 
-    static void remove_ipv6_space(std::string& ipv6) {
-        if (ipv6.empty()) {
-            return;
-        }
-
-        std::string special_chars = "\r\n\t ";
+    std::string to_string() const { return to_string(_value); }
 
-        size_t pos = ipv6.find_first_not_of(special_chars);
-        if (pos != std::string::npos) {
-            ipv6.erase(0, pos);
-        }
-
-        pos = ipv6.find_last_not_of(special_chars);
-        if (pos != std::string::npos) {
-            ipv6.erase(pos + 1);
-        }
-    }
-
-    static bool is_valid_string(std::string ipv6) {
-        static std::regex 
IPV6_STD_REGEX("^([0-9a-fA-F]{1,4}:){7}[0-9a-fA-F]{1,4}$");
-        static std::regex IPV6_COMPRESS_REGEX(
-                
"^(([0-9A-Fa-f]{1,4}(:[0-9A-Fa-f]{1,4})*)?)::((([0-9A-Fa-f]{1,4}:)*[0-9A-Fa-f]{1,4}"
-                ")?)$");
-
-        if (ipv6.size() > 39 || !(std::regex_match(ipv6, IPV6_STD_REGEX) ||
-                                  std::regex_match(ipv6, 
IPV6_COMPRESS_REGEX))) {
-            return false;
-        }
-        return true;
+    static std::string to_string(vectorized::IPv6 value) {
+        char buf[IPV6_MAX_TEXT_LENGTH + 1];
+        char* start = buf;
+        char* end = buf;
+        const auto* src = reinterpret_cast<const unsigned char*>(&value);
+        vectorized::formatIPv6(src, end);
+        size_t len = end - start;
+        return {buf, len};
     }
 
 private:


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to