This is an automated email from the ASF dual-hosted git repository.
morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 3dc3e81734c [Improvement](datatype) Update Parser for IPv4/v6 data
types (#29044)
3dc3e81734c is described below
commit 3dc3e81734c830e4790485a4d88acd5a3e83aae2
Author: yangshijie <[email protected]>
AuthorDate: Thu Dec 28 11:00:38 2023 +0800
[Improvement](datatype) Update Parser for IPv4/v6 data types (#29044)
Switch the parsers from operating on std::string to operating on char* in order
to accelerate the parsing of IPv4/IPv6 data types.
---
be/src/util/types.h | 5 -
be/src/vec/common/format_ip.h | 162 +++++++++++++++++++++++++++-
be/src/vec/runtime/ipv4_value.h | 89 ++++------------
be/src/vec/runtime/ipv6_value.h | 228 ++++------------------------------------
4 files changed, 201 insertions(+), 283 deletions(-)
diff --git a/be/src/util/types.h b/be/src/util/types.h
index b04c0644f6f..7688dd60390 100644
--- a/be/src/util/types.h
+++ b/be/src/util/types.h
@@ -44,9 +44,4 @@ inline int128_t get_int128_from_unalign(const void* address) {
return value;
}
-inline uint128_t get_uint128_from_unalign(const void* address) {
- uint128_t value = 0;
- memcpy(&value, address, sizeof(uint128_t));
- return value;
-}
} // namespace doris
diff --git a/be/src/vec/common/format_ip.h b/be/src/vec/common/format_ip.h
index ab9ac4b6595..58322ecd9fc 100644
--- a/be/src/vec/common/format_ip.h
+++ b/be/src/vec/common/format_ip.h
@@ -20,6 +20,7 @@
#pragma once
+#include <vec/common/hex.h>
#include <vec/common/string_utils/string_utils.h>
#include <algorithm>
@@ -34,7 +35,7 @@ constexpr size_t IPV4_MAX_TEXT_LENGTH = 15; /// Does not
count tail zero byte.
constexpr size_t IPV6_MAX_TEXT_LENGTH = 39;
constexpr size_t IPV4_MIN_NUM_VALUE = 0; //num value of '0.0.0.0'
constexpr size_t IPV4_MAX_NUM_VALUE = 4294967295; //num value of
'255.255.255.255'
-constexpr int IPV4_MAX_OCTET_VALUE = 255; //max vulue of octet
+constexpr int IPV4_MAX_OCTET_VALUE = 255; //max value of octet
constexpr size_t IPV4_OCTET_BITS = 8;
constexpr size_t DECIMAL_BASE = 10;
constexpr size_t IPV6_BINARY_LENGTH = 16;
@@ -198,4 +199,163 @@ inline bool parseIPv4whole(const char* src, unsigned
char* dst) {
*/
void formatIPv6(const unsigned char* src, char*& dst, uint8_t
zeroed_tail_bytes_count = 0);
+/** Unsafe (no bounds-checking for src nor dst), optimized version of parsing
IPv6 string.
+*
+* Parses the input string `src` and stores binary big-endian value into buffer
pointed by `dst`,
+* which should be long enough. In case of failure zeroes IPV6_BINARY_LENGTH
bytes of buffer pointed by `dst`.
+*
+* WARNING - this function is adapted to work with ReadBuffer, where src is the
position reference (ReadBuffer::position())
+* and eof is the ReadBuffer::eof() - therefore algorithm below does
not rely on buffer's continuity.
+* To parse strings use overloads below.
+*
+* @param src - iterator (reference to pointer) over input string -
warning - continuity is not guaranteed.
+* @param eof - function returning true if iterator reached the end -
warning - can break iterator's continuity.
+* @param dst - where to put output bytes, expected to be non-null and
at least IPV6_BINARY_LENGTH bytes long.
+* @param first_block - preparsed first block
+* @return - true if parsed successfully, false otherwise.
+*/
+template <typename T, typename EOFfunction>
+ requires(std::is_same<typename std::remove_cv<T>::type, char>::value)
+inline bool parseIPv6(T*& src, EOFfunction eof, unsigned char* dst, int32_t
first_block = -1) {
+ const auto clear_dst = [dst]() {
+ std::memset(dst, '\0', IPV6_BINARY_LENGTH);
+ return false;
+ };
+
+ if (src == nullptr || eof()) return clear_dst();
+
+ int groups = 0; /// number of parsed groups
+ unsigned char* iter = dst; /// iterator over dst buffer
+ unsigned char* zptr =
+ nullptr; /// pointer into dst buffer array where all-zeroes block
("::") is started
+
+ std::memset(dst, '\0', IPV6_BINARY_LENGTH);
+
+ if (first_block >= 0) {
+ *iter++ = static_cast<unsigned char>((first_block >> 8) & 0xffu);
+ *iter++ = static_cast<unsigned char>(first_block & 0xffu);
+ if (*src == ':') {
+ zptr = iter;
+ ++src;
+ }
+ ++groups;
+ }
+
+ bool group_start = true;
+
+ while (!eof() && groups < 8) {
+ if (*src == ':') {
+ ++src;
+ if (eof()) /// trailing colon is not allowed
+ return clear_dst();
+
+ group_start = true;
+
+ if (*src == ':') {
+ if (zptr != nullptr) /// multiple all-zeroes blocks are not
allowed
+ return clear_dst();
+ zptr = iter;
+ ++src;
+ continue;
+ }
+ if (groups == 0) /// leading colon is not allowed
+ return clear_dst();
+ }
+
+ if (*src == '.') /// mixed IPv4 parsing
+ {
+ if (groups <= 1 && zptr == nullptr) /// IPv4 block can't be the
first
+ return clear_dst();
+
+ if (group_start) /// first octet of IPv4 should be already parsed
as an IPv6 group
+ return clear_dst();
+
+ ++src;
+ if (eof()) return clear_dst();
+
+ /// last parsed group should be reinterpreted as a decimal value -
it's the first octet of IPv4
+ --groups;
+ iter -= 2;
+
+ UInt16 num = 0;
+ for (int i = 0; i < 2; ++i) {
+ unsigned char first = (iter[i] >> 4) & 0x0fu;
+ unsigned char second = iter[i] & 0x0fu;
+ if (first > 9 || second > 9) return clear_dst();
+ (num *= 100) += first * 10 + second;
+ }
+ if (num > 255) return clear_dst();
+
+ /// parse IPv4 with known first octet
+ if (!parseIPv4(src, eof, iter, num)) return clear_dst();
+
+ if constexpr (std::endian::native == std::endian::little)
+ std::reverse(iter, iter + IPV4_BINARY_LENGTH);
+
+ iter += 4;
+ groups += 2;
+ break; /// IPv4 block is the last - end of parsing
+ }
+
+ if (!group_start) /// end of parsing
+ break;
+ group_start = false;
+
+ UInt16 val = 0; /// current decoded group
+ int xdigits = 0; /// number of decoded hex digits in current group
+
+ for (; !eof() && xdigits < 4; ++src, ++xdigits) {
+ UInt8 num = unhex(*src);
+ if (num == 0xFF) break;
+ (val <<= 4) |= num;
+ }
+
+ if (xdigits == 0) /// end of parsing
+ break;
+
+ *iter++ = static_cast<unsigned char>((val >> 8) & 0xffu);
+ *iter++ = static_cast<unsigned char>(val & 0xffu);
+ ++groups;
+ }
+
+ /// either all 8 groups or all-zeroes block should be present
+ if (groups < 8 && zptr == nullptr) return clear_dst();
+
+ if (zptr != nullptr) /// process all-zeroes block
+ {
+ size_t msize = iter - zptr;
+ std::memmove(dst + IPV6_BINARY_LENGTH - msize, zptr, msize);
+ std::memset(zptr, '\0', IPV6_BINARY_LENGTH - (iter - dst));
+ }
+
+ return true;
+}
+
+/// returns pointer to the position right after the parsed sequence, or null on failed
parsing
+inline const char* parseIPv6(const char* src, const char* end, unsigned char*
dst) {
+ if (parseIPv6(
+ src, [&src, end]() { return src == end; }, dst))
+ return src;
+ return nullptr;
+}
+
+/// returns true if whole buffer was parsed successfully
+inline bool parseIPv6whole(const char* src, const char* end, unsigned char*
dst) {
+ return parseIPv6(src, end, dst) == end;
+}
+
+/// returns pointer to the position right after the parsed sequence, or null on failed
parsing
+inline const char* parseIPv6(const char* src, unsigned char* dst) {
+ if (parseIPv6(
+ src, []() { return false; }, dst))
+ return src;
+ return nullptr;
+}
+
+/// returns true if whole null-terminated string was parsed successfully
+inline bool parseIPv6whole(const char* src, unsigned char* dst) {
+ const char* end = parseIPv6(src, dst);
+ return end != nullptr && *end == '\0';
+}
+
} // namespace doris::vectorized
diff --git a/be/src/vec/runtime/ipv4_value.h b/be/src/vec/runtime/ipv4_value.h
index fb42ed66f9c..9304aa2c18a 100644
--- a/be/src/vec/runtime/ipv4_value.h
+++ b/be/src/vec/runtime/ipv4_value.h
@@ -17,14 +17,13 @@
#pragma once
-#include <stdint.h>
-
#include <algorithm>
#include <regex>
#include <sstream>
#include <string>
#include "util/string_parser.hpp"
+#include "vec/common/format_ip.h"
namespace doris {
@@ -34,87 +33,41 @@ public:
explicit IPv4Value(vectorized::IPv4 ipv4) { _value = ipv4; }
- explicit IPv4Value(std::string ipv4) {}
-
- [[nodiscard]] const vectorized::IPv4& value() const { return _value; }
+ const vectorized::IPv4& value() const { return _value; }
vectorized::IPv4& value() { return _value; }
void set_value(vectorized::IPv4 ipv4) { _value = ipv4; }
- bool from_string(std::string ipv4) { return from_string(_value, ipv4); }
-
- [[nodiscard]] std::string to_string() const { return to_string(_value); }
+ bool from_string(const std::string& ipv4_str) { return from_string(_value,
ipv4_str); }
- static bool from_string(vectorized::IPv4& value, std::string ipv4) {
- remove_ipv4_space(ipv4);
+ std::string to_string() const { return to_string(_value); }
- // shortest ipv4 string is `0.0.0.0` whose length is 7
- if (ipv4.size() < 7 || !is_valid_string(ipv4)) {
+ static bool from_string(vectorized::IPv4& value, const std::string&
ipv4_str) {
+ if (ipv4_str.empty()) {
return false;
}
-
- vectorized::IPv4 octets[4] = {0};
- std::istringstream iss(ipv4);
- std::string octet;
- uint8_t octet_index = 0;
-
- while (getline(iss, octet, '.')) {
- if (octet_index >= 4) {
- return false;
- }
-
- StringParser::ParseResult result;
- vectorized::IPv4 val =
StringParser::string_to_unsigned_int<vectorized::IPv4>(
- octet.c_str(), octet.length(), &result);
- if (result != StringParser::PARSE_SUCCESS || val > 255) {
- return false;
- }
-
- octets[octet_index++] = val;
- }
-
- if (octet_index != 4) {
+ int64_t parse_value;
+ const char* src = ipv4_str.c_str();
+ const char* end = ipv4_str.c_str() + ipv4_str.size() - 1;
+ while (std::isspace(*src)) ++src;
+ while (std::isspace(*end)) --end;
+ if (!vectorized::parseIPv4whole(src, ++end,
+ reinterpret_cast<unsigned
char*>(&parse_value))) {
return false;
}
-
- value = (octets[0] << 24) | (octets[1] << 16) | (octets[2] << 8) |
octets[3];
+ value = static_cast<vectorized::IPv4>(parse_value);
return true;
}
static std::string to_string(vectorized::IPv4 value) {
- std::stringstream ss;
- ss << ((value >> 24) & 0xFF) << '.' << ((value >> 16) & 0xFF) << '.'
- << ((value >> 8) & 0xFF) << '.' << (value & 0xFF);
- return ss.str();
- }
-
- static void remove_ipv4_space(std::string& ipv4) {
- if (ipv4.empty()) {
- return;
- }
-
- std::string special_chars = "\r\n\t ";
-
- size_t pos = ipv4.find_first_not_of(special_chars);
- if (pos != std::string::npos) {
- ipv4.erase(0, pos);
- }
-
- pos = ipv4.find_last_not_of(special_chars);
- if (pos != std::string::npos) {
- ipv4.erase(pos + 1);
- }
- }
-
- static bool is_valid_string(std::string ipv4) {
- static std::regex IPV4_STD_REGEX(
-
"^((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-"
- "9]?)$");
- if (ipv4.size() > 15 || !std::regex_match(ipv4, IPV4_STD_REGEX)) {
- return false;
- }
- return true;
+ char buf[IPV4_MAX_TEXT_LENGTH + 1];
+ char* start = buf;
+ char* end = buf;
+ const auto* src = reinterpret_cast<const unsigned char*>(&value);
+ vectorized::formatIPv4(src, end);
+ size_t len = end - start;
+ return {buf, len};
}
private:
diff --git a/be/src/vec/runtime/ipv6_value.h b/be/src/vec/runtime/ipv6_value.h
index 8839ab4e2b0..8aaa8a26b69 100644
--- a/be/src/vec/runtime/ipv6_value.h
+++ b/be/src/vec/runtime/ipv6_value.h
@@ -17,12 +17,11 @@
#pragma once
-#include <stdint.h>
-
#include <regex>
#include <sstream>
#include <string>
+#include "vec/common/format_ip.h"
#include "vec/core/types.h"
#include "vec/data_types/data_type.h"
#include "vec/data_types/data_type_number_base.h"
@@ -35,224 +34,35 @@ public:
explicit IPv6Value(vectorized::IPv6 ipv6) { _value = ipv6; }
- [[nodiscard]] const vectorized::IPv6& value() const { return _value; }
+ const vectorized::IPv6& value() const { return _value; }
vectorized::IPv6& value() { return _value; }
void set_value(vectorized::IPv6 ipv6) { _value = ipv6; }
- bool from_string(std::string ipv6) { return from_string(_value, ipv6); }
-
- bool from_binary_string(std::string ipv6_binary) {
- return from_binary_string(_value, ipv6_binary);
- }
-
- static bool from_string(vectorized::IPv6& x, std::string ipv6) {
- remove_ipv6_space(ipv6);
-
- if (ipv6.empty() || !is_valid_string(ipv6)) {
- return false;
- }
-
- std::transform(ipv6.begin(), ipv6.end(), ipv6.begin(),
- [](unsigned char ch) { return std::tolower(ch); });
- std::istringstream iss(ipv6);
- std::string field;
- uint16_t fields[8] = {0};
- uint8_t zero_index = 0;
- uint8_t num_field = 0;
- uint8_t right_field_num = 0;
-
- while (num_field < 8) {
- if (!getline(iss, field, ':')) {
- break;
- }
-
- if (field.empty()) {
- zero_index = num_field;
- fields[num_field++] = 0;
- } else {
- try {
- if (field.size() > 4 || field > "ffff") {
- return false;
- }
-
- fields[num_field++] = std::stoi(field, nullptr, 16);
- } catch (const std::exception& /*e*/) {
- return false;
- }
- }
- }
-
- if (zero_index != 0) {
- right_field_num = num_field - zero_index - 1;
-
- for (uint8_t i = 7; i > 7 - right_field_num; --i) {
- fields[i] = fields[zero_index + right_field_num + i - 7];
- fields[zero_index + right_field_num + i - 7] = 0;
- }
- }
-
- uint64_t high = (static_cast<uint64_t>(fields[0]) << 48) |
- (static_cast<uint64_t>(fields[1]) << 32) |
- (static_cast<uint64_t>(fields[2]) << 16) |
static_cast<uint64_t>(fields[3]);
- uint64_t low = (static_cast<uint64_t>(fields[4]) << 48) |
- (static_cast<uint64_t>(fields[5]) << 32) |
- (static_cast<uint64_t>(fields[6]) << 16) |
static_cast<uint64_t>(fields[7]);
-
- x = static_cast<vectorized::IPv6>(high) << 64 | low;
- return true;
- }
+ bool from_string(const std::string& ipv6_str) { return from_string(_value,
ipv6_str); }
- static bool from_binary_string(vectorized::IPv6& x, std::string
ipv6_binary_str) {
- // Accepts a FixedString(16) value containing the IPv6 address in
binary format
- if (ipv6_binary_str.size() != 16) {
+ static bool from_string(vectorized::IPv6& value, const std::string&
ipv6_str) {
+ if (ipv6_str.empty()) {
return false;
}
-
- uint64_t high = 0;
- uint64_t low = 0;
-
- const uint8_t* ipv6_binary = reinterpret_cast<const
uint8_t*>(ipv6_binary_str.c_str());
-
- for (int i = 0; i < 8; ++i) {
- high |= (static_cast<uint64_t>(ipv6_binary[i]) << (56 - i * 8));
- }
-
- for (int i = 8; i < 16; ++i) {
- low |= (static_cast<uint64_t>(ipv6_binary[i]) << (56 - (i - 8) *
8));
- }
-
- x = static_cast<vectorized::IPv6>(high) << 64 | low;
- return true;
- }
-
- [[nodiscard]] std::string to_string() const { return to_string(_value); }
-
- static std::string to_string(vectorized::IPv6 x) {
- // "0000:0000:0000:0000:0000:0000:0000:0000"
- if (x == 0) {
- return "::";
- }
-
- uint64_t low = static_cast<uint64_t>(x);
- uint64_t high = static_cast<uint64_t>(x >> 64);
-
- uint16_t fields[8] = {static_cast<uint16_t>((high >> 48) & 0xFFFF),
- static_cast<uint16_t>((high >> 32) & 0xFFFF),
- static_cast<uint16_t>((high >> 16) & 0xFFFF),
- static_cast<uint16_t>(high & 0xFFFF),
- static_cast<uint16_t>((low >> 48) & 0xFFFF),
- static_cast<uint16_t>((low >> 32) & 0xFFFF),
- static_cast<uint16_t>((low >> 16) & 0xFFFF),
- static_cast<uint16_t>(low & 0xFFFF)};
-
- uint8_t zero_start = 0, zero_end = 0;
-
- while (zero_start < 8 && zero_end < 8) {
- if (fields[zero_start] != 0) {
- zero_start++;
- zero_end = zero_start;
- continue;
- }
-
- while (zero_end < 7 && fields[zero_end + 1] == 0) {
- zero_end++;
- }
-
- if (zero_end > zero_start) {
- break;
- }
-
- zero_start++;
- zero_end = zero_start;
- }
-
- std::stringstream ss;
-
- if (zero_start == zero_end) {
- for (uint8_t i = 0; i < 7; ++i) {
- ss << std::hex << fields[i] << ":";
- }
- ss << std::hex << fields[7];
- } else {
- for (uint8_t i = 0; i < zero_start; ++i) {
- ss << std::hex << fields[i] << ":";
- }
-
- if (zero_end == 7) {
- ss << ":";
- } else {
- for (uint8_t j = zero_end + 1; j < 8; ++j) {
- ss << std::hex << ":" << fields[j];
- }
- }
- }
-
- return ss.str();
- }
-
- [[nodiscard]] std::string to_binary_string() const { return
to_binary_string(_value); }
-
- static std::string to_binary_string(vectorized::IPv6 x) {
- uint64_t low = static_cast<uint64_t>(x);
- uint64_t high = static_cast<uint64_t>(x >> 64);
-
- uint8_t fields[16] = {static_cast<uint8_t>((high >> 56) & 0xFF),
- static_cast<uint8_t>((high >> 48) & 0xFF),
- static_cast<uint8_t>((high >> 40) & 0xFF),
- static_cast<uint8_t>((high >> 32) & 0xFF),
- static_cast<uint8_t>((high >> 24) & 0xFF),
- static_cast<uint8_t>((high >> 16) & 0xFF),
- static_cast<uint8_t>((high >> 8) & 0xFF),
- static_cast<uint8_t>(high & 0xFF),
- static_cast<uint8_t>((low >> 56) & 0xFF),
- static_cast<uint8_t>((low >> 48) & 0xFF),
- static_cast<uint8_t>((low >> 40) & 0xFF),
- static_cast<uint8_t>((low >> 32) & 0xFF),
- static_cast<uint8_t>((low >> 24) & 0xFF),
- static_cast<uint8_t>((low >> 16) & 0xFF),
- static_cast<uint8_t>((low >> 8) & 0xFF),
- static_cast<uint8_t>(low & 0xFF)};
-
- std::stringstream ss;
-
- for (int i = 0; i < 16; ++i) {
- ss << (char)fields[i];
- }
-
- return ss.str();
+ const char* src = ipv6_str.c_str();
+ const char* end = ipv6_str.c_str() + ipv6_str.size() - 1;
+ while (std::isspace(*src)) ++src;
+ while (std::isspace(*end)) --end;
+ return vectorized::parseIPv6whole(src, ++end,
reinterpret_cast<unsigned char*>(&value));
}
- static void remove_ipv6_space(std::string& ipv6) {
- if (ipv6.empty()) {
- return;
- }
-
- std::string special_chars = "\r\n\t ";
+ std::string to_string() const { return to_string(_value); }
- size_t pos = ipv6.find_first_not_of(special_chars);
- if (pos != std::string::npos) {
- ipv6.erase(0, pos);
- }
-
- pos = ipv6.find_last_not_of(special_chars);
- if (pos != std::string::npos) {
- ipv6.erase(pos + 1);
- }
- }
-
- static bool is_valid_string(std::string ipv6) {
- static std::regex
IPV6_STD_REGEX("^([0-9a-fA-F]{1,4}:){7}[0-9a-fA-F]{1,4}$");
- static std::regex IPV6_COMPRESS_REGEX(
-
"^(([0-9A-Fa-f]{1,4}(:[0-9A-Fa-f]{1,4})*)?)::((([0-9A-Fa-f]{1,4}:)*[0-9A-Fa-f]{1,4}"
- ")?)$");
-
- if (ipv6.size() > 39 || !(std::regex_match(ipv6, IPV6_STD_REGEX) ||
- std::regex_match(ipv6,
IPV6_COMPRESS_REGEX))) {
- return false;
- }
- return true;
+ static std::string to_string(vectorized::IPv6 value) {
+ char buf[IPV6_MAX_TEXT_LENGTH + 1];
+ char* start = buf;
+ char* end = buf;
+ const auto* src = reinterpret_cast<const unsigned char*>(&value);
+ vectorized::formatIPv6(src, end);
+ size_t len = end - start;
+ return {buf, len};
}
private:
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]