This is an automated email from the ASF dual-hosted git repository.
dataroaring pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-3.0 by this push:
new 413552cabbd [feature](function) upper lower support utf8 input
(#49763)
413552cabbd is described below
commit 413552cabbde5dbe2fe6633ec4d4ff910aa87986
Author: Mryange <[email protected]>
AuthorDate: Mon Apr 7 16:05:14 2025 +0800
[feature](function) upper lower support utf8 input (#49763)
https://github.com/apache/doris/pull/49231
---
be/cmake/thirdparty.cmake | 4 +
be/src/vec/functions/function_string.cpp | 62 ++-
be/test/vec/function/function_string_test.cpp | 18 +-
dist/LICENSE-dist.txt | 1 +
dist/licenses/LICENSE-icu.txt | 542 +++++++++++++++++++++
.../fold_constant_string_arithmatic.groovy | 12 +
thirdparty/CHANGELOG.md | 4 +
thirdparty/build-thirdparty.sh | 20 +
thirdparty/vars.sh | 7 +
9 files changed, 655 insertions(+), 15 deletions(-)
diff --git a/be/cmake/thirdparty.cmake b/be/cmake/thirdparty.cmake
index 19f2a00012a..67db4aa2733 100644
--- a/be/cmake/thirdparty.cmake
+++ b/be/cmake/thirdparty.cmake
@@ -171,3 +171,7 @@ endif()
if ("${CMAKE_BUILD_TARGET_ARCH}" STREQUAL "x86" OR
"${CMAKE_BUILD_TARGET_ARCH}" STREQUAL "x86_64")
add_thirdparty(deflate)
endif()
+
+add_thirdparty(icuuc LIB64)
+add_thirdparty(icui18n LIB64)
+add_thirdparty(icudata LIB64)
\ No newline at end of file
diff --git a/be/src/vec/functions/function_string.cpp
b/be/src/vec/functions/function_string.cpp
index dad2d33c1cb..4777f0266af 100644
--- a/be/src/vec/functions/function_string.cpp
+++ b/be/src/vec/functions/function_string.cpp
@@ -20,9 +20,12 @@
#include <ctype.h>
#include <math.h>
#include <re2/stringpiece.h>
+#include <unicode/unistr.h>
+#include <unicode/ustream.h>
#include <bitset>
#include <cstddef>
+#include <cstdint>
#include <string_view>
#include "common/status.h"
@@ -446,20 +449,59 @@ struct TransferImpl {
return Status::OK();
}
+ const bool is_ascii = simd::VStringFunctions::is_ascii({data.data(),
data.size()});
res_offsets.resize(offset_size);
- memcpy_small_allow_read_write_overflow15(
- res_offsets.data(), offsets.data(),
- offset_size * sizeof(ColumnString::Offsets::value_type));
-
- size_t data_length = data.size();
- res_data.resize(data_length);
- if constexpr (std::is_same_v<OpName, NameToUpper>) {
- simd::VStringFunctions::to_upper(data.data(), data_length,
res_data.data());
- } else if constexpr (std::is_same_v<OpName, NameToLower>) {
- simd::VStringFunctions::to_lower(data.data(), data_length,
res_data.data());
+ if (is_ascii) {
+ memcpy_small_allow_read_write_overflow15(
+ res_offsets.data(), offsets.data(),
+ offset_size * sizeof(ColumnString::Offsets::value_type));
+
+ size_t data_length = data.size();
+ res_data.resize(data_length);
+ if constexpr (std::is_same_v<OpName, NameToUpper>) {
+ simd::VStringFunctions::to_upper(data.data(), data_length,
res_data.data());
+ } else if constexpr (std::is_same_v<OpName, NameToLower>) {
+ simd::VStringFunctions::to_lower(data.data(), data_length,
res_data.data());
+ }
+ } else {
+ execute_utf8(data, offsets, res_data, res_offsets);
}
+
return Status::OK();
}
+
+ static void execute_utf8(const ColumnString::Chars& data, const
ColumnString::Offsets& offsets,
+ ColumnString::Chars& res_data,
ColumnString::Offsets& res_offsets) {
+ std::string result;
+ for (int64_t i = 0; i < offsets.size(); ++i) {
+ const char* begin = reinterpret_cast<const char*>(&data[offsets[i
- 1]]);
+ uint32_t size = offsets[i] - offsets[i - 1];
+
+ result.clear();
+ if constexpr (std::is_same_v<OpName, NameToUpper>) {
+ to_upper_utf8(begin, size, result);
+ } else if constexpr (std::is_same_v<OpName, NameToLower>) {
+ to_lower_utf8(begin, size, result);
+ }
+ StringOP::push_value_string(result, i, res_data, res_offsets);
+ }
+ }
+
+ static void to_upper_utf8(const char* data, uint32_t size, std::string&
result) {
+ icu::StringPiece sp;
+ sp.set(data, size);
+ icu::UnicodeString unicode_str = icu::UnicodeString::fromUTF8(sp);
+ unicode_str.toUpper();
+ unicode_str.toUTF8String(result);
+ }
+
+ static void to_lower_utf8(const char* data, uint32_t size, std::string&
result) {
+ icu::StringPiece sp;
+ sp.set(data, size);
+ icu::UnicodeString unicode_str = icu::UnicodeString::fromUTF8(sp);
+ unicode_str.toLower();
+ unicode_str.toUTF8String(result);
+ }
};
// Capitalize first letter
diff --git a/be/test/vec/function/function_string_test.cpp
b/be/test/vec/function/function_string_test.cpp
index 687def9d4a5..03d448b19ef 100644
--- a/be/test/vec/function/function_string_test.cpp
+++ b/be/test/vec/function/function_string_test.cpp
@@ -496,6 +496,13 @@ TEST(function_string_test, function_string_lower_test) {
{{std::string("123ABC_")}, std::string("123abc_")},
{{std::string("MYtestSTR")}, std::string("myteststr")},
{{std::string("")}, std::string("")},
+ {{std::string("ÀÇ")}, std::string("àç")},
+ {{std::string("ÀÇAC123")}, std::string("àçac123")},
+ {{std::string("İstanbul")}, std::string("i̇stanbul")},
+ {{std::string("KIZILAY")}, std::string("kizilay")},
+ {{std::string("GROSSE")}, std::string("grosse")},
+ {{std::string("Å")}, std::string("å")},
+ {{std::string("ΣΟΦΟΣ")}, std::string("σοφος")},
{{Null()}, Null()},
//bug{{std::string("ΔΟΚΙΜΑΣΤΙΚΌ ΚΕΊΜΕΝΟ")},
std::string("δοκιμαστικό κείμενο")},
};
@@ -536,12 +543,13 @@ TEST(function_string_test, function_string_upper_test) {
{{std::string("โพสต์ทดสอบ")}, std::string("โพสต์ทดสอบ")},
{{std::string("יידיש טעקסט")}, std::string("יידיש טעקסט")},
//bug{{std::string("Exámplè wïth âccents")},
std::string("EXÁMPLÈ WÏTH ÂCCENTS")},
- {{std::string("ⓔⓧⓐⓜⓟⓛⓔ ⓦⓘⓣⓗ ⓒⓘⓡⓒⓛⓔ ⓛⓔⓣⓣⓔⓡⓢ")},
- std::string("ⓔⓧⓐⓜⓟⓛⓔ ⓦⓘⓣⓗ ⓒⓘⓡⓒⓛⓔ ⓛⓔⓣⓣⓔⓡⓢ")},
- {{std::string("🅴🆇🅰🅼🅿🅻🅴 🆆🅸🆃🅷 🆂🆀🆄🅰🆁🅴 🅻🅴🆃🆃🅴🆁🆂")},
- std::string("🅴🆇🅰🅼🅿🅻🅴 🆆🅸🆃🅷 🆂🆀🆄🅰🆁🅴 🅻🅴🆃🆃🅴🆁🆂")},
+ {{std::string("àç")}, std::string("ÀÇ")},
+ {{std::string("straße")}, std::string("STRASSE")},
+ {{std::string("àçac123")}, std::string("ÀÇAC123")},
+ {{std::string("ffi")}, std::string("FFI")},
+ {{std::string("Dž")}, std::string("DŽ")},
+ {{std::string("Ångström")}, std::string("ÅNGSTRÖM")},
};
-
check_function_all_arg_comb<DataTypeString, true>(func_name,
input_types, data_set);
check_function_all_arg_comb<DataTypeString,
true>(std::string("ucase"), input_types,
data_set);
diff --git a/dist/LICENSE-dist.txt b/dist/LICENSE-dist.txt
index 25cfe636d88..512897abb8f 100644
--- a/dist/LICENSE-dist.txt
+++ b/dist/LICENSE-dist.txt
@@ -1536,3 +1536,4 @@ Other dependencies:
* xxhash: 0.8.1 -- licenses/LICENSE-xxhash.txt
* concurrentqueue: 1.0.3 -- licenses/LICENSE-concurrentqueue.txt
* FlameGraph -- licenses/LICENSE-CDDL-1.0.txt
+ * icu 75.1 -- licenses/LICENSE-icu.txt
diff --git a/dist/licenses/LICENSE-icu.txt b/dist/licenses/LICENSE-icu.txt
new file mode 100644
index 00000000000..56cf04a96b1
--- /dev/null
+++ b/dist/licenses/LICENSE-icu.txt
@@ -0,0 +1,542 @@
+UNICODE LICENSE V3
+
+COPYRIGHT AND PERMISSION NOTICE
+
+Copyright © 2016-2024 Unicode, Inc.
+
+NOTICE TO USER: Carefully read the following legal agreement. BY
+DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING DATA FILES, AND/OR
+SOFTWARE, YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
+TERMS AND CONDITIONS OF THIS AGREEMENT. IF YOU DO NOT AGREE, DO NOT
+DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE THE DATA FILES OR SOFTWARE.
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of data files and any associated documentation (the "Data Files") or
+software and any associated documentation (the "Software") to deal in the
+Data Files or Software without restriction, including without limitation
+the rights to use, copy, modify, merge, publish, distribute, and/or sell
+copies of the Data Files or Software, and to permit persons to whom the
+Data Files or Software are furnished to do so, provided that either (a)
+this copyright and permission notice appear with all copies of the Data
+Files or Software, or (b) this copyright and permission notice appear in
+associated Documentation.
+
+THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
+KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF
+THIRD PARTY RIGHTS.
+
+IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE
+BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES,
+OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
+ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THE DATA
+FILES OR SOFTWARE.
+
+Except as contained in this notice, the name of a copyright holder shall
+not be used in advertising or otherwise to promote the sale, use or other
+dealings in these Data Files or Software without prior written
+authorization of the copyright holder.
+
+SPDX-License-Identifier: Unicode-3.0
+
+----------------------------------------------------------------------
+
+Third-Party Software Licenses
+
+This section contains third-party software notices and/or additional
+terms for licensed third-party software components included within ICU
+libraries.
+
+----------------------------------------------------------------------
+
+ICU License - ICU 1.8.1 to ICU 57.1
+
+COPYRIGHT AND PERMISSION NOTICE
+
+Copyright (c) 1995-2016 International Business Machines Corporation and others
+All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, and/or sell copies of the Software, and to permit persons
+to whom the Software is furnished to do so, provided that the above
+copyright notice(s) and this permission notice appear in all copies of
+the Software and that both the above copyright notice(s) and this
+permission notice appear in supporting documentation.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT
+OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY
+SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER
+RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF
+CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+Except as contained in this notice, the name of a copyright holder
+shall not be used in advertising or otherwise to promote the sale, use
+or other dealings in this Software without prior written authorization
+of the copyright holder.
+
+All trademarks and registered trademarks mentioned herein are the
+property of their respective owners.
+
+----------------------------------------------------------------------
+
+Chinese/Japanese Word Break Dictionary Data (cjdict.txt)
+
+ # The Google Chrome software developed by Google is licensed under
+ # the BSD license. Other software included in this distribution is
+ # provided under other licenses, as set forth below.
+ #
+ # The BSD License
+ # http://opensource.org/licenses/bsd-license.php
+ # Copyright (C) 2006-2008, Google Inc.
+ #
+ # All rights reserved.
+ #
+ # Redistribution and use in source and binary forms, with or without
+ # modification, are permitted provided that the following conditions are met:
+ #
+ # Redistributions of source code must retain the above copyright notice,
+ # this list of conditions and the following disclaimer.
+ # Redistributions in binary form must reproduce the above
+ # copyright notice, this list of conditions and the following
+ # disclaimer in the documentation and/or other materials provided with
+ # the distribution.
+ # Neither the name of Google Inc. nor the names of its
+ # contributors may be used to endorse or promote products derived from
+ # this software without specific prior written permission.
+ #
+ #
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
+ # CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ # INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ # BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ #
+ #
+ # The word list in cjdict.txt are generated by combining three word lists
+ # listed below with further processing for compound word breaking. The
+ # frequency is generated with an iterative training against Google web
+ # corpora.
+ #
+ # * Libtabe (Chinese)
+ # - https://sourceforge.net/project/?group_id=1519
+ # - Its license terms and conditions are shown below.
+ #
+ # * IPADIC (Japanese)
+ # - http://chasen.aist-nara.ac.jp/chasen/distribution.html
+ # - Its license terms and conditions are shown below.
+ #
+ # ---------COPYING.libtabe ---- BEGIN--------------------
+ #
+ # /*
+ # * Copyright (c) 1999 TaBE Project.
+ # * Copyright (c) 1999 Pai-Hsiang Hsiao.
+ # * All rights reserved.
+ # *
+ # * Redistribution and use in source and binary forms, with or without
+ # * modification, are permitted provided that the following conditions
+ # * are met:
+ # *
+ # * . Redistributions of source code must retain the above copyright
+ # * notice, this list of conditions and the following disclaimer.
+ # * . Redistributions in binary form must reproduce the above copyright
+ # * notice, this list of conditions and the following disclaimer in
+ # * the documentation and/or other materials provided with the
+ # * distribution.
+ # * . Neither the name of the TaBE Project nor the names of its
+ # * contributors may be used to endorse or promote products derived
+ # * from this software without specific prior written permission.
+ # *
+ # * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ # * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ # * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ # * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ # * REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ # * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ # * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ # * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ # * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ # * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ # * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ # * OF THE POSSIBILITY OF SUCH DAMAGE.
+ # */
+ #
+ # /*
+ # * Copyright (c) 1999 Computer Systems and Communication Lab,
+ # * Institute of Information Science, Academia
+ # * Sinica. All rights reserved.
+ # *
+ # * Redistribution and use in source and binary forms, with or without
+ # * modification, are permitted provided that the following conditions
+ # * are met:
+ # *
+ # * . Redistributions of source code must retain the above copyright
+ # * notice, this list of conditions and the following disclaimer.
+ # * . Redistributions in binary form must reproduce the above copyright
+ # * notice, this list of conditions and the following disclaimer in
+ # * the documentation and/or other materials provided with the
+ # * distribution.
+ # * . Neither the name of the Computer Systems and Communication Lab
+ # * nor the names of its contributors may be used to endorse or
+ # * promote products derived from this software without specific
+ # * prior written permission.
+ # *
+ # * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ # * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ # * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ # * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ # * REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ # * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ # * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ # * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ # * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ # * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ # * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ # * OF THE POSSIBILITY OF SUCH DAMAGE.
+ # */
+ #
+ # Copyright 1996 Chih-Hao Tsai @ Beckman Institute,
+ # University of Illinois
+ # [email protected] http://casper.beckman.uiuc.edu/~c-tsai4
+ #
+ # ---------------COPYING.libtabe-----END--------------------------------
+ #
+ #
+ # ---------------COPYING.ipadic-----BEGIN-------------------------------
+ #
+ # Copyright 2000, 2001, 2002, 2003 Nara Institute of Science
+ # and Technology. All Rights Reserved.
+ #
+ # Use, reproduction, and distribution of this software is permitted.
+ # Any copy of this software, whether in its original form or modified,
+ # must include both the above copyright notice and the following
+ # paragraphs.
+ #
+ # Nara Institute of Science and Technology (NAIST),
+ # the copyright holders, disclaims all warranties with regard to this
+ # software, including all implied warranties of merchantability and
+ # fitness, in no event shall NAIST be liable for
+ # any special, indirect or consequential damages or any damages
+ # whatsoever resulting from loss of use, data or profits, whether in an
+ # action of contract, negligence or other tortuous action, arising out
+ # of or in connection with the use or performance of this software.
+ #
+ # A large portion of the dictionary entries
+ # originate from ICOT Free Software. The following conditions for ICOT
+ # Free Software applies to the current dictionary as well.
+ #
+ # Each User may also freely distribute the Program, whether in its
+ # original form or modified, to any third party or parties, PROVIDED
+ # that the provisions of Section 3 ("NO WARRANTY") will ALWAYS appear
+ # on, or be attached to, the Program, which is distributed substantially
+ # in the same form as set out herein and that such intended
+ # distribution, if actually made, will neither violate or otherwise
+ # contravene any of the laws and regulations of the countries having
+ # jurisdiction over the User or the intended distribution itself.
+ #
+ # NO WARRANTY
+ #
+ # The program was produced on an experimental basis in the course of the
+ # research and development conducted during the project and is provided
+ # to users as so produced on an experimental basis. Accordingly, the
+ # program is provided without any warranty whatsoever, whether express,
+ # implied, statutory or otherwise. The term "warranty" used herein
+ # includes, but is not limited to, any warranty of the quality,
+ # performance, merchantability and fitness for a particular purpose of
+ # the program and the nonexistence of any infringement or violation of
+ # any right of any third party.
+ #
+ # Each user of the program will agree and understand, and be deemed to
+ # have agreed and understood, that there is no warranty whatsoever for
+ # the program and, accordingly, the entire risk arising from or
+ # otherwise connected with the program is assumed by the user.
+ #
+ # Therefore, neither ICOT, the copyright holder, or any other
+ # organization that participated in or was otherwise related to the
+ # development of the program and their respective officials, directors,
+ # officers and other employees shall be held liable for any and all
+ # damages, including, without limitation, general, special, incidental
+ # and consequential damages, arising out of or otherwise in connection
+ # with the use or inability to use the program or any product, material
+ # or result produced or otherwise obtained by using the program,
+ # regardless of whether they have been advised of, or otherwise had
+ # knowledge of, the possibility of such damages at any time during the
+ # project or thereafter. Each user will be deemed to have agreed to the
+ # foregoing by his or her commencement of use of the program. The term
+ # "use" as used herein includes, but is not limited to, the use,
+ # modification, copying and distribution of the program and the
+ # production of secondary products from the program.
+ #
+ # In the case where the program, whether in its original form or
+ # modified, was distributed or delivered to or received by a user from
+ # any person, organization or entity other than ICOT, unless it makes or
+ # grants independently of ICOT any specific warranty to the user in
+ # writing, such person, organization or entity, will also be exempted
+ # from and not be held liable to the user for any such damages as noted
+ # above as far as the program is concerned.
+ #
+ # ---------------COPYING.ipadic-----END----------------------------------
+
+----------------------------------------------------------------------
+
+Lao Word Break Dictionary Data (laodict.txt)
+
+ # Copyright (C) 2016 and later: Unicode, Inc. and others.
+ # License & terms of use: http://www.unicode.org/copyright.html
+ # Copyright (c) 2015 International Business Machines Corporation
+ # and others. All Rights Reserved.
+ #
+ # Project: https://github.com/rober42539/lao-dictionary
+ # Dictionary: https://github.com/rober42539/lao-dictionary/laodict.txt
+ # License: https://github.com/rober42539/lao-dictionary/LICENSE.txt
+ # (copied below)
+ #
+ # This file is derived from the above dictionary version of Nov 22, 2020
+ # ----------------------------------------------------------------------
+ # Copyright (C) 2013 Brian Eugene Wilson, Robert Martin Campbell.
+ # All rights reserved.
+ #
+ # Redistribution and use in source and binary forms, with or without
+ # modification, are permitted provided that the following conditions are met:
+ #
+ # Redistributions of source code must retain the above copyright notice, this
+ # list of conditions and the following disclaimer. Redistributions in binary
+ # form must reproduce the above copyright notice, this list of conditions and
+ # the following disclaimer in the documentation and/or other materials
+ # provided with the distribution.
+ #
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ # FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ # COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+ # INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ # STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ # OF THE POSSIBILITY OF SUCH DAMAGE.
+ # --------------------------------------------------------------------------
+
+----------------------------------------------------------------------
+
+Burmese Word Break Dictionary Data (burmesedict.txt)
+
+ # Copyright (c) 2014 International Business Machines Corporation
+ # and others. All Rights Reserved.
+ #
+ # This list is part of a project hosted at:
+ # github.com/kanyawtech/myanmar-karen-word-lists
+ #
+ # --------------------------------------------------------------------------
+ # Copyright (c) 2013, LeRoy Benjamin Sharon
+ # All rights reserved.
+ #
+ # Redistribution and use in source and binary forms, with or without
+ # modification, are permitted provided that the following conditions
+ # are met: Redistributions of source code must retain the above
+ # copyright notice, this list of conditions and the following
+ # disclaimer. Redistributions in binary form must reproduce the
+ # above copyright notice, this list of conditions and the following
+ # disclaimer in the documentation and/or other materials provided
+ # with the distribution.
+ #
+ # Neither the name Myanmar Karen Word Lists, nor the names of its
+ # contributors may be used to endorse or promote products derived
+ # from this software without specific prior written permission.
+ #
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
+ # CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ # INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS
+ # BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ # TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ # ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
+ # TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
+ # THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ # SUCH DAMAGE.
+ # --------------------------------------------------------------------------
+
+----------------------------------------------------------------------
+
+Time Zone Database
+
+ ICU uses the public domain data and code derived from Time Zone
+Database for its time zone support. The ownership of the TZ database
+is explained in BCP 175: Procedure for Maintaining the Time Zone
+Database section 7.
+
+ # 7. Database Ownership
+ #
+ # The TZ database itself is not an IETF Contribution or an IETF
+ # document. Rather it is a pre-existing and regularly updated work
+ # that is in the public domain, and is intended to remain in the
+ # public domain. Therefore, BCPs 78 [RFC5378] and 79 [RFC3979] do
+ # not apply to the TZ Database or contributions that individuals make
+ # to it. Should any claims be made and substantiated against the TZ
+ # Database, the organization that is providing the IANA
+ # Considerations defined in this RFC, under the memorandum of
+ # understanding with the IETF, currently ICANN, may act in accordance
+ # with all competent court orders. No ownership claims will be made
+ # by ICANN or the IETF Trust on the database or the code. Any person
+ # making a contribution to the database or code waives all rights to
+ # future claims in that contribution or in the TZ Database.
+
+----------------------------------------------------------------------
+
+Google double-conversion
+
+Copyright 2006-2011, the V8 project authors. All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above
+ copyright notice, this list of conditions and the following
+ disclaimer in the documentation and/or other materials provided
+ with the distribution.
+ * Neither the name of Google Inc. nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+----------------------------------------------------------------------
+
+JSON parsing library (nlohmann/json)
+
+File: vendor/json/upstream/single_include/nlohmann/json.hpp (only for ICU4C)
+
+MIT License
+
+Copyright (c) 2013-2022 Niels Lohmann
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+----------------------------------------------------------------------
+
+File: aclocal.m4 (only for ICU4C)
+Section: pkg.m4 - Macros to locate and utilise pkg-config.
+
+
+Copyright © 2004 Scott James Remnant <[email protected]>.
+Copyright © 2012-2015 Dan Nicholson <[email protected]>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+02111-1307, USA.
+
+As a special exception to the GNU General Public License, if you
+distribute this file as part of a program that contains a
+configuration script generated by Autoconf, you may include it under
+the same distribution terms that you use for the rest of that
+program.
+
+
+(The condition for the exception is fulfilled because
+ICU4C includes a configuration script generated by Autoconf,
+namely the `configure` script.)
+
+----------------------------------------------------------------------
+
+File: config.guess (only for ICU4C)
+
+
+This file is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, see <https://www.gnu.org/licenses/>.
+
+As a special exception to the GNU General Public License, if you
+distribute this file as part of a program that contains a
+configuration script generated by Autoconf, you may include it under
+the same distribution terms that you use for the rest of that
+program. This Exception is an additional permission under section 7
+of the GNU General Public License, version 3 ("GPLv3").
+
+
+(The condition for the exception is fulfilled because
+ICU4C includes a configuration script generated by Autoconf,
+namely the `configure` script.)
+
+----------------------------------------------------------------------
+
+File: install-sh (only for ICU4C)
+
+
+Copyright 1991 by the Massachusetts Institute of Technology
+
+Permission to use, copy, modify, distribute, and sell this software and its
+documentation for any purpose is hereby granted without fee, provided that
+the above copyright notice appear in all copies and that both that
+copyright notice and this permission notice appear in supporting
+documentation, and that the name of M.I.T. not be used in advertising or
+publicity pertaining to distribution of the software without specific,
+written prior permission. M.I.T. makes no representations about the
+suitability of this software for any purpose. It is provided "as is"
+without express or implied warranty.
\ No newline at end of file
diff --git
a/regression-test/suites/nereids_p0/expression/fold_constant/fold_constant_string_arithmatic.groovy
b/regression-test/suites/nereids_p0/expression/fold_constant/fold_constant_string_arithmatic.groovy
index 5693bf751d2..e2f382e0163 100644
---
a/regression-test/suites/nereids_p0/expression/fold_constant/fold_constant_string_arithmatic.groovy
+++
b/regression-test/suites/nereids_p0/expression/fold_constant/fold_constant_string_arithmatic.groovy
@@ -430,6 +430,13 @@ suite("fold_constant_string_arithmatic") {
testFoldConst("select lower(cast('AbC123' as string))")
testFoldConst("select lower(cast('Hello World' as string))")
testFoldConst("select lower('Hello World')")
+ testFoldConst("select lower('ÀÇ')")
+ testFoldConst("SELECT LOWER('İstanbul')")
+ testFoldConst("SELECT LOWER('KIZILAY')")
+ testFoldConst("SELECT LOWER('GROSSE')")
+ testFoldConst("SELECT LOWER('Dž')")
+ testFoldConst("SELECT LOWER('Å')")
+ testFoldConst("SELECT LOWER('ΣΟΦΟΣ')")
// lpad
testFoldConst("select lpad(cast('hi' as string), 1, cast('xy' as string))")
@@ -1293,6 +1300,11 @@ suite("fold_constant_string_arithmatic") {
testFoldConst("select unhex(NULL)")
testFoldConst("select upper(cast('Hello World' as string))")
testFoldConst("select upper('Hello World')")
+ testFoldConst("select upper('àç')")
+ testFoldConst("SELECT UPPER('ffi')")
+ testFoldConst("SELECT UPPER('straße')")
+ testFoldConst("SELECT UPPER('Dž')")
+ testFoldConst("SELECT UPPER('Ångström')")
// url_decode url_encode
testFoldConst("select
url_decode(cast('http%3A%2F%2Fwww.apache.org%2Flicenses%2FLICENSE-2.0' as
string))")
diff --git a/thirdparty/CHANGELOG.md b/thirdparty/CHANGELOG.md
index ddb495ccd51..7e6c3dd1f4f 100644
--- a/thirdparty/CHANGELOG.md
+++ b/thirdparty/CHANGELOG.md
@@ -2,6 +2,10 @@
This file contains version of the third-party dependency libraries in the
build-env image. The docker build-env image is apache/doris, and the tag is
`build-env-${version}`
+## 20250402
+
+- Added: icu 75-1, develop ICU tokenizer based on ICU library.
+
## 20250225
- Modified: hadoop-libs 3.3.6.4 -> 3.3.6.5
diff --git a/thirdparty/build-thirdparty.sh b/thirdparty/build-thirdparty.sh
index f019f5f1e26..d36e9fcd61d 100755
--- a/thirdparty/build-thirdparty.sh
+++ b/thirdparty/build-thirdparty.sh
@@ -1833,6 +1833,25 @@ build_dragonbox() {
"${BUILD_SYSTEM}" install
}
+# icu
+build_icu() {
+ check_if_source_exist "${ICU_SOURCE}"
+ cd "${TP_SOURCE_DIR}/${ICU_SOURCE}/icu4c/source"
+
+ rm -rf "${BUILD_DIR}"
+ mkdir -p "${BUILD_DIR}"
+ cd "${BUILD_DIR}"
+
+ ../configure --prefix="${TP_INSTALL_DIR}" \
+ --disable-shared \
+ --enable-static \
+ --disable-samples \
+ --disable-tests
+
+ make -j "${PARALLEL}"
+ make install
+}
+
if [[ "${#packages[@]}" -eq 0 ]]; then
packages=(
odbc
@@ -1902,6 +1921,7 @@ if [[ "${#packages[@]}" -eq 0 ]]; then
azure
dragonbox
brotli
+ icu
)
if [[ "$(uname -s)" == 'Darwin' ]]; then
read -r -a packages <<<"binutils gettext ${packages[*]}"
diff --git a/thirdparty/vars.sh b/thirdparty/vars.sh
index cc26ecc85a3..02f04d7cbf0 100644
--- a/thirdparty/vars.sh
+++ b/thirdparty/vars.sh
@@ -520,6 +520,12 @@ DRAGONBOX_NAME=dragonbox-1.1.3.tar.gz
DRAGONBOX_SOURCE=dragonbox-1.1.3
DRAGONBOX_MD5SUM="889dc00db9612c6949a4ccf8115e0e6a"
+# icu
+ICU_DOWNLOAD="https://github.com/unicode-org/icu/archive/refs/tags/release-75-1.tar.gz"
+ICU_NAME=release-75-1.tar.gz
+ICU_SOURCE=icu-release-75-1
+ICU_MD5SUM="4003649b8731f938c852748ffa393847"
+
# all thirdparties which need to be downloaded is set in array TP_ARCHIVES
export TP_ARCHIVES=(
'LIBEVENT'
@@ -597,6 +603,7 @@ export TP_ARCHIVES=(
'BASE64'
'AZURE'
'DRAGONBOX'
+ 'ICU'
)
if [[ "$(uname -s)" == 'Darwin' ]]; then
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]