This is an automated email from the ASF dual-hosted git repository.
airborne pushed a commit to branch clucene
in repository https://gitbox.apache.org/repos/asf/doris-thirdparty.git
The following commit(s) were added to refs/heads/clucene by this push:
new 3236e18d93b [opt](inverted index) Refactor ICU tokenizer code location for better organization and maintainability. (#283)
3236e18d93b is described below
commit 3236e18d93bf96481493d88c34b6c2515f3b0b75
Author: zzzxl <[email protected]>
AuthorDate: Thu Feb 20 10:28:45 2025 +0800
[opt](inverted index) Refactor ICU tokenizer code location for better organization and maintainability. (#283)
---
CMakeLists.txt | 3 +-
.../analysis/icu/DefaultICUTokenizerConfig.cpp | 42 +++++++++++++++-------
src/core/CMakeLists.txt | 7 +---
src/test/CMakeLists.txt | 8 -----
src/test/tests.cpp | 1 -
5 files changed, 31 insertions(+), 30 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3e39dc56344..317629af74d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -197,8 +197,7 @@ find_package(Roaring REQUIRED)
#zstd
find_package(Zstd REQUIRED)
-#icu
-find_package(ICU REQUIRED)
+
#sse2neon
INCLUDE_DIRECTORIES(${CMAKE_SOURCE_DIR}/3rdparty/sse2neon)
diff --git a/src/core/CLucene/analysis/icu/DefaultICUTokenizerConfig.cpp b/src/core/CLucene/analysis/icu/DefaultICUTokenizerConfig.cpp
index b43536f033a..57e8374d804 100644
--- a/src/core/CLucene/analysis/icu/DefaultICUTokenizerConfig.cpp
+++ b/src/core/CLucene/analysis/icu/DefaultICUTokenizerConfig.cpp
@@ -5,6 +5,7 @@
#include <mutex>
#include <sstream>
#include <string>
+#include <atomic>
namespace lucene::analysis {
@@ -18,20 +19,35 @@ DefaultICUTokenizerConfig::DefaultICUTokenizerConfig(bool cjkAsWords, bool myanm
}
void DefaultICUTokenizerConfig::initialize(const std::string& dictPath) {
- static std::once_flag once_flag;
- std::call_once(once_flag, [&dictPath]() {
- UErrorCode status = U_ZERO_ERROR;
- cjkBreakIterator_.reset(
- icu::BreakIterator::createWordInstance(icu::Locale::getRoot(), status));
- if (U_FAILURE(status)) {
- std::string error_msg = "Failed to create CJK BreakIterator: ";
- error_msg += u_errorName(status);
- _CLTHROWT(CL_ERR_IllegalArgument, error_msg.c_str());
- }
+ static std::atomic<bool> initialized_(false);
+ if (!initialized_) {
+ static std::mutex mutex;
+ std::lock_guard<std::mutex> lock(mutex);
+
+ if (!initialized_) {
+ try {
+ UErrorCode status = U_ZERO_ERROR;
+ cjkBreakIterator_.reset(
+ icu::BreakIterator::createWordInstance(icu::Locale::getRoot(), status));
+ if (U_FAILURE(status)) {
+ std::string error_msg = "Failed to create CJK BreakIterator: ";
+ error_msg += u_errorName(status);
+ _CLTHROWT(CL_ERR_IllegalArgument, error_msg.c_str());
+ }
+
+ readBreakIterator(defaultBreakIterator_, dictPath + "/uax29/Default.txt");
+ readBreakIterator(myanmarSyllableIterator_,
+ dictPath + "/uax29/MyanmarSyllable.txt");
- readBreakIterator(defaultBreakIterator_, dictPath + "/uax29/Default.txt");
- readBreakIterator(myanmarSyllableIterator_, dictPath + "/uax29/MyanmarSyllable.txt");
- });
+ initialized_ = true;
+ } catch (...) {
+ cjkBreakIterator_.reset();
+ defaultBreakIterator_.reset();
+ myanmarSyllableIterator_.reset();
+ throw; // Clean up resources and rethrow the original exception to the caller
+ }
+ }
+ }
}
icu::BreakIterator* DefaultICUTokenizerConfig::getBreakIterator(int32_t script) {
diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt
index 7fd9cabc60b..0a19a2d278f 100644
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@@ -71,11 +71,6 @@ SET(clucene_core_Files
./CLucene/analysis/standard/StandardFilter.cpp
./CLucene/analysis/standard/StandardTokenizer.cpp
./CLucene/analysis/standard95/StandardTokenizerImpl.cpp
- ./CLucene/analysis/icu/BreakIteratorWrapper.cpp
- ./CLucene/analysis/icu/CompositeBreakIterator.cpp
- ./CLucene/analysis/icu/DefaultICUTokenizerConfig.cpp
- ./CLucene/analysis/icu/ICUTokenizer.cpp
- ./CLucene/analysis/icu/ScriptIterator.cpp
./CLucene/analysis/Analyzers.cpp
./CLucene/analysis/AnalysisHeader.cpp
./CLucene/store/MMapInput.cpp
@@ -256,7 +251,7 @@ IF (BUILD_STATIC_LIBRARIES)
TARGET_LINK_LIBRARIES(clucene-core-static ssl crypto ${BRPC_LIB}
${GLOG_LIB} ${GFLAG_LIB} ${PROTOBUF_LIB})
ENDIF (USE_BTHREAD)
TARGET_INCLUDE_DIRECTORIES(clucene-core-static PUBLIC ${Roaring_INCLUDE_DIR})
- TARGET_LINK_LIBRARIES(clucene-core-static PRIVATE zstd icu)
+ TARGET_LINK_LIBRARIES(clucene-core-static PRIVATE zstd)
SET_TARGET_PROPERTIES(clucene-core-static PROPERTIES
VERSION ${CLUCENE_VERSION}
diff --git a/src/test/CMakeLists.txt b/src/test/CMakeLists.txt
index f069adc0b2d..c284c5b7e0b 100644
--- a/src/test/CMakeLists.txt
+++ b/src/test/CMakeLists.txt
@@ -38,7 +38,6 @@ SET(test_files ./tests.cpp
./analysis/TestAnalysis.cpp
./analysis/TestAnalyzers.cpp
./analysis/TestStandard95.cpp
- ./analysis/TestICU.cpp
./debug/TestError.cpp
./document/TestDateTools.cpp
./document/TestDocument.cpp
@@ -294,13 +293,6 @@ IF (BUILD_STATIC_LIBRARIES)
COMMAND ${CMAKE_COMMAND} -E copy_directory ${DICT_SOURCE_DIR}
${DICT_TARGET_DIR}
COMMENT "Copying ${DATA_SOURCE_DIR} to ${DATA_TARGET_DIR}/dict")
- SET(ICU_DICT_SOURCE_DIR
${CMAKE_SOURCE_DIR}/src/core/CLucene/analysis/icu/data)
- SET(ICU_DICT_TARGET_DIR "${EXECUTABLE_OUTPUT_PATH}/icu-dict")
-
- ADD_CUSTOM_COMMAND(TARGET cl_test POST_BUILD
- COMMAND ${CMAKE_COMMAND} -E copy_directory ${ICU_DICT_SOURCE_DIR}
${ICU_DICT_TARGET_DIR}
- COMMENT "Copying ${DATA_SOURCE_DIR} to ${ICU_DICT_TARGET_DIR}")
-
ENDIF (BUILD_STATIC_LIBRARIES)
############################
diff --git a/src/test/tests.cpp b/src/test/tests.cpp
index 1c9b91444ff..f4a3609e57f 100644
--- a/src/test/tests.cpp
+++ b/src/test/tests.cpp
@@ -23,7 +23,6 @@ unittest tests[] = {
{"IndexCompressV3", testIndexCompressV3},
{"ByteArrayDataInput", testByteArrayDataInputSuite},
{"GrowableByteArrayDataOutput", testGrowableByteArrayDataOutputSuite},
- {"testICU", testICU},
#ifdef TEST_CONTRIB_LIBS
{"chinese", testchinese},
#endif
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]