This is an automated email from the ASF dual-hosted git repository.
kxiao pushed a commit to branch clucene
in repository https://gitbox.apache.org/repos/asf/doris-thirdparty.git
The following commit(s) were added to refs/heads/clucene by this push:
new 4caf1086 [improvement](keyword) keyword type uses the SDocument process (#97)
4caf1086 is described below
commit 4caf10866a7a35358d19e3831298c4a6b29d62a8
Author: zzzxl <[email protected]>
AuthorDate: Thu Jul 6 13:09:07 2023 +0800
[improvement](keyword) keyword type uses the SDocument process (#97)
---
src/core/CLucene/index/SDocumentWriter.cpp | 6 +--
src/core/CLucene/index/TermInfosWriter.cpp | 69 ++++++++++++++++++++++++------
src/core/CLucene/store/IndexOutput.cpp | 14 ++++++
src/core/CLucene/store/IndexOutput.h | 2 +
src/core/CLucene/util/stringUtil.h | 20 +++++++++
5 files changed, 96 insertions(+), 15 deletions(-)
diff --git a/src/core/CLucene/index/SDocumentWriter.cpp b/src/core/CLucene/index/SDocumentWriter.cpp
index 3b22fdad..33da7a67 100644
--- a/src/core/CLucene/index/SDocumentWriter.cpp
+++ b/src/core/CLucene/index/SDocumentWriter.cpp
@@ -739,8 +739,8 @@ int32_t SDocumentsWriter<T>::ThreadState::comparePostings(Posting *p1, Posting *
const T *pos1 = scharPool->buffers[p1->textStart >> CHAR_BLOCK_SHIFT] +
(p1->textStart & CHAR_BLOCK_MASK);
const T *pos2 = scharPool->buffers[p2->textStart >> CHAR_BLOCK_SHIFT] +
(p2->textStart & CHAR_BLOCK_MASK);
while (true) {
- const T c1 = *pos1++;
- const T c2 = *pos2++;
+ const auto c1 = static_cast<typename
std::make_unsigned<T>::type>(*pos1++);
+ const auto c2 = static_cast<typename
std::make_unsigned<T>::type>(*pos2++);
if (c1 < c2)
if (CLUCENE_END_OF_WORD == c2)
return 1;
@@ -753,8 +753,8 @@ int32_t SDocumentsWriter<T>::ThreadState::comparePostings(Posting *p1, Posting *
return 1;
else if (CLUCENE_END_OF_WORD == c1)
return 0;
+ }
}
-}
template<typename T>
void SDocumentsWriter<T>::ThreadState::quickSort(Posting **postings, int32_t
lo, int32_t hi) {
diff --git a/src/core/CLucene/index/TermInfosWriter.cpp b/src/core/CLucene/index/TermInfosWriter.cpp
index 6b9060ec..6a457407 100644
--- a/src/core/CLucene/index/TermInfosWriter.cpp
+++ b/src/core/CLucene/index/TermInfosWriter.cpp
@@ -15,6 +15,7 @@
#include "_FieldInfos.h"
#include "_TermInfosWriter.h"
#include <assert.h>
+#include <iostream>
CL_NS_USE(util)
CL_NS_USE(store)
@@ -176,20 +177,64 @@ void STermInfosWriter<T>::close() {
template <typename T>
void STermInfosWriter<T>::writeTerm(int32_t fieldNumber, const T *termText,
int32_t termTextLength) {
- int32_t start = 0;
- const int32_t limit = termTextLength < lastTermTextLength ? termTextLength
: lastTermTextLength;
- while (start < limit) {
- if (termText[start] != lastTermText.values[start])
- break;
- start++;
- }
+ if constexpr (std::is_same_v<T, char>) {
+ std::string_view utf8Str(termText, termTextLength);
+ int32_t utf8Length = 0;
+ {
+ size_t i = 0;
+ for (; i < utf8Str.size();) {
+ int32_t n = StringUtil::utf8_byte_count(utf8Str[i]);
+ i += n;
+ utf8Length++;
+ }
+ assert(i == utf8Str.size());
+ }
- int32_t length = termTextLength - start;
+ int32_t start = 0;
+ int32_t utf8Start = 0;
+ int32_t limit = termTextLength < lastTermTextLength ? termTextLength :
lastTermTextLength;
+ auto prefixCompare = [this, &utf8Str, &termText](int32_t& start,
int32_t& utf8Start, int32_t limit) {
+ while (start < limit) {
+ int32_t n = StringUtil::utf8_byte_count(utf8Str[start]);
+ for (int32_t j = 0; j < n; j++) {
+ int32_t cur = start + j;
+ if (termText[cur] != lastTermText.values[cur]) {
+ return;
+ }
+ }
+ start += n;
+ utf8Start++;
+ }
+ };
+
+ prefixCompare(start, utf8Start, limit);
+ assert(start <= termTextLength);
+ assert(utf8Start <= utf8Length);
+ int32_t length = termTextLength - start;
+ utf8Length -= utf8Start;
+
+ // std::cout << "term: " << utf8Str << ", utf8Start: " << utf8Start <<
", utf8Length: " << utf8Length << ", length: " << length << std::endl;
+
+ output->writeVInt(utf8Start);
+ output->writeVInt(utf8Length);
+ output->writeU8SChars(termText + start, length);
+ output->writeVInt(fieldNumber);
+ } else {
+ int32_t start = 0;
+ const int32_t limit = termTextLength < lastTermTextLength ?
termTextLength : lastTermTextLength;
+ while (start < limit) {
+ if (termText[start] != lastTermText.values[start])
+ break;
+ start++;
+ }
- output->writeVInt(start); // write shared prefix length
- output->writeVInt(length); // write delta length
- output->writeSChars(termText + start, length);// write delta chars
- output->writeVInt(fieldNumber); // write field num
+ int32_t length = termTextLength - start;
+
+ output->writeVInt(start); // write shared prefix
length
+ output->writeVInt(length); // write delta length
+ output->writeSChars(termText + start, length);// write delta chars
+ output->writeVInt(fieldNumber); // write field num
+ }
}
template class STermInfosWriter<char>;
diff --git a/src/core/CLucene/store/IndexOutput.cpp b/src/core/CLucene/store/IndexOutput.cpp
index 92fd4d9c..1d44aff1 100644
--- a/src/core/CLucene/store/IndexOutput.cpp
+++ b/src/core/CLucene/store/IndexOutput.cpp
@@ -8,6 +8,7 @@
#include "IndexOutput.h"
#include "IndexInput.h"
#include "CLucene/util/Misc.h"
+#include "CLucene/util/stringUtil.h"
CL_NS_USE(util)
CL_NS_DEF(store)
@@ -185,6 +186,19 @@ CL_NS_DEF(store)
writeBytes((const uint8_t*)s, length);
}
+ void IndexOutput::writeU8SChars(const char* s, const int32_t length) {
+ if ( length < 0 )
+ _CLTHROWA(CL_ERR_IllegalArgument, "IO Argument Error. Value must be a
positive value.");
+
+ for (int32_t i = 0; i < length;) {
+ auto* chars = (const uint8_t*)s + i;
+ int32_t n = StringUtil::utf8_byte_count(*chars);
+ assert(n >= 1 && n <= 4);
+ writeBytes(chars, (n > 2 ? 3 : n));
+ i += n;
+ }
+ }
+
void IndexOutput::writeChars(const TCHAR* s, const int32_t length){
if ( length < 0 )
_CLTHROWA(CL_ERR_IllegalArgument, "IO Argument Error. Value must be a
positive value.");
diff --git a/src/core/CLucene/store/IndexOutput.h b/src/core/CLucene/store/IndexOutput.h
index 6b6ca321..e8eff025 100644
--- a/src/core/CLucene/store/IndexOutput.h
+++ b/src/core/CLucene/store/IndexOutput.h
@@ -83,6 +83,8 @@ public:
void writeChars(const TCHAR* s, const int32_t length);
template<typename T>
void writeSChars(const T* s, int32_t length);
+
+ void writeU8SChars(const char* s, const int32_t length);
/** Closes this stream to further operations. */
virtual void close() = 0;
diff --git a/src/core/CLucene/util/stringUtil.h b/src/core/CLucene/util/stringUtil.h
index da0547d6..5a715445 100644
--- a/src/core/CLucene/util/stringUtil.h
+++ b/src/core/CLucene/util/stringUtil.h
@@ -203,6 +203,26 @@ public:
}
#endif
+
+ static inline int32_t utf8_byte_count(uint8_t c) {
+ static constexpr int32_t LUT[256] = {
+ 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, -1, -1, -1, -1, -1, -1,
-1};
+ return LUT[c];
+ }
};
#endif//_lucene_util__stringutil_H
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]