This is an automated email from the ASF dual-hosted git repository.
airborne pushed a commit to branch clucene
in repository https://gitbox.apache.org/repos/asf/doris-thirdparty.git
The following commit(s) were added to refs/heads/clucene by this push:
new 3aa17be9e4a [fix](unicode) fix 4 bytes unicode read and write bug (#255)
3aa17be9e4a is described below
commit 3aa17be9e4a496e7e8ddf9e114e56addf8b536c2
Author: airborne12 <[email protected]>
AuthorDate: Thu Nov 28 11:08:06 2024 +0800
[fix](unicode) fix 4 bytes unicode read and write bug (#255)
* [fix](unicode) fix 4 bytes unicode read and write bug
---
src/core/CLucene/store/IndexInput.cpp | 22 ++++++------
src/core/CLucene/store/IndexOutput.cpp | 62 ++++++++++++++++++++++------------
2 files changed, 51 insertions(+), 33 deletions(-)
diff --git a/src/core/CLucene/store/IndexInput.cpp b/src/core/CLucene/store/IndexInput.cpp
index 930b16392ae..82c5165e53c 100644
--- a/src/core/CLucene/store/IndexInput.cpp
+++ b/src/core/CLucene/store/IndexInput.cpp
@@ -135,23 +135,23 @@ CL_NS_USE(util)
for (int32_t i = start; i < end; ++i) {
b = readByte();
if ((b & 0x80) == 0) {
+ // 1-byte sequence: 0xxxxxxx
b = (b & 0x7F);
- } else if ((b & 0xE0) != 0xE0) {
- b = (((b & 0x1F) << 6)
- | (readByte() & 0x3F));
- } else {
- b = ((b & 0x0F) << 12) | ((readByte() & 0x3F) << 6);
- b |= (readByte() & 0x3F);
+ } else if ((b & 0xE0) == 0xC0) {
+ // 2-byte sequence: 110xxxxx 10xxxxxx
+ b = (((b & 0x1F) << 6) | (readByte() & 0x3F));
+ } else if ((b & 0xF0) == 0xE0) {
+ // 3-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx
+ b = ((b & 0x0F) << 12) | ((readByte() & 0x3F) << 6) | (readByte() & 0x3F);
+ } else if ((b & 0xF8) == 0xF0) {
+ // 4-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+ b = ((b & 0x07) << 18) | ((readByte() & 0x3F) << 12) |
+ ((readByte() & 0x3F) << 6) | (readByte() & 0x3F);
}
buffer[i] = b;
}
}
-
-
-
-
-
BufferedIndexInput::BufferedIndexInput(int32_t _bufferSize):
buffer(NULL),
bufferSize(_bufferSize>=0?_bufferSize:CL_NS(store)::BufferedIndexOutput::BUFFER_SIZE),
diff --git a/src/core/CLucene/store/IndexOutput.cpp b/src/core/CLucene/store/IndexOutput.cpp
index 77c37400d8e..f28ca07b3c3 100644
--- a/src/core/CLucene/store/IndexOutput.cpp
+++ b/src/core/CLucene/store/IndexOutput.cpp
@@ -165,16 +165,25 @@ CL_NS_DEF(store)
const int32_t end = length;
for (int32_t i = 0; i < end; ++i) {
- const int32_t code = (int32_t)s[i];
- if (code >= 0x01 && code <= 0x7F)
+ auto code = (uint32_t)s[i];
+ if (code >= 0x00 && code <= 0x7F) {
writeByte((uint8_t)code);
- else if (((code >= 0x80) && (code <= 0x7FF)) || code == 0) {
+ } else if (code <= 0x7FF) {
writeByte((uint8_t)(0xC0 | (code >> 6)));
writeByte((uint8_t)(0x80 | (code & 0x3F)));
- } else {
- writeByte((uint8_t)(0xE0 | (((uint32_t)code) >> 12))); //unsigned shift
+ } else if (code <= 0xFFFF) {
+ writeByte((uint8_t)(0xE0 | (code >> 12)));
+ writeByte((uint8_t)(0x80 | ((code >> 6) & 0x3F)));
+ writeByte((uint8_t)(0x80 | (code & 0x3F)));
+ } else if (code <= 0x10FFFF) {
+ writeByte((uint8_t)(0xF0 | (code >> 18)));
+ writeByte((uint8_t)(0x80 | ((code >> 12) & 0x3F)));
writeByte((uint8_t)(0x80 | ((code >> 6) & 0x3F)));
writeByte((uint8_t)(0x80 | (code & 0x3F)));
+ } else {
+ writeByte(0xEF);
+ writeByte(0xBF);
+ writeByte(0xBD);
}
}
}
@@ -188,23 +197,32 @@ CL_NS_DEF(store)
}
void IndexOutput::writeChars(const TCHAR* s, const int32_t length){
- if ( length < 0 )
- _CLTHROWA(CL_ERR_IllegalArgument, "IO Argument Error. Value must be a positive value.");
-
- const int32_t end = length;
- for (int32_t i = 0; i < end; ++i) {
- const int32_t code = (int32_t)s[i];
- if (code >= 0x01 && code <= 0x7F)
- writeByte((uint8_t)code);
- else if (((code >= 0x80) && (code <= 0x7FF)) || code == 0) {
- writeByte((uint8_t)(0xC0 | (code >> 6)));
- writeByte((uint8_t)(0x80 | (code & 0x3F)));
- } else {
- writeByte((uint8_t)(0xE0 | (((uint32_t)code) >> 12))); //unsigned shift
- writeByte((uint8_t)(0x80 | ((code >> 6) & 0x3F)));
- writeByte((uint8_t)(0x80 | (code & 0x3F)));
- }
- }
+ if ( length < 0 )
+ _CLTHROWA(CL_ERR_IllegalArgument, "IO Argument Error. Value must be a positive value.");
+
+ const int32_t end = length;
+ for (int32_t i = 0; i < end; ++i) {
+ auto code = (uint32_t)s[i];
+ if (code >= 0x00 && code <= 0x7F) {
+ writeByte((uint8_t)code);
+ } else if (code <= 0x7FF) {
+ writeByte((uint8_t)(0xC0 | (code >> 6)));
+ writeByte((uint8_t)(0x80 | (code & 0x3F)));
+ } else if (code <= 0xFFFF) {
+ writeByte((uint8_t)(0xE0 | (code >> 12)));
+ writeByte((uint8_t)(0x80 | ((code >> 6) & 0x3F)));
+ writeByte((uint8_t)(0x80 | (code & 0x3F)));
+ } else if (code <= 0x10FFFF) {
+ writeByte((uint8_t)(0xF0 | (code >> 18)));
+ writeByte((uint8_t)(0x80 | ((code >> 12) & 0x3F)));
+ writeByte((uint8_t)(0x80 | ((code >> 6) & 0x3F)));
+ writeByte((uint8_t)(0x80 | (code & 0x3F)));
+ } else {
+ writeByte(0xEF);
+ writeByte(0xBF);
+ writeByte(0xBD);
+ }
+ }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]