This is an automated email from the ASF dual-hosted git repository.
jianliangqi pushed a commit to branch clucene
in repository https://gitbox.apache.org/repos/asf/doris-thirdparty.git
The following commit(s) were added to refs/heads/clucene by this push:
new 25324632ba [Fix](analyzer) add ownership flag to Field's TokenStream
value and Analyzer's Reader (#222)
25324632ba is described below
commit 25324632babc0e5da28048ebbe9adcbdfc73c281
Author: airborne12 <[email protected]>
AuthorDate: Wed Jun 12 10:37:05 2024 +0800
[Fix](analyzer) add ownership flag to Field's TokenStream value and
Analyzer's Reader (#222)
---
src/core/CLucene/analysis/AnalysisHeader.h | 16 +++++++++++++---
src/core/CLucene/analysis/Analyzers.cpp | 4 +++-
src/core/CLucene/analysis/Analyzers.h | 5 +++--
src/core/CLucene/document/Field.cpp | 8 ++++++--
src/core/CLucene/document/Field.h | 3 ++-
5 files changed, 27 insertions(+), 9 deletions(-)
diff --git a/src/core/CLucene/analysis/AnalysisHeader.h b/src/core/CLucene/analysis/AnalysisHeader.h
index fe10e396bc..e1528d2f94 100644
--- a/src/core/CLucene/analysis/AnalysisHeader.h
+++ b/src/core/CLucene/analysis/AnalysisHeader.h
@@ -10,6 +10,7 @@
#include "CLucene/index/Payload.h"
#include "CLucene/util/VoidList.h"
#include "CLucene/LuceneThreads.h"
+#include "CLucene/util/CLStreams.h"
#include <unordered_set>
@@ -304,6 +305,10 @@ public:
_stopwords = stopwords;
}
+ virtual void set_ownReader(bool ownReader) {
+ _ownReader = ownReader;
+ }
+
private:
DEFINE_MUTEX(THIS_LOCK)
@@ -322,6 +327,7 @@ protected:
virtual void setPreviousTokenStream(TokenStream* obj);
bool _lowercase = false;
+ bool _ownReader = false;
std::unordered_set<std::string_view>* _stopwords = nullptr;
public:
@@ -359,19 +365,23 @@ protected:
/** The text source for this Tokenizer. */
CL_NS(util)::Reader* input;
bool lowercase = false;
+ bool ownReader = false;
std::unordered_set<std::string_view>* stopwords = nullptr;
public:
/** Construct a tokenizer with null input. */
Tokenizer():input(nullptr){}
/** Construct a token stream processing the given input. */
- explicit Tokenizer(CL_NS(util)::Reader* _input):input(_input){}
+ explicit Tokenizer(CL_NS(util)::Reader* _input, bool _ownReader =
false):input(_input), ownReader(_ownReader){}
/** By default, closes the input Reader. */
virtual void close() {
if (input != NULL) {
- // ? delete input;
- input = NULL;
+ if (ownReader) {
+ _CLDELETE(input);
+ } else {
+ input = NULL;
+ }
}
};
diff --git a/src/core/CLucene/analysis/Analyzers.cpp b/src/core/CLucene/analysis/Analyzers.cpp
index 05a1c9e6f8..fde703e44a 100644
--- a/src/core/CLucene/analysis/Analyzers.cpp
+++ b/src/core/CLucene/analysis/Analyzers.cpp
@@ -51,11 +51,13 @@ template class LowerCaseTokenizer<TCHAR>;
template<typename T>
SimpleTokenizer<T>::SimpleTokenizer(CL_NS(util)::Reader *in) :
LowerCaseTokenizer<T>(in) {
Tokenizer::lowercase = true;
+ Tokenizer::ownReader = false;
}
template<typename T>
-SimpleTokenizer<T>::SimpleTokenizer(CL_NS(util)::Reader *in, bool lowercase) :
LowerCaseTokenizer<T>(in) {
+SimpleTokenizer<T>::SimpleTokenizer(CL_NS(util)::Reader *in, bool lowercase,
bool ownReader) : LowerCaseTokenizer<T>(in) {
Tokenizer::lowercase = lowercase;
+ Tokenizer::ownReader = ownReader;
}
template<typename T>
diff --git a/src/core/CLucene/analysis/Analyzers.h b/src/core/CLucene/analysis/Analyzers.h
index a06263cfcf..22231ef5e4 100644
--- a/src/core/CLucene/analysis/Analyzers.h
+++ b/src/core/CLucene/analysis/Analyzers.h
@@ -140,7 +140,7 @@ class CLUCENE_EXPORT SimpleTokenizer:public LowerCaseTokenizer<T> {
public:
/** Construct a new SimpleTokenizer. */
explicit SimpleTokenizer(CL_NS(util)::Reader* in);
- SimpleTokenizer(CL_NS(util)::Reader* in, bool lowercase);
+ SimpleTokenizer(CL_NS(util)::Reader* in, bool lowercase, bool ownReader =
false);
virtual ~SimpleTokenizer();
Token* next(Token* token) override {
@@ -182,12 +182,13 @@ class CLUCENE_EXPORT SimpleAnalyzer: public Analyzer {
public:
SimpleAnalyzer(){
_lowercase = true;
+ _ownReader = false;
}
bool isSDocOpt() override { return true; }
TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader*
reader) override{
- return _CLNEW SimpleTokenizer<T>(reader, _lowercase);
+ return _CLNEW SimpleTokenizer<T>(reader, _lowercase, _ownReader);
}
TokenStream* reusableTokenStream(const TCHAR* fieldName,
CL_NS(util)::Reader* reader) override{
if (tokenizer_ == nullptr) {
diff --git a/src/core/CLucene/document/Field.cpp b/src/core/CLucene/document/Field.cpp
index 13bdf54d3d..5ec73be7e0 100644
--- a/src/core/CLucene/document/Field.cpp
+++ b/src/core/CLucene/document/Field.cpp
@@ -176,7 +176,8 @@ void Field::setValue(ValueArray<uint8_t>* value) {
}
/** Expert: change the value of this field. See <a
href="#setValue(java.lang.String)">setValue(String)</a>. */
-void Field::setValue(CL_NS(analysis)::TokenStream* value) {
+void Field::setValue(CL_NS(analysis)::TokenStream* value, bool own_stream) {
+ ownStream = own_stream;
_resetValue();
fieldsData = value;
valueType = VALUE_TOKENSTREAM;
@@ -340,7 +341,10 @@ void Field::_resetValue() {
} else if (valueType & VALUE_BINARY) {
ValueArray<uint8_t>* v =
static_cast<ValueArray<uint8_t>*>(fieldsData);
_CLDELETE(v);
- }
+ } else if (valueType & VALUE_TOKENSTREAM && ownStream) {
+ auto* v = static_cast<CL_NS(analysis)::TokenStream*>(fieldsData);
+ _CLDELETE(v);
+ }
valueType=VALUE_NONE;
}
const char* Field::getObjectName() const{
diff --git a/src/core/CLucene/document/Field.h b/src/core/CLucene/document/Field.h
index 23c0ad17f5..eac8043999 100644
--- a/src/core/CLucene/document/Field.h
+++ b/src/core/CLucene/document/Field.h
@@ -305,7 +305,7 @@ public:
void setValue(CL_NS(util)::ValueArray<uint8_t>* value) ;
/** Expert: change the value of this field. See <a
href="#setValue(TCHAR*)">setValue(TCHAR*)</a>. */
- void setValue(CL_NS(analysis)::TokenStream* value);
+ void setValue(CL_NS(analysis)::TokenStream* value, bool own_stream =
false);
//void setValue(CL_NS(analysis)::STokenStream* value);
@@ -334,6 +334,7 @@ protected:
float_t boost;
IndexVersion indexVersion_ = IndexVersion::kV1;
+ bool ownStream = false;
};
CL_NS_END
#endif
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]