This is an automated email from the ASF dual-hosted git repository.
jianliangqi pushed a commit to branch clucene
in repository https://gitbox.apache.org/repos/asf/doris-thirdparty.git
The following commit(s) were added to refs/heads/clucene by this push:
new 04ed43c3 [optimize](reader) optimize the tii, tis file structure (#146)
04ed43c3 is described below
commit 04ed43c3c70f2c976e95260b07f08b197e1b40ae
Author: zzzxl <[email protected]>
AuthorDate: Thu Dec 7 20:30:35 2023 +0800
[optimize](reader) optimize the tii, tis file structure (#146)
---
src/core/CLucene/index/SegmentTermEnum.cpp | 85 +++++++++++++++++++++---------
src/core/CLucene/index/TermInfosReader.cpp | 27 ++++++----
src/core/CLucene/index/TermInfosWriter.cpp | 15 ++++--
src/core/CLucene/index/_SegmentTermEnum.h | 15 ++++--
src/core/CLucene/index/_TermInfosWriter.h | 7 +--
src/core/CLucene/store/IndexInput.h | 4 +-
6 files changed, 102 insertions(+), 51 deletions(-)
diff --git a/src/core/CLucene/index/SegmentTermEnum.cpp b/src/core/CLucene/index/SegmentTermEnum.cpp
index 1383451c..574d9396 100644
--- a/src/core/CLucene/index/SegmentTermEnum.cpp
+++ b/src/core/CLucene/index/SegmentTermEnum.cpp
@@ -17,7 +17,7 @@
CL_NS_USE(store)
CL_NS_DEF(index)
- SegmentTermEnum::SegmentTermEnum(IndexInput* i, FieldInfos* fis, const
bool isi):
+ SegmentTermEnum::SegmentTermEnum(IndexInput* i, FieldInfos* fis, const
bool isi, int32_t in_format):
fieldInfos(fis){
//Func - Constructor
//Pre - i holds a reference to an instance of IndexInput
@@ -40,8 +40,8 @@ CL_NS_DEF(index)
//Set isClone to false as the instance is not clone of another
instance
isClone = false;
+ int32_t firstInt = in_format == -4 ? in_format :
input->readInt();
- int32_t firstInt = input->readInt();
if (firstInt >= 0) {
// original-format file, without explicit format version number
format = 0;
@@ -62,30 +62,47 @@ CL_NS_DEF(index)
_CLTHROWT(CL_ERR_CorruptIndex,err);
}
- size = input->readLong(); // read the size
- if (size < 0) { // read the size at file
footer, if size < 0
- auto pos = input->getFilePointer();
- input->seek(input->length() - 8);
- size = input->readLong();
- input->seek(pos);
- }
-
- if(format == -1){
- if (!isIndex) {
- indexInterval = input->readInt();
- formatM1SkipInterval = input->readInt();
- }
- // switch off skipTo optimization for file format prior to 1.4rc2
in order to avoid a bug in
- // skipTo implementation of these versions
- skipInterval = LUCENE_INT32_MAX_SHOULDBE;
- }else{
- indexInterval = input->readInt();
- skipInterval = input->readInt();
- if ( format == -3 ) {
- // this new format introduces multi-level skipping
- maxSkipLevels = input->readInt();
- }
- }
+ if (format == -4) {
+ if (isIndex) {
+ size =
input->readLong();
+ if (size < 0) {
+ auto pos =
input->getFilePointer();
+
input->seek(input->length() - 16);
+ size =
input->readLong();
+ tisSize =
input->readLong();
+
input->seek(pos);
+ }
+
+ indexInterval =
input->readInt();
+ skipInterval =
input->readInt();
+ maxSkipLevels =
input->readInt();
+ }
+ } else {
+ size = input->readLong();
// read the size
+ if (size < 0) {
// read the size at file footer, if size < 0
+ auto pos =
input->getFilePointer();
+
input->seek(input->length() - 8);
+ size =
input->readLong();
+
input->seek(pos);
+ }
+
+ if(format == -1){
+ if (!isIndex) {
+
indexInterval = input->readInt();
+
formatM1SkipInterval = input->readInt();
+ }
+ // switch off
skipTo optimization for file format prior to 1.4rc2 in order to avoid a bug in
+ // skipTo
implementation of these versions
+ skipInterval =
LUCENE_INT32_MAX_SHOULDBE;
+ }else{
+ indexInterval =
input->readInt();
+ skipInterval =
input->readInt();
+ if ( format ==
-3 ) {
+ // this
new format introduces multi-level skipping
+
maxSkipLevels = input->readInt();
+ }
+ }
+ }
}
}
@@ -113,6 +130,7 @@ CL_NS_DEF(index)
bufferLength = clone.bufferLength;
prev = clone.prev==NULL?NULL:_CLNEW
Term(clone.prev->field(),clone.prev->text(),false);
size = clone.size;
+ tisSize = clone.tisSize;
format = clone.format;
indexInterval= clone.indexInterval;
@@ -156,6 +174,21 @@ CL_NS_DEF(index)
}
}
+ void SegmentTermEnum::initByTii(SegmentTermEnum* tii) {
+ if (format == -4) {
+ size = tii->tisSize;
+ indexInterval = tii->indexInterval;
+ skipInterval = tii->skipInterval;
+ maxSkipLevels = tii->maxSkipLevels;
+ size_t header = sizeof(format) +
+
sizeof(size) +
+
sizeof(indexInterval) +
+
sizeof(skipInterval) +
+
sizeof(maxSkipLevels);
+ input->seek(header);
+ }
+ }
+
const char* SegmentTermEnum::getObjectName() const{ return
getClassName(); }
const char* SegmentTermEnum::getClassName(){ return "SegmentTermEnum"; }
diff --git a/src/core/CLucene/index/TermInfosReader.cpp b/src/core/CLucene/index/TermInfosReader.cpp
index b28bb7ee..7996d4d4 100644
--- a/src/core/CLucene/index/TermInfosReader.cpp
+++ b/src/core/CLucene/index/TermInfosReader.cpp
@@ -50,20 +50,25 @@ CL_NS_DEF(index)
indexIsRead = false;
try {
- //Create an SegmentTermEnum for storing all the terms read of
the segment
- origEnum = _CLNEW SegmentTermEnum( directory->openInput(
tisFile.c_str(), readBufferSize ), fieldInfos, false);
- _size = origEnum->size;
- totalIndexInterval = origEnum->indexInterval;
- indexEnum = _CLNEW SegmentTermEnum( directory->openInput(
tiiFile.c_str(), readBufferSize ), fieldInfos, true);
+ //Create an SegmentTermEnum for storing all the terms read of the segment
- //Check if enumerator points to a valid instance
- CND_CONDITION(origEnum != NULL, "No memory could be allocated
for orig enumerator");
- CND_CONDITION(indexEnum != NULL, "No memory could be
allocated for index enumerator");
+ // tii
+ auto tiiStream = directory->openInput( tiiFile.c_str(), readBufferSize );
+ indexEnum = _CLNEW SegmentTermEnum(tiiStream, fieldInfos, true, -1);
+ CND_CONDITION(indexEnum != NULL, "No memory could be allocated for index
enumerator");
- //call ensureIndexIsRead to load data to memory right now
- ensureIndexIsRead();
+ // tis
+ auto tisStream = directory->openInput( tisFile.c_str(), readBufferSize );
+ origEnum = _CLNEW SegmentTermEnum(tisStream, fieldInfos, false,
indexEnum->getFormat());
+ origEnum->initByTii(indexEnum);
+ CND_CONDITION(origEnum != NULL, "No memory could be allocated for index
enumerator");
+ _size = origEnum->size;
+ totalIndexInterval = origEnum->indexInterval;
- success = true;
+ //call ensureIndexIsRead to load data to memory right now
+ ensureIndexIsRead();
+
+ success = true;
} _CLFINALLY({
// With lock-less commits, it's entirely possible (and
// fine) to hit a FileNotFound exception above. In
diff --git a/src/core/CLucene/index/TermInfosWriter.cpp b/src/core/CLucene/index/TermInfosWriter.cpp
index 32b6a100..b6a45d7d 100644
--- a/src/core/CLucene/index/TermInfosWriter.cpp
+++ b/src/core/CLucene/index/TermInfosWriter.cpp
@@ -159,9 +159,18 @@ void STermInfosWriter<T>::add(int32_t fieldNumber, const T
*termText, int32_t te
template <typename T>
void STermInfosWriter<T>::close() {
if (output) {
- //write size at start
- //output->seek(4); // write size after format
- output->writeLong(size);// do not seek now, directly write size at
file footer
+ if (FORMAT == -4) {
+ output->writeLong(size);
+ if (!isIndex) {
+ other->tisSize = size;
+ } else {
+ output->writeLong(tisSize);
+ }
+ } else {
+ //write size at start
+ //output->seek(4); // write size after format
+ output->writeLong(size);// do not seek now, directly write size at
file footer
+ }
output->close();
_CLDELETE(output);
diff --git a/src/core/CLucene/index/_SegmentTermEnum.h b/src/core/CLucene/index/_SegmentTermEnum.h
index a2559082..b5fa419d 100644
--- a/src/core/CLucene/index/_SegmentTermEnum.h
+++ b/src/core/CLucene/index/_SegmentTermEnum.h
@@ -34,13 +34,14 @@ private:
CL_NS(store)::IndexInput* input; ///The IndexInput that reads from
the Term Infos File
FieldInfos* fieldInfos; ///contains the Field Infos for the segment
- int64_t size; ///The size of the enumeration
+ int64_t size = 0; ///The size of the enumeration
+ int64_t tisSize = 0;
int64_t position; ///The position of the current (term)
in the enumeration
int64_t indexPointer;
Term* prev; ///The previous current
- int32_t indexInterval;
- int32_t skipInterval;
- int32_t maxSkipLevels;
+ int32_t indexInterval = 0;
+ int32_t skipInterval = 0;
+ int32_t maxSkipLevels = 0;
friend class TermInfosReader;
friend class SegmentTermDocs;
@@ -54,11 +55,13 @@ protected:
public:
///Constructor
- SegmentTermEnum(CL_NS(store)::IndexInput* i, FieldInfos* fis, const
bool isi );
+ SegmentTermEnum(CL_NS(store)::IndexInput* i, FieldInfos* fis, const
bool isi, int32_t in_format = -1);
///Destructor
~SegmentTermEnum();
+ void initByTii(SegmentTermEnum* tii);
+
/**
* Moves the current of the set to the next in the set
*/
@@ -117,6 +120,8 @@ public:
const char* getObjectName() const;
static const char* getClassName();
+ int32_t getFormat() { return format; }
+
private:
/**
* Reads the next term in the enumeration
diff --git a/src/core/CLucene/index/_TermInfosWriter.h b/src/core/CLucene/index/_TermInfosWriter.h
index 2bd7713a..3acc2abe 100644
--- a/src/core/CLucene/index/_TermInfosWriter.h
+++ b/src/core/CLucene/index/_TermInfosWriter.h
@@ -24,7 +24,8 @@ private:
FieldInfos *fieldInfos;
CL_NS(store)::IndexOutput *output;
TermInfo *lastTi;
- int64_t size;
+ int64_t size = 0;
+ int64_t tisSize = 0;
int64_t lastIndexPointer;
bool isIndex;
@@ -44,7 +45,7 @@ private:
public:
int32_t maxSkipLevels;
- LUCENE_STATIC_CONSTANT(int32_t, FORMAT = -3);
+ LUCENE_STATIC_CONSTANT(int32_t, FORMAT = -4);
LUCENE_STATIC_CONSTANT(int32_t, DEFAULT_TERMDOCS_SKIP_INTERVAL =
PFOR_BLOCK_SIZE);
int32_t indexInterval;// = 128
@@ -96,7 +97,7 @@ public:
int32_t maxSkipLevels;
/** The file format version, a negative number. */
- LUCENE_STATIC_CONSTANT(int32_t, FORMAT = -3);
+ LUCENE_STATIC_CONSTANT(int32_t, FORMAT = -4);
//Expert: The fraction of {@link TermDocs} entries stored in skip tables,
//used to accellerate {@link TermDocs#skipTo(int)}. Larger values result
in
diff --git a/src/core/CLucene/store/IndexInput.h b/src/core/CLucene/store/IndexInput.h
index 4d5b24d8..e17f9eb3 100644
--- a/src/core/CLucene/store/IndexInput.h
+++ b/src/core/CLucene/store/IndexInput.h
@@ -132,9 +132,7 @@ CL_NS_DEF(store)
virtual const char* getObjectName() const = 0;
short readShort();
- virtual void setIdxFileCache(bool index) {
-
_CLTHROWA(CL_ERR_UnsupportedOperation,"UnsupportedOperationException:
IndexInput::setIdxFileCache");
- }
+ virtual void setIdxFileCache(bool index) {}
};
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]