xiaokang commented on code in PR #245:
URL: https://github.com/apache/doris-thirdparty/pull/245#discussion_r1827420549
##########
src/core/CLucene/index/FieldConfig.h:
##########
@@ -0,0 +1,11 @@
+#pragma once
+
+#include <cstdint>
+
+enum class FlagBits : uint32_t {
+ DICT_COMPRESS = 1 << 0, // 00000000 00000000 00000000 00000001
+};
+
+static bool isFlagSet(uint32_t flags, FlagBits flag) {
+ return flags & static_cast<uint32_t>(flag);
Review Comment:
Why static_cast since flags is already uint32_t?
##########
src/core/CLucene/index/IndexWriter.h:
##########
@@ -291,7 +293,7 @@ class CLUCENE_EXPORT IndexWriter:LUCENE_BASE {
// IndexOutput to the new Prox File
std::vector<CL_NS(store)::IndexOutput*> proxOutputList;
std::vector<int64_t> proxPointers;
- std::vector<TermInfosWriter*> termInfosWriterList;
+ std::vector<STermInfosWriter<char>*> termInfosWriterList;
Review Comment:
Why change it?
##########
src/core/CLucene/index/FieldInfos.cpp:
##########
@@ -114,6 +118,17 @@ IndexVersion FieldInfos::getIndexVersion() {
return IndexVersion::kV0;
}
+uint32_t FieldInfos::getFlags() {
+ size_t numFields = byNumber.size();
+ for (int i = 0; i < numFields; i++) {
+ FieldInfo* fi = fieldInfo(i);
+ if (fi->getFlags() > 0) {
+ return fi->getFlags();
Review Comment:
Is it right to just return the first non-zero flag?
##########
src/core/CLucene/index/TermInfosWriter.cpp:
##########
@@ -122,19 +134,32 @@ void STermInfosWriter<T>::add(int32_t fieldNumber, const
T *termText, int32_t te
CND_PRECONDITION(ti->proxPointer >= lastTi->proxPointer, ("proxPointer out
of order (" + Misc::toString(ti->proxPointer) + " < " +
Misc::toString(lastTi->proxPointer) + ")").c_str());
if (!isIndex && size % indexInterval == 0) {
+ if (isDictCompress_) {
+ tisMemoryOutput_.writeCompressedTo(output);
+ }
//add an index term
other->add(lastFieldNumber, lastTermText.values, lastTermTextLength,
lastTi);// add an index term
}
- //write term
- writeTerm(fieldNumber, termText, termTextLength);
- // write doc freq
- output->writeVInt(ti->docFreq);
- //write pointers
- output->writeVLong(ti->freqPointer - lastTi->freqPointer);
- output->writeVLong(ti->proxPointer - lastTi->proxPointer);
- if (ti->docFreq >= skipInterval) {
- output->writeVInt(ti->skipOffset);
+ if (isDictCompress_ && !isIndex) {
Review Comment:
add comment for different branches.
##########
src/core/CLucene/index/FieldInfos.cpp:
##########
@@ -257,6 +274,9 @@ void FieldInfos::write(IndexOutput* output) const{
if (fi->getIndexVersion() > IndexVersion::kV0) {
output->writeVInt(static_cast<int32_t>(fi->getIndexVersion()));
}
+ if (fi->getIndexVersion() >= IndexVersion::kV3) {
Review Comment:
add test for index version compatibility.
##########
src/core/CLucene/index/IndexWriter.cpp:
##########
@@ -1879,7 +1884,8 @@ void IndexWriter::mergeTerms(bool hasProx, IndexVersion
indexVersion) {
TermInfo termInfo;
termInfo.set(dfs[i], freqPointer, proxPointer, (int32_t)
(skipPointer - freqPointer));
// Write a new TermInfo
- termInfosWriter->add(smallestTerm, &termInfo);
+ std::string cur_term =
lucene_wcstoutf8string(smallestTerm->text(), smallestTerm->textLength());
+ termInfosWriter->add(smallestTerm->field(), cur_term.data(),
cur_term.length(), &termInfo);
Review Comment:
What's the effect of the change here and why change it?
##########
src/core/CLucene/store/v2/GrowableByteArrayDataOutput.h:
##########
@@ -0,0 +1,108 @@
+#pragma once
+
+#include <cstdint>
+#include <string_view>
+#include <vector>
+#include <zstd.h>
+#include <iostream>
+
+#include "CLucene.h"
+#include "CLucene/store/IndexOutput.h"
+
+namespace v2 {
Review Comment:
The namespace name v2 is too simple and not precise.
##########
src/core/CLucene/document/Field.h:
##########
@@ -315,6 +316,24 @@ class CLUCENE_EXPORT Field : public
CL_NS(util)::NamedObject{
void setIndexVersion(IndexVersion indexVersion) { indexVersion_ =
indexVersion; }
IndexVersion getIndexVersion() { return indexVersion_; }
+ void updateFlag(FlagBits flag) {
+ if (indexVersion_ < IndexVersion::kV3) {
+ return;
+ }
+ flags_ |= static_cast<uint32_t>(flag);
+ }
Review Comment:
use consistent indent char: tab
##########
src/core/CLucene/index/_SegmentTermEnum.h:
##########
@@ -67,6 +68,7 @@ class SegmentTermEnum:public TermEnum{
* Moves the current of the set to the next in the set
*/
bool next();
+ bool _next(CL_NS(store)::IndexInput* in);
Review Comment:
make it private.
##########
src/core/CLucene/index/IndexWriter.cpp:
##########
@@ -1879,7 +1884,8 @@ void IndexWriter::mergeTerms(bool hasProx, IndexVersion
indexVersion) {
TermInfo termInfo;
termInfo.set(dfs[i], freqPointer, proxPointer, (int32_t)
(skipPointer - freqPointer));
// Write a new TermInfo
- termInfosWriter->add(smallestTerm, &termInfo);
+ std::string cur_term =
lucene_wcstoutf8string(smallestTerm->text(), smallestTerm->textLength());
+ termInfosWriter->add(smallestTerm->field(), cur_term.data(),
cur_term.length(), &termInfo);
Review Comment:
I dig into your newly added function add() but can not find the difference.
##########
src/core/CLucene/index/IndexVersion.h:
##########
@@ -1,9 +1,10 @@
#pragma once
enum class IndexVersion {
- kV0 = 0,
- kV1 = 1,
- kV2 = 2,
+ kV0 = 0,
+ kV1 = 1,
+ kV2 = 2,
+ kV3 = 3,
Review Comment:
add comment for each version
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]