This is an automated email from the ASF dual-hosted git repository.
gangwu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/orc.git
The following commit(s) were added to refs/heads/master by this push:
new 183d49f ORC-773: [C++] BinaryColumnWriter updates the BloomFilter
183d49f is described below
commit 183d49f55c5e25c35507c4f3dedf3c17be9c1c4e
Author: noirello <[email protected]>
AuthorDate: Mon Mar 29 05:41:31 2021 +0200
ORC-773: [C++] BinaryColumnWriter updates the BloomFilter
This fixes #669
---
c++/src/ColumnWriter.cc | 3 +++
c++/test/TestWriter.cc | 21 ++++++++++++++++-----
2 files changed, 19 insertions(+), 5 deletions(-)
diff --git a/c++/src/ColumnWriter.cc b/c++/src/ColumnWriter.cc
index a7e6485..5af4922 100644
--- a/c++/src/ColumnWriter.cc
+++ b/c++/src/ColumnWriter.cc
@@ -1683,6 +1683,9 @@ namespace orc {
if (!notNull || notNull[i]) {
directDataStream->write(data[i], unsignedLength);
+ if (enableBloomFilter) {
+ bloomFilter->addBytes(data[i], length[i]);
+ }
binStats->update(unsignedLength);
++count;
}
diff --git a/c++/test/TestWriter.cc b/c++/test/TestWriter.cc
index 67936de..3e63b91 100644
--- a/c++/test/TestWriter.cc
+++ b/c++/test/TestWriter.cc
@@ -1666,13 +1666,13 @@ namespace orc {
.setMemoryPool(getDefaultPool())
.setRowIndexStride(10000)
.setFileVersion(fileVersion)
- .setColumnsUseBloomFilter({1, 2});
+ .setColumnsUseBloomFilter({1, 2, 3});
// write 65535 rows of data
MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE);
MemoryPool * pool = getDefaultPool();
std::unique_ptr<Type> type(Type::buildTypeFromString(
- "struct<c1:bigint,c2:string>"));
+ "struct<c1:bigint,c2:string,c3:binary>"));
char dataBuffer[327675]; // 300k
uint64_t offset = 0;
@@ -1683,6 +1683,7 @@ namespace orc {
StructVectorBatch& structBatch = dynamic_cast<StructVectorBatch&>(*batch);
LongVectorBatch& longBatch =
dynamic_cast<LongVectorBatch&>(*structBatch.fields[0]);
StringVectorBatch& strBatch =
dynamic_cast<StringVectorBatch&>(*structBatch.fields[1]);
+ StringVectorBatch& binBatch = dynamic_cast<StringVectorBatch&>(*structBatch.fields[2]);
for (uint64_t i = 0; i < rowCount; ++i) {
// each row group has a unique value
@@ -1697,12 +1698,18 @@ namespace orc {
strBatch.data[i] = dataBuffer + offset;
strBatch.length[i] = static_cast<int64_t>(os.str().size());
memcpy(dataBuffer + offset, os.str().c_str(), os.str().size());
+
+ // c3
+ binBatch.data[i] = dataBuffer + offset;
+ binBatch.length[i] = static_cast<int64_t>(os.str().size());
+ memcpy(dataBuffer + offset, os.str().c_str(), os.str().size());
offset += os.str().size();
}
structBatch.numElements = rowCount;
longBatch.numElements = rowCount;
strBatch.numElements = rowCount;
+ binBatch.numElements = rowCount;
writer->add(*batch);
writer->close();
@@ -1712,14 +1719,16 @@ namespace orc {
std::unique_ptr<Reader> reader = createReader(pool, std::move(inStream));
EXPECT_EQ(rowCount, reader->getNumberOfRows());
- EXPECT_EQ(2, reader->getBloomFilters(0, {}).size());
+ EXPECT_EQ(3, reader->getBloomFilters(0, {}).size());
EXPECT_EQ(1, reader->getBloomFilters(0, {1}).size());
EXPECT_EQ(1, reader->getBloomFilters(0, {2}).size());
+ EXPECT_EQ(1, reader->getBloomFilters(0, {3}).size());
- std::map<uint32_t, BloomFilterIndex> bfs = reader->getBloomFilters(0, {1, 2});
- EXPECT_EQ(2, bfs.size());
+ std::map<uint32_t, BloomFilterIndex> bfs = reader->getBloomFilters(0, {1, 2, 3});
+ EXPECT_EQ(3, bfs.size());
EXPECT_EQ(7, bfs[1].entries.size());
EXPECT_EQ(7, bfs[2].entries.size());
+ EXPECT_EQ(7, bfs[3].entries.size());
// test bloomfilters
for (uint64_t rg = 0; rg <= rowCount / options.getRowIndexStride(); ++rg) {
@@ -1728,9 +1737,11 @@ namespace orc {
if (value == rg) {
EXPECT_TRUE(bfs[1].entries[rg]->testLong(static_cast<int64_t>(value)));
EXPECT_TRUE(bfs[2].entries[rg]->testBytes(str.c_str(), static_cast<int64_t>(str.size())));
+ EXPECT_TRUE(bfs[3].entries[rg]->testBytes(str.c_str(), static_cast<int64_t>(str.size())));
} else {
EXPECT_FALSE(bfs[1].entries[rg]->testLong(static_cast<int64_t>(value)));
EXPECT_FALSE(bfs[2].entries[rg]->testBytes(str.c_str(), static_cast<int64_t>(str.size())));
+ EXPECT_FALSE(bfs[3].entries[rg]->testBytes(str.c_str(), static_cast<int64_t>(str.size())));
}
}
}