dongjoon-hyun commented on a change in pull request #934: URL: https://github.com/apache/orc/pull/934#discussion_r727798871
########## File path: c++/test/TestReader.cc ########## @@ -101,4 +101,41 @@ namespace orc { 900, rowsInCurrentStripe, rowIndexStride, includedRowGroups)); } + void CheckFileWithSargs(const char* fileName, const char* softwareVersion) { + std::stringstream ss; + if(const char* example_dir = std::getenv("ORC_EXAMPLE_DIR")) { + ss << example_dir; + } else { + ss << "../../../examples"; + } + // Read a file with bloom filters written by CPP writer in version 1.6.11. + ss << "/" << fileName; + ReaderOptions readerOpts; + std::unique_ptr<Reader> reader = + createReader(readLocalFile(ss.str().c_str()), readerOpts); + EXPECT_EQ(WriterId::ORC_CPP_WRITER, reader->getWriterId()); + EXPECT_EQ(softwareVersion, reader->getSoftwareVersion()); + + // Create SearchArgument with a EQUALS predicate which can leverage the bloom filters. + RowReaderOptions rowReaderOpts; + std::unique_ptr<SearchArgumentBuilder> sarg = SearchArgumentFactory::newBuilder(); + // Integer value 18000000000 has an inconsistent hash before the fix of ORC-1024. + sarg->equals(1, PredicateDataType::LONG,Literal(static_cast<int64_t>(18000000000L))); + std::unique_ptr<SearchArgument> final_sarg = sarg->build(); + rowReaderOpts.searchArgument(std::move(final_sarg)); + std::unique_ptr<RowReader> rowReader = reader->createRowReader(rowReaderOpts); + + // Make sure bad bloom filters won't affect the results. + std::unique_ptr<ColumnVectorBatch> batch = + rowReader->createRowBatch(1024); + EXPECT_TRUE(rowReader->next(*batch)); + EXPECT_EQ(5, batch->numElements); + EXPECT_FALSE(rowReader->next(*batch)); + } + + TEST(TestRowReader, testSkipBadBloomFilters) { + CheckFileWithSargs("bad_bloom_filter_1.6.11.orc", "ORC C++ 1.6.11"); + CheckFileWithSargs("bad_bloom_filter_1.6.0.orc", "ORC C++"); + CheckFileWithSargs("bad_bloom_filter_1.6.11-SNAPSHOT.orc", "ORC C++ 1.6.11-SNAPSHOT"); Review comment: Shall we remove this because we have `bad_bloom_filter_1.6.11.orc`? ########## File path: c++/src/Reader.cc ########## @@ -35,6 +35,12 @@ #include <set> namespace orc { + // ORC files writen by these versions of cpp writers have inconsistent bloom filter + // hashing with the Java codes (ORC-1024). Bloom filters of them should not be used + // after we fix ORC-1024. Review comment: Let's remove this line. ########## File path: c++/src/Reader.cc ########## @@ -243,6 +249,34 @@ namespace orc { footer->rowindexstride(), getWriterVersionImpl(_contents.get()))); } + + hasBadBloomFilters = checkBadBloomFilters(); + } + + // Check if the file has inconsistent bloom filters (ORC-1024). Review comment: Shall we remove `(ORC-1024)` from this sentence? ########## File path: c++/test/TestReader.cc ########## @@ -101,4 +101,41 @@ namespace orc { 900, rowsInCurrentStripe, rowIndexStride, includedRowGroups)); } + void CheckFileWithSargs(const char* fileName, const char* softwareVersion) { + std::stringstream ss; + if(const char* example_dir = std::getenv("ORC_EXAMPLE_DIR")) { + ss << example_dir; + } else { + ss << "../../../examples"; + } + // Read a file with bloom filters written by CPP writer in version 1.6.11. + ss << "/" << fileName; + ReaderOptions readerOpts; + std::unique_ptr<Reader> reader = + createReader(readLocalFile(ss.str().c_str()), readerOpts); + EXPECT_EQ(WriterId::ORC_CPP_WRITER, reader->getWriterId()); + EXPECT_EQ(softwareVersion, reader->getSoftwareVersion()); + + // Create SearchArgument with a EQUALS predicate which can leverage the bloom filters. + RowReaderOptions rowReaderOpts; + std::unique_ptr<SearchArgumentBuilder> sarg = SearchArgumentFactory::newBuilder(); + // Integer value 18000000000 has an inconsistent hash before the fix of ORC-1024. + sarg->equals(1, PredicateDataType::LONG,Literal(static_cast<int64_t>(18000000000L))); + std::unique_ptr<SearchArgument> final_sarg = sarg->build(); + rowReaderOpts.searchArgument(std::move(final_sarg)); + std::unique_ptr<RowReader> rowReader = reader->createRowReader(rowReaderOpts); + + // Make sure bad bloom filters won't affect the results. + std::unique_ptr<ColumnVectorBatch> batch = + rowReader->createRowBatch(1024); + EXPECT_TRUE(rowReader->next(*batch)); + EXPECT_EQ(5, batch->numElements); + EXPECT_FALSE(rowReader->next(*batch)); + } + + TEST(TestRowReader, testSkipBadBloomFilters) { + CheckFileWithSargs("bad_bloom_filter_1.6.11.orc", "ORC C++ 1.6.11"); + CheckFileWithSargs("bad_bloom_filter_1.6.0.orc", "ORC C++"); + CheckFileWithSargs("bad_bloom_filter_1.6.11-SNAPSHOT.orc", "ORC C++ 1.6.11-SNAPSHOT"); Review comment: If this is for the `SNAPSHOT` postfix version check test coverage, `1.6.12-SNAPSHOT` is better. However, I'd like to recommend to write and read in a test instead of having `-SNAPSHOT` static files. ########## File path: c++/src/Reader.hh ########## @@ -179,6 +180,13 @@ namespace orc { */ void seekToRowGroup(uint32_t rowGroupEntryId); + /** + * Check if the file has bad bloom filters (ORC-1024). We will skip using them in the Review comment: Please remove `(ORC-1024)` from this sentence. ########## File path: c++/src/Reader.hh ########## @@ -179,6 +180,13 @@ namespace orc { */ void seekToRowGroup(uint32_t rowGroupEntryId); + /** + * Check if the file has bad bloom filters (ORC-1024). We will skip using them in the + * following reads. + * @return true if it has. + */ + bool checkBadBloomFilters(); Review comment: For naming, `hasBadBloomFilters` is better in `Reader.hh`. Although I understand that you use `checkXXX` here because you already have a member variable `checkBadBloomFilters`, we had better focus on the public interface more. ########## File path: c++/src/Reader.hh ########## @@ -179,6 +180,13 @@ namespace orc { */ void seekToRowGroup(uint32_t rowGroupEntryId); + /** + * Check if the file has bad bloom filters (ORC-1024). We will skip using them in the + * following reads. + * @return true if it has. + */ + bool checkBadBloomFilters(); Review comment: For naming, `hasBadBloomFilters` is better in `Reader.hh`. Although I understand that you use `checkXXX` here because you already have a member variable `checkBadBloomFilters`, we had better focus on the public interface more importantly. ########## File path: java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java ########## @@ -109,6 +109,11 @@ // identifies that follow columns bytes must be read private boolean needsFollowColumnsRead; private final boolean noSelectedVector; + // identifies whether the file has bad bloom filters that we should not use (ORC-1024). Review comment: Please remove `(ORC-1024)`. ########## File path: java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java ########## @@ -350,6 +356,33 @@ protected RecordReaderImpl(ReaderImpl fileReader, } } + /** + * Check if the file has inconsistent bloom filters (ORC-1024). We will skip using them Review comment: Please remove `(ORC-1024)`. ########## File path: java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java ########## @@ -350,6 +356,33 @@ protected RecordReaderImpl(ReaderImpl fileReader, } } + /** + * Check if the file has inconsistent bloom filters (ORC-1024). We will skip using them + * in the following reads. + * @return true if it has. + */ + private boolean checkBadBloomFilters(OrcProto.Footer footer) { Review comment: Thank you for adding this as `private`. ########## File path: java/core/src/java/org/apache/orc/util/BloomFilter.java ########## @@ -246,6 +246,13 @@ public void reset() { this.bitSet.clear(); } + /** + * Helper method that only used for tests. Use default visibility. + */ + boolean testBitSetPos(int pos) { Review comment: If this is used only once in `checkBitSet`, let's move this to a test class like `checkBitSet`. ########## File path: java/core/src/test/org/apache/orc/impl/TestReaderImpl.java ########## @@ -411,4 +415,35 @@ public void testGetRawDataSizeFromColIndices() throws Exception { ReaderImpl.getRawDataSizeFromColIndices(list, types, stats)); } } + + private void CheckFileWithSargs(String fileName, String softwareVersion) + throws IOException { + Configuration conf = new Configuration(); + Path path = new Path(workDir, fileName); + FileSystem fs = path.getFileSystem(conf); + try (ReaderImpl reader = (ReaderImpl) OrcFile.createReader(path, + OrcFile.readerOptions(conf).filesystem(fs))) { + assertEquals(softwareVersion, reader.getSoftwareVersion()); + + Reader.Options opt = new Reader.Options(); + SearchArgument.Builder builder = SearchArgumentFactory.newBuilder(conf); + builder.equals("id", PredicateLeaf.Type.LONG, 18000000000L); + opt.searchArgument(builder.build(), new String[]{"id"}); + + TypeDescription schema = reader.getSchema(); + VectorizedRowBatch batch = schema.createRowBatch(); + try (RecordReader rows = reader.rows(opt)) { + assertTrue(rows.nextBatch(batch), "No rows read out!"); + assertEquals(5, batch.size); + assertFalse(rows.nextBatch(batch)); + } + } + } + + @Test + public void testSkipBadBloomFilters() throws IOException { + CheckFileWithSargs("bad_bloom_filter_1.6.11.orc", "ORC C++ 1.6.11"); + CheckFileWithSargs("bad_bloom_filter_1.6.0.orc", "ORC C++ "); + CheckFileWithSargs("bad_bloom_filter_1.6.11-SNAPSHOT.orc", "ORC C++ 1.6.11-SNAPSHOT"); Review comment: ditto. Please make a file dynamically if you want to test `-SNAPSHOT` postfix. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: dev-unsubscr...@orc.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org