guiyanakuang commented on code in PR #1469:
URL: https://github.com/apache/orc/pull/1469#discussion_r1166553517


##########
c++/test/TestColumnReader.cc:
##########
@@ -4207,6 +4207,101 @@ namespace orc {
     }
   }
 
+  // ORC-1408: Row batch not reset notNull data if strip doesn't have PRESENT 
stream.
+  TEST(TestColumnReader, testORC1408) {
+    // reuse same StructVectorBatch
+    LongVectorBatch* longBatch = new LongVectorBatch(1024, *getDefaultPool());
+    StructVectorBatch batch(1024, *getDefaultPool());
+    batch.fields.push_back(longBatch);
+
+    // create the row type
+    std::unique_ptr<Type> rowType = createStructType();
+    rowType->addStructField("myInt", createPrimitiveType(INT));
+
+    // read integer with nulls
+    {
+      MockStripeStreams streams;
+
+      // set getSelectedColumns()
+      std::vector<bool> selectedColumns(2, true);
+
+      EXPECT_CALL(streams, 
getSelectedColumns()).WillRepeatedly(testing::Return(selectedColumns));
+
+      // set getEncoding
+      proto::ColumnEncoding directEncoding;
+      directEncoding.set_kind(proto::ColumnEncoding_Kind_DIRECT);
+      EXPECT_CALL(streams, 
getEncoding(testing::_)).WillRepeatedly(testing::Return(directEncoding));
+
+      // set getStream
+      EXPECT_CALL(streams, getStreamProxy(0, proto::Stream_Kind_PRESENT, true))
+          .WillRepeatedly(testing::Return(nullptr));
+      const unsigned char buffer1[] = {0x16, 0xf0};
+      EXPECT_CALL(streams, getStreamProxy(1, proto::Stream_Kind_PRESENT, true))
+          .WillRepeatedly(
+              testing::Return(new SeekableArrayInputStream(buffer1, 
ARRAY_SIZE(buffer1))));
+      const unsigned char buffer2[] = {0x32, 0x01, 0x00};
+      EXPECT_CALL(streams, getStreamProxy(1, proto::Stream_Kind_DATA, true))
+          .WillRepeatedly(
+              testing::Return(new SeekableArrayInputStream(buffer2, 
ARRAY_SIZE(buffer2))));
+
+      std::unique_ptr<ColumnReader> reader = buildReader(*rowType, streams);
+
+      reader->next(batch, 100, 0);
+      ASSERT_EQ(100, batch.numElements);
+      ASSERT_EQ(true, !batch.hasNulls);
+      ASSERT_EQ(100, longBatch->numElements);
+      ASSERT_EQ(true, longBatch->hasNulls);
+      long next = 0;
+      for (size_t i = 0; i < batch.numElements; ++i) {
+        if (i & 4) {
+          EXPECT_EQ(0, longBatch->notNull[i]);
+        } else {
+          EXPECT_EQ(1, longBatch->notNull[i]);
+          EXPECT_EQ(next++, longBatch->data[i]);
+        }
+      }
+    }
+
+    // read no-null integers without PRESENT stream.
+    {
+      MockStripeStreams streams;
+
+      // set getSelectedColumns()
+      std::vector<bool> selectedColumns(2, true);
+
+      EXPECT_CALL(streams, 
getSelectedColumns()).WillRepeatedly(testing::Return(selectedColumns));
+
+      // set getEncoding
+      proto::ColumnEncoding directEncoding;
+      directEncoding.set_kind(proto::ColumnEncoding_Kind_DIRECT);
+      EXPECT_CALL(streams, getEncoding(testing::_))
+          .WillRepeatedly(testing::Return(directEncoding));
+
+      // set getStream
+      EXPECT_CALL(streams, getStreamProxy(0, proto::Stream_Kind_PRESENT, true))
+          .WillRepeatedly(testing::Return(nullptr));
+      EXPECT_CALL(streams, getStreamProxy(1, proto::Stream_Kind_PRESENT, true))
+          .WillRepeatedly(testing::Return(nullptr));
+      const unsigned char buffer[] = {0x64, 0x01, 0x00};
+      EXPECT_CALL(streams, getStreamProxy(1, proto::Stream_Kind_DATA, true))
+          .WillRepeatedly(
+              testing::Return(new SeekableArrayInputStream(buffer, 
ARRAY_SIZE(buffer))));
+
+      std::unique_ptr<ColumnReader> reader = buildReader(*rowType, streams);
+
+      reader->next(batch, 100, 0);
+      ASSERT_EQ(100, batch.numElements);
+      ASSERT_EQ(true, !batch.hasNulls);
+      ASSERT_EQ(100, longBatch->numElements);
+      ASSERT_EQ(true, !longBatch->hasNulls);
+      int next = 0;
+      for (size_t i = 0; i < batch.numElements; ++i) {
+        EXPECT_EQ(1, longBatch->notNull[i]);

Review Comment:
   I searched for cases in the Java ORC project where `ColumnVector.noNulls` 
and `ColumnVector.isNull` are used. Most of the cases follow this contract, but 
unfortunately, there are still some counterexamples.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to