wgtmac commented on code in PR #1469:
URL: https://github.com/apache/orc/pull/1469#discussion_r1166187841


##########
c++/src/ColumnReader.cc:
##########
@@ -100,13 +100,16 @@ namespace orc {
           return;
         }
       }
+      rowBatch.hasNulls = false;
     } else if (incomingMask) {
       // If we don't have a notNull stream, copy the incomingMask
       rowBatch.hasNulls = true;
       memcpy(rowBatch.notNull.data(), incomingMask, numValues);
       return;
+    } else {
+      memset(rowBatch.notNull.data(), 1, numValues);

Review Comment:
   Adding an extra `memset` here does not break the contract but introduces a 
little bit performance penalty for non-null values.



##########
c++/src/ColumnReader.cc:
##########
@@ -100,13 +100,16 @@ namespace orc {
           return;
         }
       }
+      rowBatch.hasNulls = false;
     } else if (incomingMask) {
       // If we don't have a notNull stream, copy the incomingMask
       rowBatch.hasNulls = true;
       memcpy(rowBatch.notNull.data(), incomingMask, numValues);
       return;
+    } else {
+      memset(rowBatch.notNull.data(), 1, numValues);

Review Comment:
   The contract of `ColumnVectorBatch` is that we should not depend on values 
from `ColumnVectorBatch.notNull` buffer if `hasNulls == false`. Probably we 
should update the site to explicitly state this: 
https://orc.apache.org/docs/core-cpp.html 



##########
c++/test/TestColumnReader.cc:
##########
@@ -4207,6 +4207,101 @@ namespace orc {
     }
   }
 
+  // ORC-1408: Row batch not reset notNull data if strip doesn't have PRESENT 
stream.
+  TEST(TestColumnReader, testORC1408) {
+    // reuse same StructVectorBatch
+    LongVectorBatch* longBatch = new LongVectorBatch(1024, *getDefaultPool());
+    StructVectorBatch batch(1024, *getDefaultPool());
+    batch.fields.push_back(longBatch);
+
+    // create the row type
+    std::unique_ptr<Type> rowType = createStructType();
+    rowType->addStructField("myInt", createPrimitiveType(INT));
+
+    // read integer with nulls
+    {
+      MockStripeStreams streams;
+
+      // set getSelectedColumns()
+      std::vector<bool> selectedColumns(2, true);
+
+      EXPECT_CALL(streams, 
getSelectedColumns()).WillRepeatedly(testing::Return(selectedColumns));
+
+      // set getEncoding
+      proto::ColumnEncoding directEncoding;
+      directEncoding.set_kind(proto::ColumnEncoding_Kind_DIRECT);
+      EXPECT_CALL(streams, 
getEncoding(testing::_)).WillRepeatedly(testing::Return(directEncoding));
+
+      // set getStream
+      EXPECT_CALL(streams, getStreamProxy(0, proto::Stream_Kind_PRESENT, true))
+          .WillRepeatedly(testing::Return(nullptr));
+      const unsigned char buffer1[] = {0x16, 0xf0};
+      EXPECT_CALL(streams, getStreamProxy(1, proto::Stream_Kind_PRESENT, true))
+          .WillRepeatedly(
+              testing::Return(new SeekableArrayInputStream(buffer1, 
ARRAY_SIZE(buffer1))));
+      const unsigned char buffer2[] = {0x32, 0x01, 0x00};
+      EXPECT_CALL(streams, getStreamProxy(1, proto::Stream_Kind_DATA, true))
+          .WillRepeatedly(
+              testing::Return(new SeekableArrayInputStream(buffer2, 
ARRAY_SIZE(buffer2))));
+
+      std::unique_ptr<ColumnReader> reader = buildReader(*rowType, streams);
+
+      reader->next(batch, 100, 0);
+      ASSERT_EQ(100, batch.numElements);
+      ASSERT_EQ(true, !batch.hasNulls);
+      ASSERT_EQ(100, longBatch->numElements);
+      ASSERT_EQ(true, longBatch->hasNulls);
+      long next = 0;
+      for (size_t i = 0; i < batch.numElements; ++i) {
+        if (i & 4) {
+          EXPECT_EQ(0, longBatch->notNull[i]);
+        } else {
+          EXPECT_EQ(1, longBatch->notNull[i]);
+          EXPECT_EQ(next++, longBatch->data[i]);
+        }
+      }
+    }
+
+    // read no-null integers without PRESENT stream.
+    {
+      MockStripeStreams streams;
+
+      // set getSelectedColumns()
+      std::vector<bool> selectedColumns(2, true);
+
+      EXPECT_CALL(streams, 
getSelectedColumns()).WillRepeatedly(testing::Return(selectedColumns));
+
+      // set getEncoding
+      proto::ColumnEncoding directEncoding;
+      directEncoding.set_kind(proto::ColumnEncoding_Kind_DIRECT);
+      EXPECT_CALL(streams, getEncoding(testing::_))
+          .WillRepeatedly(testing::Return(directEncoding));
+
+      // set getStream
+      EXPECT_CALL(streams, getStreamProxy(0, proto::Stream_Kind_PRESENT, true))
+          .WillRepeatedly(testing::Return(nullptr));
+      EXPECT_CALL(streams, getStreamProxy(1, proto::Stream_Kind_PRESENT, true))
+          .WillRepeatedly(testing::Return(nullptr));
+      const unsigned char buffer[] = {0x64, 0x01, 0x00};
+      EXPECT_CALL(streams, getStreamProxy(1, proto::Stream_Kind_DATA, true))
+          .WillRepeatedly(
+              testing::Return(new SeekableArrayInputStream(buffer, 
ARRAY_SIZE(buffer))));
+
+      std::unique_ptr<ColumnReader> reader = buildReader(*rowType, streams);
+
+      reader->next(batch, 100, 0);
+      ASSERT_EQ(100, batch.numElements);
+      ASSERT_EQ(true, !batch.hasNulls);
+      ASSERT_EQ(100, longBatch->numElements);
+      ASSERT_EQ(true, !longBatch->hasNulls);
+      int next = 0;
+      for (size_t i = 0; i < batch.numElements; ++i) {
+        EXPECT_EQ(1, longBatch->notNull[i]);

Review Comment:
   Here is related to what I mentioned above. We should not read 
`longBatch->notNull` if `longBatch->hasNulls` is false.
   
   Not sure if the java side follows the same contract. @dongjoon-hyun 
@guiyanakuang 



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to