[
https://issues.apache.org/jira/browse/ORC-24?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=14655308#comment-14655308
]
ASF GitHub Bot commented on ORC-24:
-----------------------------------
Github user asandryh commented on a diff in the pull request:
https://github.com/apache/orc/pull/6#discussion_r36298379
--- Diff: c++/test/TestColumnReader.cc ---
@@ -1029,6 +1029,65 @@ TEST(TestColumnReader,
testStringDirectShortBufferWithNulls) {
}
}
+TEST(TestColumnReader, testStringDirectNullAcrossWindow) {
+ MockStripeStreams streams;
+
+ // set getSelectedColumns()
+ std::vector<bool> selectedColumns(2, true);
+ EXPECT_CALL(streams, getSelectedColumns())
+ .WillRepeatedly(testing::Return(selectedColumns));
+
+ // set getEncoding
+ proto::ColumnEncoding directEncoding;
+ directEncoding.set_kind(proto::ColumnEncoding_Kind_DIRECT);
+ EXPECT_CALL(streams, getEncoding(testing::_))
+ .WillRepeatedly(testing::Return(directEncoding));
+
+ // set getStream
+ EXPECT_CALL(streams, getStreamProxy(0, proto::Stream_Kind_PRESENT, true))
+ .WillRepeatedly(testing::Return(nullptr));
+
+ const unsigned char isNull[2] = {0xff, 0x7f};
+ EXPECT_CALL(streams, getStreamProxy(1, proto::Stream_Kind_PRESENT, true))
+ .WillRepeatedly(testing::Return
+ (new SeekableArrayInputStream(isNull,
+ ARRAY_SIZE(isNull))));
+
+ const char blob[] = "abcdefg";
+ EXPECT_CALL(streams, getStreamProxy(1, proto::Stream_Kind_DATA, true))
+ .WillRepeatedly(testing::Return(new SeekableArrayInputStream
+ (blob, ARRAY_SIZE(blob), 4)));
+
+ // [1] * 7
+ const unsigned char lenData[] = {0x04, 0x00, 0x01};
+ EXPECT_CALL(streams, getStreamProxy(1, proto::Stream_Kind_LENGTH, true))
+ .WillRepeatedly(testing::Return(new SeekableArrayInputStream
+ (lenData, ARRAY_SIZE(lenData))));
+
+ // create the row type
+ std::unique_ptr<Type> rowType = createStructType();
+ rowType->addStructField(createPrimitiveType(STRING), "col0");
+ rowType->assignIds(0);
+
+ std::unique_ptr<ColumnReader> reader = buildReader(*rowType, streams);
+
+ StructVectorBatch batch(25, *getDefaultPool());
+ StringVectorBatch *strings = new StringVectorBatch(25,
*getDefaultPool());
+ batch.fields.push_back(strings);
+ strings->length[0] = 5;
--- End diff --
It is probably worth adding a comment here noting that the first element in this
batch is null, so its length shouldn't matter (which is exactly what this unit
test confirms).
> C++ reader for direct string encodings occasionally skips bytes
> ----------------------------------------------------------------
>
> Key: ORC-24
> URL: https://issues.apache.org/jira/browse/ORC-24
> Project: Orc
> Issue Type: Bug
> Components: C++
> Reporter: Owen O'Malley
> Assignee: Owen O'Malley
> Fix For: 1.0.0
>
> Attachments: ORC-24.patch
>
>
> The ORC C++ direct string column reader can occasionally skip bytes in the
> blob stream.
> The necessary conditions are:
> * The column is a string column and is directly encoded.
> * The blob stream for the row batch crosses a compression block boundary.
> * There is a null value toward the end of the block boundary.
> * The value in the length value of the null value crosses the block boundary,
> but the length value of the following value does not.
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)