[ 
https://issues.apache.org/jira/browse/ORC-24?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=14655308#comment-14655308
 ] 

ASF GitHub Bot commented on ORC-24:
-----------------------------------

Github user asandryh commented on a diff in the pull request:

    https://github.com/apache/orc/pull/6#discussion_r36298379
  
    --- Diff: c++/test/TestColumnReader.cc ---
    @@ -1029,6 +1029,65 @@ TEST(TestColumnReader, 
testStringDirectShortBufferWithNulls) {
       }
     }
     
    +TEST(TestColumnReader, testStringDirectNullAcrossWindow) {
    +  MockStripeStreams streams;
    +
    +  // set getSelectedColumns()
    +  std::vector<bool> selectedColumns(2, true);
    +  EXPECT_CALL(streams, getSelectedColumns())
    +      .WillRepeatedly(testing::Return(selectedColumns));
    +
    +  // set getEncoding
    +  proto::ColumnEncoding directEncoding;
    +  directEncoding.set_kind(proto::ColumnEncoding_Kind_DIRECT);
    +  EXPECT_CALL(streams, getEncoding(testing::_))
    +      .WillRepeatedly(testing::Return(directEncoding));
    +
    +  // set getStream
    +  EXPECT_CALL(streams, getStreamProxy(0, proto::Stream_Kind_PRESENT, true))
    +      .WillRepeatedly(testing::Return(nullptr));
    +
    +  const unsigned char isNull[2] = {0xff, 0x7f};
    +  EXPECT_CALL(streams, getStreamProxy(1, proto::Stream_Kind_PRESENT, true))
    +    .WillRepeatedly(testing::Return
    +                    (new SeekableArrayInputStream(isNull,
    +                                                  ARRAY_SIZE(isNull))));
    +
    +  const char blob[] = "abcdefg";
    +  EXPECT_CALL(streams, getStreamProxy(1, proto::Stream_Kind_DATA, true))
    +      .WillRepeatedly(testing::Return(new SeekableArrayInputStream
    +                                      (blob, ARRAY_SIZE(blob), 4)));
    +
    +  // [1] * 7
    +  const unsigned char lenData[] = {0x04, 0x00, 0x01};
    +  EXPECT_CALL(streams, getStreamProxy(1, proto::Stream_Kind_LENGTH, true))
    +      .WillRepeatedly(testing::Return(new SeekableArrayInputStream
    +                                      (lenData, ARRAY_SIZE(lenData))));
    +
    +  // create the row type
    +  std::unique_ptr<Type> rowType = createStructType();
    +  rowType->addStructField(createPrimitiveType(STRING), "col0");
    +  rowType->assignIds(0);
    +
    +  std::unique_ptr<ColumnReader> reader = buildReader(*rowType, streams);
    +
    +  StructVectorBatch batch(25, *getDefaultPool());
    +  StringVectorBatch *strings = new StringVectorBatch(25, 
*getDefaultPool());
    +  batch.fields.push_back(strings);
    +  strings->length[0] = 5;
    --- End diff --
    
    It is probably worth adding a comment here that the first element in this 
batch is null, so its length shouldn't matter (that is exactly what this unit 
test confirms).


> C++ reader for direct string encodings occasionally skips bytes 
> ----------------------------------------------------------------
>
>                 Key: ORC-24
>                 URL: https://issues.apache.org/jira/browse/ORC-24
>             Project: Orc
>          Issue Type: Bug
>          Components: C++
>            Reporter: Owen O'Malley
>            Assignee: Owen O'Malley
>             Fix For: 1.0.0
>
>         Attachments: ORC-24.patch
>
>
> The ORC C++ direct string column reader can occasionally skip bytes in the 
> blob stream.
> The necessary conditions are:
> * The column is a string column and is directly encoded.
> * The blob stream for the row batch crosses a compression block boundary.
> * There is a null value toward the end of the block.
> * The length entry for the null value crosses the block boundary, 
> but the length of the following value does not.



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)

Reply via email to