[ https://issues.apache.org/jira/browse/PARQUET-676?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15456307#comment-15456307 ]

Wes McKinney commented on PARQUET-676:
--------------------------------------

It might be worth diffing this code against the current Impala version to see
where the two have diverged (it would be odd if this bug existed there and had
never surfaced).
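
If anyone does that diff, a quick probe like the one below would show whether the
buffer-size estimate is the part that diverges. This is only a sketch: it assumes
the internal RleEncoder in parquet/util/rle-encoding.h (the Impala port), with a
MaxBufferSize(bit_width, num_values) estimate and a Put() that returns false once
the buffer is full; the header path and exact names may differ between the two trees.

// Sketch only: checks whether a buffer sized by RleEncoder::MaxBufferSize can
// actually hold the definition levels the repro below produces. The header
// path, namespace, and MaxBufferSize/Put signatures are assumptions here.
#include <cstdint>
#include <iostream>
#include <vector>
#include <parquet/util/rle-encoding.h>

int main() {
  const int num_values = 500;  // just past the 400 threshold from the repro
  const int bit_width = 1;     // definition levels for a single optional field
  int buffer_len = parquet::RleEncoder::MaxBufferSize(bit_width, num_values);
  std::vector<uint8_t> buffer(buffer_len);
  parquet::RleEncoder encoder(buffer.data(), buffer_len, bit_width);
  int accepted = 0;
  for (int i = 0; i < num_values; ++i) {
    // Same 1-in-10 null pattern as the repro below.
    if (!encoder.Put(i % 10 == 0 ? 0 : 1)) break;
    ++accepted;
  }
  std::cout << "MaxBufferSize=" << buffer_len << " bytes, Put accepted "
            << accepted << "/" << num_values << " levels" << std::endl;
  return 0;
}

If Put stops accepting values well before num_values even though MaxBufferSize was
honored, the sizing estimate (rather than the caller in levels.cc) is the likely culprit.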

> MAX_VALUES_PER_LITERAL_RUN causes RLE encoding failure
> ------------------------------------------------------
>
>                 Key: PARQUET-676
>                 URL: https://issues.apache.org/jira/browse/PARQUET-676
>             Project: Parquet
>          Issue Type: Bug
>          Components: parquet-cpp
>         Environment: Mac OSX
>            Reporter: Mark Schaefer
>
> The following code works for NUM_TO_ENCODE <= 400, but for anything larger it 
> fails with:
> Check failed: (encoded) == (num_buffered_values_)
> It appears to be related to how large an RLE buffer is allocated for level 
> encoding: Put fails at levels.cc:78, but there is no recovery from that and no 
> error indicating what the problem is. I'm assuming MAX_VALUES_PER_LITERAL_RUN 
> is somehow derived from the Parquet spec, but if so, there ought to be an 
> exception or some other error generated instead of a fatal check. This code 
> could also serve as the basis of a writer example.
> // Licensed to the Apache Software Foundation (ASF) under one
> // or more contributor license agreements.  See the NOTICE file
> // distributed with this work for additional information
> // regarding copyright ownership.  The ASF licenses this file
> // to you under the Apache License, Version 2.0 (the
> // "License"); you may not use this file except in compliance
> // with the License.  You may obtain a copy of the License at
> //
> //   http://www.apache.org/licenses/LICENSE-2.0
> //
> // Unless required by applicable law or agreed to in writing,
> // software distributed under the License is distributed on an
> // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
> // KIND, either express or implied.  See the License for the
> // specific language governing permissions and limitations
> // under the License.
> #include <cstdio>
> #include <cstring>
> #include <iostream>
> #include <memory>
> #include <string>
> #include <vector>
> #include <parquet/api/writer.h>
> using namespace parquet;
> int main(int argc, char** argv) {
>   if (argc != 2) {
>     std::cerr << "Usage: " << argv[0] << " <file>" << std::endl;
>     return -1;
>   }
>   std::string filename = argv[1];
>   try {
>     // Works for NUM_TO_ENCODE <= 400; anything larger aborts with
>     // "Check failed: (encoded) == (num_buffered_values_)".
>     const int NUM_TO_ENCODE = 400;
>     std::shared_ptr<OutputStream> ostream(new LocalFileOutputStream(filename));
>     // Schema: required int32 "id", optional byte_array "name".
>     parquet::schema::NodeVector fields;
>     fields.push_back(parquet::schema::Int32("id", Repetition::REQUIRED));
>     fields.push_back(parquet::schema::ByteArray("name", Repetition::OPTIONAL));
>     parquet::schema::NodePtr schema =
>         parquet::schema::GroupNode::Make("schema", Repetition::REPEATED, fields);
>     std::unique_ptr<ParquetFileWriter> writer = ParquetFileWriter::Open(
>         ostream, std::dynamic_pointer_cast<parquet::schema::GroupNode>(schema));
>     RowGroupWriter* rgBlock = writer->AppendRowGroup(NUM_TO_ENCODE);
>     ColumnWriter* colBlock = rgBlock->NextColumn();
>     Int32Writer* intWriter = static_cast<Int32Writer*>(colBlock);
>     std::vector<int32_t> intbuf;
>     std::vector<int16_t> defbuf;
>     std::vector<ByteArray> strbuf;
>     for (int i = 0; i < NUM_TO_ENCODE; ++i) {
>       intbuf.push_back(i);
>       if (i % 10 == 0) {
>         // Every tenth "name" is null (definition level 0).
>         defbuf.push_back(0);
>       } else {
>         defbuf.push_back(1);
>         // Decimal string for i; the buffer is sized for any int value and is
>         // leaked on purpose to keep the repro short.
>         uint8_t* buf = new uint8_t[16];
>         snprintf(reinterpret_cast<char*>(buf), 16, "%d", i);
>         ByteArray ba;
>         ba.ptr = buf;
>         ba.len = strlen(reinterpret_cast<const char*>(ba.ptr));
>         strbuf.push_back(ba);
>       }
>     }
>     // Required column: no definition/repetition levels needed.
>     intWriter->WriteBatch(intbuf.size(), nullptr, nullptr, intbuf.data());
>     intWriter->Close();
>     // Optional column: definition levels are passed here, and this is where
>     // the RLE level encoding fails once the batch gets large enough.
>     colBlock = rgBlock->NextColumn();
>     ByteArrayWriter* strWriter = static_cast<ByteArrayWriter*>(colBlock);
>     std::cerr << "sizes: strings: " << strbuf.size()
>               << " definitions: " << defbuf.size() << std::endl;
>     strWriter->WriteBatch(defbuf.size(), defbuf.data(), nullptr, strbuf.data());
>     strWriter->Close();
>     writer->Close();
>   } catch (const std::exception& e) {
>     std::cerr << "Parquet error: " << e.what() << std::endl;
>     return -1;
>   }
>   return 0;
> }
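> To be concrete about the "exception or something" above: a guard along these 
> lines, instead of the fatal check, would at least surface the condition to the 
> caller. This is only a sketch; the helper and its call site are illustrative, 
> with ParquetException coming from parquet/exception.h.
> // Illustrative only: the kind of guard that could replace the fatal check in
> // levels.cc. Where it would be called from is an assumption; ParquetException
> // is the existing exception type in parquet/exception.h.
> #include <sstream>
> #include <parquet/exception.h>
> static void CheckAllLevelsEncoded(int encoded, int expected) {
>   if (encoded != expected) {
>     std::stringstream ss;
>     ss << "RLE level buffer too small: encoded " << encoded << " of "
>        << expected << " levels";
>     throw parquet::ParquetException(ss.str());
>   }
> }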



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)
