emkornfield commented on a change in pull request #8648:
URL: https://github.com/apache/arrow/pull/8648#discussion_r567602645
##########
File path: cpp/src/arrow/adapters/orc/adapter.cc
##########
@@ -473,6 +474,107 @@ int64_t ORCFileReader::NumberOfStripes() { return impl_->NumberOfStripes(); }
 
 int64_t ORCFileReader::NumberOfRows() { return impl_->NumberOfRows(); }
 
+class ArrowOutputStream : public liborc::OutputStream {
+ public:
+  explicit ArrowOutputStream(arrow::io::OutputStream& output_stream)
+      : output_stream_(output_stream), length_(0) {}
+
+  uint64_t getLength() const override { return length_; }
+
+  uint64_t getNaturalWriteSize() const override { return ORC_NATURAL_WRITE_SIZE; }
+
+  void write(const void* buf, size_t length) override {
+    ORC_THROW_NOT_OK(output_stream_.Write(buf, static_cast<int64_t>(length)));
+    length_ += static_cast<int64_t>(length);
+  }
+
+  const std::string& getName() const override {
+    static const std::string filename("ArrowOutputFile");
+    return filename;
+  }
+
+  void close() override {
+    if (!output_stream_.closed()) {
+      ORC_THROW_NOT_OK(output_stream_.Close());
+    }
+  }
+
+  void set_length(int64_t length) { length_ = length; }
+
+ private:
+  arrow::io::OutputStream& output_stream_;
+  int64_t length_;
+};
+
+class ORCFileWriter::Impl {
+ public:
+  Status Open(arrow::io::OutputStream& output_stream) {
+    out_stream_ = std::unique_ptr<liborc::OutputStream>(
+        static_cast<liborc::OutputStream*>(new ArrowOutputStream(output_stream)));
+    return Status::OK();
+  }
+  Status Write(const Table& table) {
+    std::unique_ptr<liborc::WriterOptions> orc_options =
+        std::unique_ptr<liborc::WriterOptions>(new liborc::WriterOptions());
+    std::unique_ptr<liborc::Type> orc_schema;
+    RETURN_NOT_OK(GetORCType(*(table.schema()), &orc_schema));
+    try {
+      writer_ = createWriter(*orc_schema, out_stream_.get(), *orc_options);
+    } catch (const liborc::ParseError& e) {
+      return Status::IOError(e.what());
+    }
+    int64_t num_rows = table.num_rows();
+    const int num_cols_ = table.num_columns();
+    const int64_t batch_size = 1024;  // Doesn't matter what it is

Review comment:
Typically batch sizes have implications for memory and file layout, so a better explanation would be useful for readers not familiar with ORC. 1024 is actually pretty small if only 1024 rows are stored per group. Typically you would want to size batches by memory usage, but that can be difficult, so a larger batch size could potentially be better.
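To make the batch-size discussion concrete, here is a minimal, self-contained sketch of a liborc write loop, not the PR's Arrow-to-ORC conversion path. The `struct<x:bigint>` schema, the 64 * 1024 batch size, the 64 MiB stripe size, the `/tmp/example.orc` path, and the `namespace liborc = orc;` alias are illustrative assumptions. Under the assumption that liborc buffers added rows internally and decides stripe boundaries from `WriterOptions`, the batch size mainly caps how many rows are materialized in memory per `add()` call.

```cpp
// Sketch: write 1,000,000 int64 rows in batches of `batch_size` with liborc.
// The alias below mirrors the style of the Arrow adapter; everything else
// uses the public liborc headers.
#include <algorithm>
#include <cstdint>
#include <memory>

#include <orc/OrcFile.hh>

namespace liborc = orc;

int main() {
  std::unique_ptr<liborc::OutputStream> out_stream =
      liborc::writeLocalFile("/tmp/example.orc");  // illustrative output path
  std::unique_ptr<liborc::Type> schema =
      liborc::Type::buildTypeFromString("struct<x:bigint>");

  liborc::WriterOptions options;
  options.setStripeSize(64 * 1024 * 1024);  // on-disk stripe target, set separately

  std::unique_ptr<liborc::Writer> writer =
      liborc::createWriter(*schema, out_stream.get(), options);

  // batch_size bounds how many rows sit in the ColumnVectorBatch per add()
  // call: larger batches mean fewer calls but more memory per batch.
  const uint64_t batch_size = 64 * 1024;
  const uint64_t total_rows = 1000 * 1000;

  std::unique_ptr<liborc::ColumnVectorBatch> batch =
      writer->createRowBatch(batch_size);
  auto* root = static_cast<liborc::StructVectorBatch*>(batch.get());
  auto* col = static_cast<liborc::LongVectorBatch*>(root->fields[0]);

  uint64_t rows_written = 0;
  while (rows_written < total_rows) {
    const uint64_t n = std::min(batch_size, total_rows - rows_written);
    for (uint64_t i = 0; i < n; ++i) {
      col->data[i] = static_cast<int64_t>(rows_written + i);
    }
    root->numElements = n;
    col->numElements = n;
    writer->add(*batch);  // liborc buffers rows and flushes stripes on its own
    rows_written += n;
  }
  writer->close();
  return 0;
}
```

Under these assumptions, raising the batch size from 1024 to something like 64K mainly trades per-`add()` call overhead against a proportionally larger in-memory batch, which is one way to reason about the memory point raised in the comment above.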