lidavidm commented on a change in pull request #10230:
URL: https://github.com/apache/arrow/pull/10230#discussion_r630431388
##########
File path: cpp/src/arrow/csv/writer.cc
##########
@@ -282,65 +283,79 @@ Result<std::unique_ptr<ColumnPopulator>> MakePopulator(const Field& field, char
   return std::unique_ptr<ColumnPopulator>(factory.populator);
 }
-class CSVConverter {
+class CSVConverter : public ipc::RecordBatchWriter {
  public:
-  static Result<std::unique_ptr<CSVConverter>> Make(std::shared_ptr<Schema> schema,
-                                                    MemoryPool* pool) {
+  static Result<std::shared_ptr<CSVConverter>> Make(
+      io::OutputStream* sink, std::shared_ptr<io::OutputStream> owned_sink,
+      std::shared_ptr<Schema> schema, MemoryPool* pool, const WriteOptions& options) {
+    if (!pool) pool = default_memory_pool();
     std::vector<std::unique_ptr<ColumnPopulator>> populators(schema->num_fields());
     for (int col = 0; col < schema->num_fields(); col++) {
       char end_char = col < schema->num_fields() - 1 ? ',' : '\n';
       ASSIGN_OR_RAISE(populators[col], MakePopulator(*schema->field(col), end_char, pool));
     }
-    return std::unique_ptr<CSVConverter>(
-        new CSVConverter(std::move(schema), std::move(populators), pool));
+    auto writer = std::shared_ptr<CSVConverter>(
+        new CSVConverter(sink, std::move(owned_sink), std::move(schema),
+                         std::move(populators), pool, options));
+    if (options.include_header) {
+      RETURN_NOT_OK(writer->PrepareForContentsWrite());
+      RETURN_NOT_OK(writer->WriteHeader());
+    }
+    return writer;
   }
-  Status WriteCSV(const RecordBatch& batch, const WriteOptions& options,
-                  io::OutputStream* out) {
-    RETURN_NOT_OK(PrepareForContentsWrite(options, out));
-    RecordBatchIterator iterator = RecordBatchSliceIterator(batch, options.batch_size);
+  Status WriteRecordBatch(const RecordBatch& batch) override {
+    RETURN_NOT_OK(PrepareForContentsWrite());
+    RecordBatchIterator iterator = RecordBatchSliceIterator(batch, options_.batch_size);
     for (auto maybe_slice : iterator) {
       ASSIGN_OR_RAISE(std::shared_ptr<RecordBatch> slice, maybe_slice);
       RETURN_NOT_OK(TranslateMinimalBatch(*slice));
-      RETURN_NOT_OK(out->Write(data_buffer_));
+      RETURN_NOT_OK(sink_->Write(data_buffer_));
+      stats_.num_record_batches++;
     }
     return Status::OK();
   }
-  Status WriteCSV(const Table& table, const WriteOptions& options,
-                  io::OutputStream* out) {
+  Status WriteTable(const Table& table, int64_t max_chunksize) override {
     TableBatchReader reader(table);
-    reader.set_chunksize(options.batch_size);
-    RETURN_NOT_OK(PrepareForContentsWrite(options, out));
+    reader.set_chunksize(max_chunksize > 0 ? max_chunksize : options_.batch_size);
+    RETURN_NOT_OK(PrepareForContentsWrite());
     std::shared_ptr<RecordBatch> batch;
     RETURN_NOT_OK(reader.ReadNext(&batch));
     while (batch != nullptr) {
       RETURN_NOT_OK(TranslateMinimalBatch(*batch));
-      RETURN_NOT_OK(out->Write(data_buffer_));
+      RETURN_NOT_OK(sink_->Write(data_buffer_));
       RETURN_NOT_OK(reader.ReadNext(&batch));
+      stats_.num_record_batches++;
     }
     return Status::OK();
   }
+  Status Close() override { return Status::OK(); }
Review comment:
The IPC reader doesn't do this either, oddly. I guess it's not a Rust-style 'exclusively owned' sink but merely a 'keep this sink alive' reference. (Though that does raise the question: what's the point? Either you're the only one keeping the sink alive, in which case you should close it, or you aren't, in which case you don't need a shared_ptr. My guess is that holding a strong reference is simply less of a footgun than holding a potentially dangling one.)
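
For illustration, here's a minimal sketch of the 'keep this sink alive' pattern I mean. `Sink` and `Writer` are hypothetical names for this comment, not the actual Arrow types:

```cpp
#include <cstddef>
#include <memory>
#include <utility>

// Hypothetical sink interface, standing in for io::OutputStream.
class Sink {
 public:
  virtual ~Sink() = default;
  virtual void Write(const char* data, std::size_t nbytes) = 0;
  virtual void Close() = 0;
};

class Writer {
 public:
  // `sink` is what we actually write through; `owned_sink` (possibly null)
  // exists only to pin the sink's lifetime so `sink_` can't dangle while
  // this Writer is alive.
  Writer(Sink* sink, std::shared_ptr<Sink> owned_sink)
      : sink_(sink), owned_sink_(std::move(owned_sink)) {}

  void Write(const char* data, std::size_t nbytes) { sink_->Write(data, nbytes); }

  // Close() finalizes the writer but deliberately does NOT close the sink:
  // the writer never assumes exclusive ownership, even when it happens to
  // hold the last strong reference.
  void Close() { /* flush writer-internal state only */ }

 private:
  Sink* sink_;                        // non-owning handle used for writes
  std::shared_ptr<Sink> owned_sink_;  // keep-alive, not exclusive ownership
};
```

So the shared_ptr protects against the dangling-pointer case, but it still doesn't define who is eventually responsible for calling Close() on the sink.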
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]