emkornfield commented on a change in pull request #10230:
URL: https://github.com/apache/arrow/pull/10230#discussion_r631170861



##########
File path: cpp/src/arrow/csv/writer.cc
##########
@@ -282,65 +283,79 @@ Result<std::unique_ptr<ColumnPopulator>> MakePopulator(const Field& field, char
   return std::unique_ptr<ColumnPopulator>(factory.populator);
 }
 
-class CSVConverter {
+class CSVConverter : public ipc::RecordBatchWriter {
  public:
-  static Result<std::unique_ptr<CSVConverter>> Make(std::shared_ptr<Schema> schema,
-                                                    MemoryPool* pool) {
+  static Result<std::shared_ptr<CSVConverter>> Make(
+      io::OutputStream* sink, std::shared_ptr<io::OutputStream> owned_sink,
+      std::shared_ptr<Schema> schema, MemoryPool* pool, const WriteOptions& options) {
+    if (!pool) pool = default_memory_pool();
    std::vector<std::unique_ptr<ColumnPopulator>> populators(schema->num_fields());
     for (int col = 0; col < schema->num_fields(); col++) {
       char end_char = col < schema->num_fields() - 1 ? ',' : '\n';
       ASSIGN_OR_RAISE(populators[col],
                       MakePopulator(*schema->field(col), end_char, pool));
     }
-    return std::unique_ptr<CSVConverter>(
-        new CSVConverter(std::move(schema), std::move(populators), pool));
+    auto writer = std::shared_ptr<CSVConverter>(
+        new CSVConverter(sink, std::move(owned_sink), std::move(schema),
+                         std::move(populators), pool, options));
+    if (options.include_header) {
+      RETURN_NOT_OK(writer->PrepareForContentsWrite());
+      RETURN_NOT_OK(writer->WriteHeader());
+    }
+    return writer;
   }
 
-  Status WriteCSV(const RecordBatch& batch, const WriteOptions& options,
-                  io::OutputStream* out) {
-    RETURN_NOT_OK(PrepareForContentsWrite(options, out));
-    RecordBatchIterator iterator = RecordBatchSliceIterator(batch, options.batch_size);
+  Status WriteRecordBatch(const RecordBatch& batch) override {
+    RETURN_NOT_OK(PrepareForContentsWrite());
+    RecordBatchIterator iterator = RecordBatchSliceIterator(batch, options_.batch_size);
     for (auto maybe_slice : iterator) {
       ASSIGN_OR_RAISE(std::shared_ptr<RecordBatch> slice, maybe_slice);
       RETURN_NOT_OK(TranslateMinimalBatch(*slice));
-      RETURN_NOT_OK(out->Write(data_buffer_));
+      RETURN_NOT_OK(sink_->Write(data_buffer_));
+      stats_.num_record_batches++;
     }
     return Status::OK();
   }
 
-  Status WriteCSV(const Table& table, const WriteOptions& options,
-                  io::OutputStream* out) {
+  Status WriteTable(const Table& table, int64_t max_chunksize) override {
     TableBatchReader reader(table);
-    reader.set_chunksize(options.batch_size);
-    RETURN_NOT_OK(PrepareForContentsWrite(options, out));
+    reader.set_chunksize(max_chunksize > 0 ? max_chunksize : options_.batch_size);
+    RETURN_NOT_OK(PrepareForContentsWrite());
     std::shared_ptr<RecordBatch> batch;
     RETURN_NOT_OK(reader.ReadNext(&batch));
     while (batch != nullptr) {
       RETURN_NOT_OK(TranslateMinimalBatch(*batch));
-      RETURN_NOT_OK(out->Write(data_buffer_));
+      RETURN_NOT_OK(sink_->Write(data_buffer_));
       RETURN_NOT_OK(reader.ReadNext(&batch));
+      stats_.num_record_batches++;
     }
 
     return Status::OK();
   }
 
+  Status Close() override { return Status::OK(); }

Review comment:
       In other places I can think of (Buffer comes to mind), this is handled by passing a unique_ptr instead of a shared_ptr.
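
       For illustration, a minimal sketch of that pattern (the `Writer` name and overload set here are hypothetical, not this PR's API): one factory overload borrows the sink via a raw pointer, and a second takes a `unique_ptr` so the transfer of sole ownership is explicit at the call site.

```cpp
#include <memory>
#include <utility>

#include "arrow/io/interfaces.h"
#include "arrow/result.h"

namespace example {

// Hypothetical writer type, used only to illustrate the ownership pattern.
class Writer {
 public:
  // Borrowing overload: the caller keeps ownership, so `sink` must
  // outlive the writer.
  static arrow::Result<std::shared_ptr<Writer>> Make(
      arrow::io::OutputStream* sink) {
    return std::shared_ptr<Writer>(new Writer(sink, nullptr));
  }

  // Owning overload: taking a unique_ptr makes the (unshared) ownership
  // transfer explicit, as Buffer-style APIs do.
  static arrow::Result<std::shared_ptr<Writer>> Make(
      std::unique_ptr<arrow::io::OutputStream> owned_sink) {
    arrow::io::OutputStream* sink = owned_sink.get();
    return std::shared_ptr<Writer>(new Writer(sink, std::move(owned_sink)));
  }

 private:
  Writer(arrow::io::OutputStream* sink,
         std::unique_ptr<arrow::io::OutputStream> owned_sink)
      : sink_(sink), owned_sink_(std::move(owned_sink)) {}

  arrow::io::OutputStream* sink_;                        // always set
  std::unique_ptr<arrow::io::OutputStream> owned_sink_;  // null when borrowing
};

}  // namespace example
```

       With the `unique_ptr` overload a caller writes `Make(std::move(stream))`, which states plainly that the writer becomes the sole owner; a `shared_ptr` parameter leaves open whether the sink is shared elsewhere.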




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

