[GitHub] [arrow] pitrou commented on a change in pull request #12030: ARROW-9186: [R] Allow specifying CSV file encoding

GitBox Mon, 03 Jan 2022 11:50:40 -0800


pitrou commented on a change in pull request #12030:
URL: https://github.com/apache/arrow/pull/12030#discussion_r777673010




##########
File path: r/src/io.cpp
##########
@@ -178,4 +181,164 @@ void io___BufferOutputStream__Write(
   StopIfNotOk(stream->Write(RAW(bytes), bytes.size()));
 }
 
+// TransformInputStream::TransformFunc wrapper
+
+class RIconvWrapper {
+ public:
+  RIconvWrapper(std::string to, std::string from)
+      : handle_(Riconv_open(to.c_str(), from.c_str())) {
+    if (handle_ == ((void*)-1)) {
+      cpp11::stop("Can't convert encoding from '%s' to '%s'", from.c_str(), 
to.c_str());
+    }
+  }
+
+  size_t iconv(const uint8_t** inbuf, int64_t* inbytesleft, uint8_t** outbuf,
+               int64_t* outbytesleft) {
+    // This iconv signature uses the types that Arrow C++ uses to minimize
+    // deviations from the style guide; however, iconv() uses pointers
+    // to char* and size_t instead of uint8_t and int64_t.
+    size_t inbytesleft_size_t = *inbytesleft;
+    size_t outbytesleft_size_t = *outbytesleft;
+    const char** inbuf_const_char = reinterpret_cast<const char**>(inbuf);
+    char** outbuf_char = reinterpret_cast<char**>(outbuf);
+
+    size_t return_value = Riconv(handle_, inbuf_const_char, 
&inbytesleft_size_t,
+                                 outbuf_char, &outbytesleft_size_t);
+
+    *inbytesleft = inbytesleft_size_t;
+    *outbytesleft = outbytesleft_size_t;
+    return return_value;
+  }
+
+  ~RIconvWrapper() {
+    if (handle_ != ((void*)-1)) {
+      Riconv_close(handle_);
+    }
+  }
+
+ protected:
+  void* handle_;
+};
+
+struct ReencodeUTF8TransformFunctionWrapper {
+  explicit ReencodeUTF8TransformFunctionWrapper(std::string from)
+      : from_(from),
+        iconv_(std::make_shared<RIconvWrapper>("UTF-8", from)),
+        n_pending_(0) {}
+
+  arrow::Result<std::shared_ptr<arrow::Buffer>> operator()(
+      const std::shared_ptr<arrow::Buffer>& src) {
+    int64_t initial_size = std::min<int64_t>((src->size() + 8 * 1.2), 32);
+    ARROW_ASSIGN_OR_RAISE(auto dest, 
arrow::AllocateResizableBuffer(initial_size));
+
+    int64_t out_bytes_left = dest->size();
+    uint8_t* out_buf = dest->mutable_data();
+    int64_t out_bytes_used = 0;
+
+    int64_t in_bytes_left;
+    const uint8_t* in_buf;
+    int64_t n_src_bytes_in_pending = 0;
+
+    // There may be a few left over bytes from the last call to iconv.
+    // Process these first using the internal buffer (with as many bytes
+    // as possible added from src) as the source. This may also result in
+    // a partial character left over but will always get us into the src 
buffer.
+    if (n_pending_ > 0) {
+      n_src_bytes_in_pending =
+          std::min<int64_t>(sizeof(pending_) - n_pending_, src->size());
+      memcpy(pending_ + n_pending_, src->data(), n_src_bytes_in_pending);
+      in_buf = pending_;
+      in_bytes_left = n_pending_ + n_src_bytes_in_pending;
+
+      iconv_->iconv(&in_buf, &in_bytes_left, &out_buf, &out_bytes_left);
+
+      // Rather than check the error return code (which is often returned
+      // in the case of a partial character at the end of the pending_
+      // buffer), check that we have read enough characters to get into
+      // `src` (after which the loop below will error for invalid characters).
+      int64_t bytes_read_in = in_buf - pending_;
+      if (bytes_read_in < n_pending_) {
+        return StatusInvalidInput();
+      }
+
+      int64_t bytes_read_out = out_buf - dest->data();
+      out_bytes_used += bytes_read_out;
+
+      int64_t chars_read_in = n_pending_ + n_src_bytes_in_pending - 
in_bytes_left;
+      in_buf = src->data() + chars_read_in - n_pending_;
+      in_bytes_left = src->size() + n_pending_ - chars_read_in;
+    } else {
+      in_buf = src->data();
+      in_bytes_left = src->size();
+    }
+
+    // in_bytes_left >= 4 assumes a maximum of 4 bytes per character in the
+    // input (the maximum for UTF-8, UTF-16, and UTF-32). If we
+    // have more than this, it means the output buffer wasn't big enough
+    // and the next iteration of the loop will try iconv() again with a
+    // bigger buffer. When we have less than 4 bytes left, the bytes
+    // will be copied to pending_ and processed in the next call (the
+    // TransformInputStream always finishes with a src that is 0 bytes
+    // for this purpose.)
+    while (in_bytes_left >= 4) {
+      // When this is true, we will (almost) always need a new buffer
+      if (out_bytes_left < in_bytes_left) {
+        RETURN_NOT_OK(dest->Resize(dest->size() * 1.2));
+        out_buf = dest->mutable_data() + out_bytes_used;
+        out_bytes_left = dest->size() - out_bytes_used;
+      }
+
+      // iconv() can return an error code ((size_t) -1) but it's not
+      // useful as it can occur because of invalid input, because
+      // of a full output buffer, or because there are partial characters
+      // at the end of the input buffer that were not completely decoded.
+      // We handle each of these cases separately based on the number of bytes
+      // read or written.
+      uint8_t* out_buf_before = out_buf;
+
+      iconv_->iconv(&in_buf, &in_bytes_left, &out_buf, &out_bytes_left);
+
+      int64_t bytes_read_out = out_buf - out_buf_before;
+
+      if (bytes_read_out == 0) {
+        return StatusInvalidInput();
+      }
+
+      out_bytes_used += bytes_read_out;
+    }
+
+    // Keep the leftover characters until the next call to the function
+    n_pending_ = in_bytes_left;
+    if (in_bytes_left > 0) {
+      memcpy(pending_, in_buf, in_bytes_left);
+    }
+
+    // Shrink the output buffer to only the size used
+    RETURN_NOT_OK(dest->Resize(out_bytes_used, false));
+    return std::move(dest);
+  }
+
+ protected:
+  std::string from_;
+  std::shared_ptr<RIconvWrapper> iconv_;
+  uint8_t pending_[8];
+  int64_t n_pending_;
+
+  arrow::Status StatusInvalidInput() {
+    std::stringstream stream;
+    stream << "Encountered invalid input bytes ";
+    stream << "(input encoding was '" << from_ << "')";
+    return arrow::Status::IOError(stream.str());

Review comment:
       Nit: you don't need to use `std::stringstream` explicitly; the Status factory
methods will do it for you, e.g.:
   ```c++
   return Status::IOError(
       "Encountered invalid input bytes ",
       "(input encoding was '", from_, "')");
   ```




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

[GitHub] [arrow] pitrou commented on a change in pull request #12030: ARROW-9186: [R] Allow specifying CSV file encoding

Reply via email to