paleolimbot commented on a change in pull request #12030:
URL: https://github.com/apache/arrow/pull/12030#discussion_r777654597



##########
File path: r/src/io.cpp
##########
@@ -178,4 +180,134 @@ void io___BufferOutputStream__Write(
   StopIfNotOk(stream->Write(RAW(bytes), bytes.size()));
 }
 
+// TransformInputStream::TransformFunc wrapper
+
+class RIconvWrapper {
+ public:
+  RIconvWrapper(std::string to, std::string from)
+      : handle_(Riconv_open(to.c_str(), from.c_str())) {
+    if (handle_ == ((void*)-1)) {
+      cpp11::stop("Can't convert encoding from '%s' to '%s'", from.c_str(), 
to.c_str());
+    }
+  }
+
+  size_t iconv(const char** inbuf, size_t* inbytesleft, char** outbuf,
+               size_t* outbytesleft) {
+    return Riconv(handle_, inbuf, inbytesleft, outbuf, outbytesleft);
+  }
+
+  ~RIconvWrapper() {
+    if (handle_ != ((void*)-1)) {
+      Riconv_close(handle_);
+    }
+  }
+
+ protected:
+  void* handle_;
+};
+
+struct ReencodeUTF8TransformFunctionWrapper {
+  explicit ReencodeUTF8TransformFunctionWrapper(std::string from)
+      : from_(from), iconv_("UTF-8", from), n_pending_(0) {}
+
+  // This may get copied and we need a fresh RIconvWrapper for each copy.
+  ReencodeUTF8TransformFunctionWrapper(const 
ReencodeUTF8TransformFunctionWrapper& ref)
+      : ReencodeUTF8TransformFunctionWrapper(ref.from_) {}
+
+  arrow::Result<std::shared_ptr<arrow::Buffer>> operator()(
+      const std::shared_ptr<arrow::Buffer>& src) {
+    ARROW_ASSIGN_OR_RAISE(auto dest, arrow::AllocateResizableBuffer(32));
+
+    size_t out_bytes_left = dest->size();
+    char* out_buf = (char*)dest->data();
+    size_t out_bytes_used = 0;
+
+    size_t in_bytes_left;
+    const char* in_buf;
+    int64_t n_src_bytes_in_pending = 0;
+
+    // There may be a few leftover bytes from the last call to iconv. Process 
these first
+    // using the internal buffer as the source. This may also result in a 
partial
+    // character left over but will always get us into the src buffer.
+    if (n_pending_ > 0) {
+      // fill the pending_ buffer with characters and call iconv() once
+      n_src_bytes_in_pending =
+          std::min<int64_t>(sizeof(pending_) - n_pending_, src->size());
+      memcpy(pending_ + n_pending_, src->data(), n_src_bytes_in_pending);
+      in_buf = pending_;
+      in_bytes_left = n_pending_ + n_src_bytes_in_pending;
+
+      iconv_.iconv(&in_buf, &in_bytes_left, &out_buf, &out_bytes_left);
+
+      int64_t chars_read_out = out_buf - ((char*)dest->data());
+      out_bytes_used += chars_read_out;
+
+      int64_t chars_read_in = n_pending_ + n_src_bytes_in_pending - 
in_bytes_left;
+      in_buf = (const char*)src->data() + chars_read_in - n_pending_;
+      in_bytes_left = src->size() + n_pending_ - chars_read_in;
+    } else {
+      in_buf = (const char*)src->data();
+      in_bytes_left = src->size();
+    }
+
+    // UTF-8 has a maximum of 4 bytes per character, so it's OK if we have a 
few bytes
+    // left after processing all of src. If we have more than this, it means 
the

Review comment:
       I'm trying to avoid too many calls to `iconv()` because every time it 
happens I have to check the number of bytes output and/or return value. The 
`TransformInputStream` always ends with an empty buffer as input to the 
transform function to flush internal buffers (according to the comments in the 
source), which is how `pending_` is used here. I think this is better than 
adding a third call to `iconv()` at the end (or perhaps there's a better option 
than all the ones I'm listing here). I've updated the comment to clarify my 
thinking here.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to