pitrou commented on a change in pull request #12030:
URL: https://github.com/apache/arrow/pull/12030#discussion_r777674074
##########
File path: r/src/io.cpp
##########
@@ -178,4 +180,134 @@ void io___BufferOutputStream__Write(
StopIfNotOk(stream->Write(RAW(bytes), bytes.size()));
}
+// TransformInputStream::TransformFunc wrapper
+
+class RIconvWrapper {
+ public:
+ RIconvWrapper(std::string to, std::string from)
+ : handle_(Riconv_open(to.c_str(), from.c_str())) {
+ if (handle_ == ((void*)-1)) {
+ cpp11::stop("Can't convert encoding from '%s' to '%s'", from.c_str(),
to.c_str());
+ }
+ }
+
+ size_t iconv(const char** inbuf, size_t* inbytesleft, char** outbuf,
+ size_t* outbytesleft) {
+ return Riconv(handle_, inbuf, inbytesleft, outbuf, outbytesleft);
+ }
+
+ ~RIconvWrapper() {
+ if (handle_ != ((void*)-1)) {
+ Riconv_close(handle_);
+ }
+ }
+
+ protected:
+ void* handle_;
+};
+
+struct ReencodeUTF8TransformFunctionWrapper {
+ explicit ReencodeUTF8TransformFunctionWrapper(std::string from)
+ : from_(from), iconv_("UTF-8", from), n_pending_(0) {}
+
+ // This may get copied and we need a fresh RIconvWrapper for each copy.
+ ReencodeUTF8TransformFunctionWrapper(const
ReencodeUTF8TransformFunctionWrapper& ref)
+ : ReencodeUTF8TransformFunctionWrapper(ref.from_) {}
+
+ arrow::Result<std::shared_ptr<arrow::Buffer>> operator()(
+ const std::shared_ptr<arrow::Buffer>& src) {
+ ARROW_ASSIGN_OR_RAISE(auto dest, arrow::AllocateResizableBuffer(32));
+
+ size_t out_bytes_left = dest->size();
+ char* out_buf = (char*)dest->data();
+ size_t out_bytes_used = 0;
+
+ size_t in_bytes_left;
+ const char* in_buf;
+ int64_t n_src_bytes_in_pending = 0;
+
+ // There may be a few leftover bytes from the last call to iconv. Process
these first
+ // using the internal buffer as the source. This may also result in a
partial
+ // character left over but will always get us into the src buffer.
+ if (n_pending_ > 0) {
+ // fill the pending_ buffer with characters and call iconv() once
+ n_src_bytes_in_pending =
+ std::min<int64_t>(sizeof(pending_) - n_pending_, src->size());
+ memcpy(pending_ + n_pending_, src->data(), n_src_bytes_in_pending);
+ in_buf = pending_;
+ in_bytes_left = n_pending_ + n_src_bytes_in_pending;
+
+ iconv_.iconv(&in_buf, &in_bytes_left, &out_buf, &out_bytes_left);
+
+ int64_t chars_read_out = out_buf - ((char*)dest->data());
+ out_bytes_used += chars_read_out;
+
+ int64_t chars_read_in = n_pending_ + n_src_bytes_in_pending -
in_bytes_left;
+ in_buf = (const char*)src->data() + chars_read_in - n_pending_;
+ in_bytes_left = src->size() + n_pending_ - chars_read_in;
+ } else {
+ in_buf = (const char*)src->data();
+ in_bytes_left = src->size();
+ }
+
+ // UTF-8 has a maximum of 4 bytes per character, so it's OK if we have a
few bytes
+ // left after processing all of src. If we have more than this, it means
the
Review comment:
Why would you need a third call to `iconv()`? This could just be part of
the loop.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]