This is an automated email from the ASF dual-hosted git repository.

shiro pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 5b87c55  ARROW-4981: [Ruby] Add support for CSV data encoding conversion
5b87c55 is described below

commit 5b87c553d6c1d13280af48b919f0269f3ad686a4
Author: Kouhei Sutou <[email protected]>
AuthorDate: Sat Mar 23 09:39:21 2019 +0900

    ARROW-4981: [Ruby] Add support for CSV data encoding conversion
    
    Author: Kouhei Sutou <[email protected]>
    
    Closes #3998 from kou/ruby-csv-encoding and squashes the following commits:
    
    6f372ea2 <Kouhei Sutou>  Add support for CSV data encoding conversion
---
 ruby/red-arrow/lib/arrow.rb            |  2 +-
 ruby/red-arrow/lib/arrow/csv-loader.rb | 34 ++++++++++++++++++++++++++++------
 ruby/red-arrow/red-arrow.gemspec       |  2 +-
 ruby/red-arrow/test/test-csv-loader.rb | 30 ++++++++++++++++++++++++++++++
 4 files changed, 60 insertions(+), 8 deletions(-)

diff --git a/ruby/red-arrow/lib/arrow.rb b/ruby/red-arrow/lib/arrow.rb
index 95dabee..257bd28 100644
--- a/ruby/red-arrow/lib/arrow.rb
+++ b/ruby/red-arrow/lib/arrow.rb
@@ -15,7 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 
-require "gobject-introspection"
+require "gio2"
 
 require "arrow/version"
 
diff --git a/ruby/red-arrow/lib/arrow/csv-loader.rb b/ruby/red-arrow/lib/arrow/csv-loader.rb
index bb1f419..1839590 100644
--- a/ruby/red-arrow/lib/arrow/csv-loader.rb
+++ b/ruby/red-arrow/lib/arrow/csv-loader.rb
@@ -104,6 +104,8 @@ module Arrow
           end
         when :schema
           options.add_schema(value)
+        when :encoding
+          # process encoding on opening input
         else
           setter = "#{key}="
           if options.respond_to?(setter)
@@ -116,7 +118,7 @@ module Arrow
       options
     end
 
-    def open_input(raw_input)
+    def open_decompress_input(raw_input)
       if @compression
         codec = Codec.new(@compression)
         CompressedInputStream.open(codec, raw_input) do |input|
@@ -127,16 +129,36 @@ module Arrow
       end
     end
 
+    def open_encoding_convert_stream(raw_input, &block)
+      encoding = @options[:encoding]
+      if encoding
+        converter = Gio::CharsetConverter.new("UTF-8", encoding)
+        convert_input_stream =
+          Gio::ConverterInputStream.new(raw_input, converter)
+        GIOInputStream.open(convert_input_stream, &block)
+      else
+        yield(raw_input)
+      end
+    end
+
+    def wrap_input(raw_input)
+      open_decompress_input(raw_input) do |input_|
+        open_encoding_convert_stream(input_) do |input__|
+          yield(input__)
+        end
+      end
+    end
+
     def load_from_path(path)
       options = reader_options
       if options
         begin
-          MemoryMappedInputStream.open(path.to_s) do |raw_input|
-            open_input(raw_input) do |input|
+          MemoryMappedInputStream.open(path) do |raw_input|
+            wrap_input(raw_input) do |input|
               return CSVReader.new(input, options).read
             end
           end
-        rescue Arrow::Error::Invalid
+        rescue Arrow::Error::Invalid, Gio::Error
         end
       end
 
@@ -151,11 +173,11 @@ module Arrow
       if options
         begin
           BufferInputStream.open(Buffer.new(data)) do |raw_input|
-            open_input(raw_input) do |input|
+            wrap_input(raw_input) do |input|
               return CSVReader.new(input, options).read
             end
           end
-        rescue Arrow::Error::Invalid
+        rescue Arrow::Error::Invalid, Gio::Error
         end
       end
 
diff --git a/ruby/red-arrow/red-arrow.gemspec b/ruby/red-arrow/red-arrow.gemspec
index 7c6320e..317bcf2 100644
--- a/ruby/red-arrow/red-arrow.gemspec
+++ b/ruby/red-arrow/red-arrow.gemspec
@@ -47,7 +47,7 @@ Gem::Specification.new do |spec|
   spec.extensions = ["ext/arrow/extconf.rb"]
 
   spec.add_runtime_dependency("extpp")
-  spec.add_runtime_dependency("gobject-introspection", ">= 3.3.5")
+  spec.add_runtime_dependency("gio2", ">= 3.3.6")
   spec.add_runtime_dependency("native-package-installer")
   spec.add_runtime_dependency("pkg-config")
 
diff --git a/ruby/red-arrow/test/test-csv-loader.rb b/ruby/red-arrow/test/test-csv-loader.rb
index b5b8278..96de9c8 100644
--- a/ruby/red-arrow/test/test-csv-loader.rb
+++ b/ruby/red-arrow/test/test-csv-loader.rb
@@ -141,5 +141,35 @@ count
 4
                    CSV
     end
+
+    test(":encoding") do
+      messages = [
+        "\u3042", # U+3042 HIRAGANA LETTER A
+        "\u3044", # U+3044 HIRAGANA LETTER I
+        "\u3046", # U+3046 HIRAGANA LETTER U
+      ]
+      table = Arrow::Table.new(:message => Arrow::StringArray.new(messages))
+      encoding = "cp932"
+      assert_equal(table,
+                   load_csv((["message"] + messages).join("\n").encode(encoding),
+                            schema: table.schema,
+                            encoding: encoding))
+    end
+
+    test(":encoding and :compression") do
+      messages = [
+        "\u3042", # U+3042 HIRAGANA LETTER A
+        "\u3044", # U+3044 HIRAGANA LETTER I
+        "\u3046", # U+3046 HIRAGANA LETTER U
+      ]
+      table = Arrow::Table.new(:message => Arrow::StringArray.new(messages))
+      encoding = "cp932"
+      csv = (["message"] + messages).join("\n").encode(encoding)
+      assert_equal(table,
+                   load_csv(Zlib::Deflate.deflate(csv),
+                            schema: table.schema,
+                            encoding: encoding,
+                            compression: :gzip))
+    end
   end
 end

Reply via email to