This is an automated email from the ASF dual-hosted git repository.
shiro pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 5b87c55 ARROW-4981: [Ruby] Add support for CSV data encoding
conversion
5b87c55 is described below
commit 5b87c553d6c1d13280af48b919f0269f3ad686a4
Author: Kouhei Sutou <[email protected]>
AuthorDate: Sat Mar 23 09:39:21 2019 +0900
ARROW-4981: [Ruby] Add support for CSV data encoding conversion
Author: Kouhei Sutou <[email protected]>
Closes #3998 from kou/ruby-csv-encoding and squashes the following commits:
6f372ea2 <Kouhei Sutou> Add support for CSV data encoding conversion
---
ruby/red-arrow/lib/arrow.rb | 2 +-
ruby/red-arrow/lib/arrow/csv-loader.rb | 34 ++++++++++++++++++++++++++++------
ruby/red-arrow/red-arrow.gemspec | 2 +-
ruby/red-arrow/test/test-csv-loader.rb | 30 ++++++++++++++++++++++++++++++
4 files changed, 60 insertions(+), 8 deletions(-)
diff --git a/ruby/red-arrow/lib/arrow.rb b/ruby/red-arrow/lib/arrow.rb
index 95dabee..257bd28 100644
--- a/ruby/red-arrow/lib/arrow.rb
+++ b/ruby/red-arrow/lib/arrow.rb
@@ -15,7 +15,7 @@
# specific language governing permissions and limitations
# under the License.
-require "gobject-introspection"
+require "gio2"
require "arrow/version"
diff --git a/ruby/red-arrow/lib/arrow/csv-loader.rb b/ruby/red-arrow/lib/arrow/csv-loader.rb
index bb1f419..1839590 100644
--- a/ruby/red-arrow/lib/arrow/csv-loader.rb
+++ b/ruby/red-arrow/lib/arrow/csv-loader.rb
@@ -104,6 +104,8 @@ module Arrow
end
when :schema
options.add_schema(value)
+ when :encoding
+ # process encoding on opening input
else
setter = "#{key}="
if options.respond_to?(setter)
@@ -116,7 +118,7 @@ module Arrow
options
end
- def open_input(raw_input)
+ def open_decompress_input(raw_input)
if @compression
codec = Codec.new(@compression)
CompressedInputStream.open(codec, raw_input) do |input|
@@ -127,16 +129,36 @@ module Arrow
end
end
+ def open_encoding_convert_stream(raw_input, &block)
+ encoding = @options[:encoding]
+ if encoding
+ converter = Gio::CharsetConverter.new("UTF-8", encoding)
+ convert_input_stream =
+ Gio::ConverterInputStream.new(raw_input, converter)
+ GIOInputStream.open(convert_input_stream, &block)
+ else
+ yield(raw_input)
+ end
+ end
+
+ def wrap_input(raw_input)
+ open_decompress_input(raw_input) do |input_|
+ open_encoding_convert_stream(input_) do |input__|
+ yield(input__)
+ end
+ end
+ end
+
def load_from_path(path)
options = reader_options
if options
begin
- MemoryMappedInputStream.open(path.to_s) do |raw_input|
- open_input(raw_input) do |input|
+ MemoryMappedInputStream.open(path) do |raw_input|
+ wrap_input(raw_input) do |input|
return CSVReader.new(input, options).read
end
end
- rescue Arrow::Error::Invalid
+ rescue Arrow::Error::Invalid, Gio::Error
end
end
@@ -151,11 +173,11 @@ module Arrow
if options
begin
BufferInputStream.open(Buffer.new(data)) do |raw_input|
- open_input(raw_input) do |input|
+ wrap_input(raw_input) do |input|
return CSVReader.new(input, options).read
end
end
- rescue Arrow::Error::Invalid
+ rescue Arrow::Error::Invalid, Gio::Error
end
end
diff --git a/ruby/red-arrow/red-arrow.gemspec b/ruby/red-arrow/red-arrow.gemspec
index 7c6320e..317bcf2 100644
--- a/ruby/red-arrow/red-arrow.gemspec
+++ b/ruby/red-arrow/red-arrow.gemspec
@@ -47,7 +47,7 @@ Gem::Specification.new do |spec|
spec.extensions = ["ext/arrow/extconf.rb"]
spec.add_runtime_dependency("extpp")
- spec.add_runtime_dependency("gobject-introspection", ">= 3.3.5")
+ spec.add_runtime_dependency("gio2", ">= 3.3.6")
spec.add_runtime_dependency("native-package-installer")
spec.add_runtime_dependency("pkg-config")
diff --git a/ruby/red-arrow/test/test-csv-loader.rb b/ruby/red-arrow/test/test-csv-loader.rb
index b5b8278..96de9c8 100644
--- a/ruby/red-arrow/test/test-csv-loader.rb
+++ b/ruby/red-arrow/test/test-csv-loader.rb
@@ -141,5 +141,35 @@ count
4
CSV
end
+
+ test(":encoding") do
+ messages = [
+ "\u3042", # U+3042 HIRAGANA LETTER A
+ "\u3044", # U+3044 HIRAGANA LETTER I
+ "\u3046", # U+3046 HIRAGANA LETTER U
+ ]
+ table = Arrow::Table.new(:message => Arrow::StringArray.new(messages))
+ encoding = "cp932"
+ assert_equal(table,
+ load_csv((["message"] + messages).join("\n").encode(encoding),
+ schema: table.schema,
+ encoding: encoding))
+ end
+
+ test(":encoding and :compression") do
+ messages = [
+ "\u3042", # U+3042 HIRAGANA LETTER A
+ "\u3044", # U+3044 HIRAGANA LETTER I
+ "\u3046", # U+3046 HIRAGANA LETTER U
+ ]
+ table = Arrow::Table.new(:message => Arrow::StringArray.new(messages))
+ encoding = "cp932"
+ csv = (["message"] + messages).join("\n").encode(encoding)
+ assert_equal(table,
+ load_csv(Zlib::Deflate.deflate(csv),
+ schema: table.schema,
+ encoding: encoding,
+ compression: :gzip))
+ end
end
end