This is an automated email from the ASF dual-hosted git repository.

kou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new f83454c  ARROW-3788: [Ruby] Add support for CSV parser written in C++
f83454c is described below

commit f83454ca64c5b90598ccd88e004275bcbae39e75
Author: Kouhei Sutou <k...@clear-code.com>
AuthorDate: Fri Nov 16 16:31:37 2018 +0900

    ARROW-3788: [Ruby] Add support for CSV parser written in C++
    
    This is disabled by default because the value conversion feature isn't
    sufficient yet.
    We can enable it by specifying the `:use_threads` option explicitly:
    
    ```ruby
    Arrow::Table.load("xxx.csv", use_threads: true)
    ```
    
    Author: Kouhei Sutou <k...@clear-code.com>
    
    Closes #2961 from kou/ruby-csv and squashes the following commits:
    
    d59646b9 <Kouhei Sutou> Support Ruby < 2.5
    cafe5d1e <Kouhei Sutou> Add support for column type options
    71a99c75 <Kouhei Sutou>  Add support for CSV parser written in C++
---
 ruby/red-arrow/lib/arrow/csv-loader.rb             | 88 +++++++++++++++++++++-
 .../arrow/csv-read-options.rb}                     | 19 ++---
 .../lib/arrow/{csv-reader.rb => data-type.rb}      | 47 ++++--------
 ruby/red-arrow/lib/arrow/loader.rb                 |  3 +-
 ruby/red-arrow/test/test-csv-loader.rb             | 27 +++++++
 ruby/red-arrow/test/test-table.rb                  |  5 +-
 6 files changed, 140 insertions(+), 49 deletions(-)

diff --git a/ruby/red-arrow/lib/arrow/csv-loader.rb b/ruby/red-arrow/lib/arrow/csv-loader.rb
index f3ad6ce..3aa85bf 100644
--- a/ruby/red-arrow/lib/arrow/csv-loader.rb
+++ b/ruby/red-arrow/lib/arrow/csv-loader.rb
@@ -60,11 +60,72 @@ module Arrow
     end
 
     def read_csv(csv)
-      reader = CSVReader.new(csv)
-      reader.read
+      values_set = []
+      csv.each do |row|
+        if row.is_a?(CSV::Row)
+          row = row.collect(&:last)
+        end
+        row.each_with_index do |value, i|
+          values = (values_set[i] ||= [])
+          values << value
+        end
+      end
+      return nil if values_set.empty?
+
+      arrays = values_set.collect.with_index do |values, i|
+        ArrayBuilder.build(values)
+      end
+      if csv.headers
+        names = csv.headers
+      else
+        names = arrays.size.times.collect(&:to_s)
+      end
+      raw_table = {}
+      names.each_with_index do |name, i|
+        raw_table[name] = arrays[i]
+      end
+      Table.new(raw_table)
+    end
+
+    def reader_options
+      options = CSVReadOptions.new
+      @options.each do |key, value|
+        case key
+        when :headers
+          if value
+            options.n_header_rows = 1
+          else
+            options.n_header_rows = 0
+          end
+        when :column_types
+          value.each do |name, type|
+            options.add_column_type(name, type)
+          end
+        when :schema
+          options.add_schema(value)
+        else
+          setter = "#{key}="
+          if options.respond_to?(setter)
+            options.__send__(setter, value)
+          else
+            return nil
+          end
+        end
+      end
+      options
     end
 
     def load_from_path(path)
+      options = reader_options
+      if options
+        begin
+          MemoryMappedInputStream.open(path.to_s) do |input|
+            return CSVReader.new(input, options).read
+          end
+        rescue Arrow::Error::Invalid
+        end
+      end
+
       options = update_csv_parse_options(@options, :open_csv, path)
       open_csv(path, **options) do |csv|
         read_csv(csv)
@@ -72,6 +133,16 @@ module Arrow
     end
 
     def load_data(data)
+      options = reader_options
+      if options
+        begin
+          BufferInputStream.open(Buffer.new(data)) do |input|
+            return CSVReader.new(input, options).read
+          end
+        rescue Arrow::Error::Invalid
+        end
+      end
+
       options = update_csv_parse_options(@options, :parse_csv_data, data)
       parse_csv_data(data, **options) do |csv|
         read_csv(csv)
@@ -119,6 +190,11 @@ module Arrow
       end
     end
 
+    AVAILABLE_CSV_PARSE_OPTIONS = {}
+    CSV.instance_method(:initialize).parameters.each do |type, name|
+      AVAILABLE_CSV_PARSE_OPTIONS[name] = true if type == :key
+    end
+
     def update_csv_parse_options(options, create_csv, *args)
       if options.key?(:converters)
         new_options = options.dup
@@ -127,6 +203,14 @@ module Arrow
         new_options = options.merge(converters: converters)
       end
 
+      # TODO: Support :schema and :column_types
+
+      unless AVAILABLE_CSV_PARSE_OPTIONS.empty?
+        new_options.select! do |key, value|
+          AVAILABLE_CSV_PARSE_OPTIONS.key?(key)
+        end
+      end
+
       unless options.key?(:headers)
         __send__(create_csv, *args, **new_options) do |csv|
           new_options[:headers] = have_header?(csv)
diff --git a/ruby/red-arrow/test/test-csv-reader.rb b/ruby/red-arrow/lib/arrow/csv-read-options.rb
similarity index 69%
rename from ruby/red-arrow/test/test-csv-reader.rb
rename to ruby/red-arrow/lib/arrow/csv-read-options.rb
index 765a178..ad46d80 100644
--- a/ruby/red-arrow/test/test-csv-reader.rb
+++ b/ruby/red-arrow/lib/arrow/csv-read-options.rb
@@ -15,20 +15,11 @@
 # specific language governing permissions and limitations
 # under the License.
 
-class CSVReaderTest < Test::Unit::TestCase
-  include Helper::Fixture
-
-  test("#read") do
-    CSV.open(fixture_path("with-header.csv").to_s,
-             headers: true,
-             skip_lines: /^#/) do |csv|
-      reader = Arrow::CSVReader.new(csv)
-      assert_equal(<<-TABLE, reader.read.to_s)
-       name    score
-0      alice   10   
-1      bob     29   
-2      chris   -1   
-      TABLE
+module Arrow
+  class CSVReadOptions
+    alias_method :add_column_type_raw, :add_column_type
+    def add_column_type(name, type)
+      add_column_type_raw(name, DataType.resolve(type))
     end
   end
 end
diff --git a/ruby/red-arrow/lib/arrow/csv-reader.rb b/ruby/red-arrow/lib/arrow/data-type.rb
similarity index 53%
rename from ruby/red-arrow/lib/arrow/csv-reader.rb
rename to ruby/red-arrow/lib/arrow/data-type.rb
index 4a596f0..dad74fb 100644
--- a/ruby/red-arrow/lib/arrow/csv-reader.rb
+++ b/ruby/red-arrow/lib/arrow/data-type.rb
@@ -15,40 +15,25 @@
 # specific language governing permissions and limitations
 # under the License.
 
-require "csv"
-
 module Arrow
-  class CSVReader
-    def initialize(csv)
-      @csv = csv
-    end
-
-    def read
-      values_set = []
-      @csv.each do |row|
-        if row.is_a?(CSV::Row)
-          row = row.collect(&:last)
-        end
-        row.each_with_index do |value, i|
-          values = (values_set[i] ||= [])
-          values << value
+  class DataType
+    class << self
+      def resolve(data_type) # Returns a DataType for an instance or a type name.
+        case data_type
+        when DataType
+          data_type
+        when String, Symbol
+          data_type_name = data_type.to_s.capitalize.gsub(/\AUint/, "UInt") # "uint16" -> "UInt16"
+          data_type_class_name = "#{data_type_name}DataType"
+          unless Arrow.const_defined?(data_type_class_name)
+            raise ArgumentError, "invalid data type: #{data_type.inspect}"
+          end
+          data_type_class = Arrow.const_get(data_type_class_name)
+          data_type_class.new
+        else
+          raise ArgumentError, "invalid data type: #{data_type.inspect}"
         end
       end
-      return nil if values_set.empty?
-
-      arrays = values_set.collect.with_index do |values, i|
-        ArrayBuilder.build(values)
-      end
-      if @csv.headers
-        names = @csv.headers
-      else
-        names = arrays.size.times.collect(&:to_s)
-      end
-      raw_table = {}
-      names.each_with_index do |name, i|
-        raw_table[name] = arrays[i]
-      end
-      Table.new(raw_table)
     end
   end
 end
diff --git a/ruby/red-arrow/lib/arrow/loader.rb b/ruby/red-arrow/lib/arrow/loader.rb
index 83555d1..e147113 100644
--- a/ruby/red-arrow/lib/arrow/loader.rb
+++ b/ruby/red-arrow/lib/arrow/loader.rb
@@ -36,7 +36,8 @@ module Arrow
       require "arrow/chunked-array"
       require "arrow/column"
       require "arrow/csv-loader"
-      require "arrow/csv-reader"
+      require "arrow/csv-read-options"
+      require "arrow/data-type"
       require "arrow/date32-array"
       require "arrow/date32-array-builder"
       require "arrow/date64-array"
diff --git a/ruby/red-arrow/test/test-csv-loader.rb b/ruby/red-arrow/test/test-csv-loader.rb
index 26ea497..b5b8278 100644
--- a/ruby/red-arrow/test/test-csv-loader.rb
+++ b/ruby/red-arrow/test/test-csv-loader.rb
@@ -115,4 +115,31 @@ class CSVLoaderTest < Test::Unit::TestCase
                    load_csv(path)[:score].to_a)
     end
   end
+
+  sub_test_case("CSVReader") do
+    def load_csv(data, options)
+      Arrow::CSVLoader.load(data, options)
+    end
+
+    test(":column_types") do
+      assert_equal(Arrow::Table.new(:count => Arrow::UInt16Array.new([1, 2, 4])),
+                   load_csv(<<-CSV, column_types: {count: :uint16}))
+count
+1
+2
+4
+                   CSV
+    end
+
+    test(":schema") do
+      table = Arrow::Table.new(:count => Arrow::UInt16Array.new([1, 2, 4]))
+      assert_equal(table,
+                   load_csv(<<-CSV, schema: table.schema))
+count
+1
+2
+4
+                   CSV
+    end
+  end
 end
diff --git a/ruby/red-arrow/test/test-table.rb b/ruby/red-arrow/test/test-table.rb
index 3fe6316..3eaaf63 100644
--- a/ruby/red-arrow/test/test-table.rb
+++ b/ruby/red-arrow/test/test-table.rb
@@ -417,7 +417,10 @@ class TableTest < Test::Unit::TestCase
       test(":csv") do
         file = Tempfile.new(["red-arrow", ".csv"])
         @table.save(file.path, :format => :csv)
-        assert_equal(@table, Arrow::Table.load(file.path, :format => :csv))
+        assert_equal(@table,
+                     Arrow::Table.load(file.path,
+                                       :format => :csv,
+                                       :schema => @table.schema))
       end
 
       sub_test_case("load: auto detect") do

Reply via email to