This is an automated email from the ASF dual-hosted git repository. kou pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push: new f83454c ARROW-3788: [Ruby] Add support for CSV parser written in C++ f83454c is described below commit f83454ca64c5b90598ccd88e004275bcbae39e75 Author: Kouhei Sutou <k...@clear-code.com> AuthorDate: Fri Nov 16 16:31:37 2018 +0900 ARROW-3788: [Ruby] Add support for CSV parser written in C++ This is disabled by default because value conversion feature isn't enough for now. We can enable it by specifying `:use_threads` option explicitly: ```ruby Arrow::Table.load("xxx.csv", use_threads: true) ``` Author: Kouhei Sutou <k...@clear-code.com> Closes #2961 from kou/ruby-csv and squashes the following commits: d59646b9 <Kouhei Sutou> Support Ruby < 2.5 cafe5d1e <Kouhei Sutou> Add support for column type options 71a99c75 <Kouhei Sutou> Add support for CSV parser written in C++ --- ruby/red-arrow/lib/arrow/csv-loader.rb | 88 +++++++++++++++++++++- .../arrow/csv-read-options.rb} | 19 ++--- .../lib/arrow/{csv-reader.rb => data-type.rb} | 47 ++++-------- ruby/red-arrow/lib/arrow/loader.rb | 3 +- ruby/red-arrow/test/test-csv-loader.rb | 27 +++++++ ruby/red-arrow/test/test-table.rb | 5 +- 6 files changed, 140 insertions(+), 49 deletions(-) diff --git a/ruby/red-arrow/lib/arrow/csv-loader.rb b/ruby/red-arrow/lib/arrow/csv-loader.rb index f3ad6ce..3aa85bf 100644 --- a/ruby/red-arrow/lib/arrow/csv-loader.rb +++ b/ruby/red-arrow/lib/arrow/csv-loader.rb @@ -60,11 +60,72 @@ module Arrow end def read_csv(csv) - reader = CSVReader.new(csv) - reader.read + values_set = [] + csv.each do |row| + if row.is_a?(CSV::Row) + row = row.collect(&:last) + end + row.each_with_index do |value, i| + values = (values_set[i] ||= []) + values << value + end + end + return nil if values_set.empty? 
+ + arrays = values_set.collect.with_index do |values, i| + ArrayBuilder.build(values) + end + if csv.headers + names = csv.headers + else + names = arrays.size.times.collect(&:to_s) + end + raw_table = {} + names.each_with_index do |name, i| + raw_table[name] = arrays[i] + end + Table.new(raw_table) + end + + def reader_options + options = CSVReadOptions.new + @options.each do |key, value| + case key + when :headers + if value + options.n_header_rows = 1 + else + options.n_header_rows = 0 + end + when :column_types + value.each do |name, type| + options.add_column_type(name, type) + end + when :schema + options.add_schema(value) + else + setter = "#{key}=" + if options.respond_to?(setter) + options.__send__(setter, value) + else + return nil + end + end + end + options end def load_from_path(path) + options = reader_options + if options + begin + MemoryMappedInputStream.open(path.to_s) do |input| + return CSVReader.new(input, options).read + end + rescue Arrow::Error::Invalid + end + end + options = update_csv_parse_options(@options, :open_csv, path) open_csv(path, **options) do |csv| read_csv(csv) @@ -72,6 +133,16 @@ module Arrow end def load_data(data) + options = reader_options + if options + begin + BufferInputStream.open(Buffer.new(data)) do |input| + return CSVReader.new(input, options).read + end + rescue Arrow::Error::Invalid + end + end + options = update_csv_parse_options(@options, :parse_csv_data, data) parse_csv_data(data, **options) do |csv| read_csv(csv) @@ -119,6 +190,11 @@ module Arrow end end + AVAILABLE_CSV_PARSE_OPTIONS = {} + CSV.instance_method(:initialize).parameters.each do |type, name| + AVAILABLE_CSV_PARSE_OPTIONS[name] = true if type == :key + end + def update_csv_parse_options(options, create_csv, *args) if options.key?(:converters) new_options = options.dup @@ -127,6 +203,14 @@ module Arrow new_options = options.merge(converters: converters) end + # TODO: Support :schema and :column_types + + unless AVAILABLE_CSV_PARSE_OPTIONS.empty? 
+ new_options.select! do |key, value| + AVAILABLE_CSV_PARSE_OPTIONS.key?(key) + end + end + unless options.key?(:headers) __send__(create_csv, *args, **new_options) do |csv| new_options[:headers] = have_header?(csv) diff --git a/ruby/red-arrow/test/test-csv-reader.rb b/ruby/red-arrow/lib/arrow/csv-read-options.rb similarity index 69% rename from ruby/red-arrow/test/test-csv-reader.rb rename to ruby/red-arrow/lib/arrow/csv-read-options.rb index 765a178..ad46d80 100644 --- a/ruby/red-arrow/test/test-csv-reader.rb +++ b/ruby/red-arrow/lib/arrow/csv-read-options.rb @@ -15,20 +15,11 @@ # specific language governing permissions and limitations # under the License. -class CSVReaderTest < Test::Unit::TestCase - include Helper::Fixture - - test("#read") do - CSV.open(fixture_path("with-header.csv").to_s, - headers: true, - skip_lines: /^#/) do |csv| - reader = Arrow::CSVReader.new(csv) - assert_equal(<<-TABLE, reader.read.to_s) - name score -0 alice 10 -1 bob 29 -2 chris -1 - TABLE +module Arrow + class CSVReadOptions + alias_method :add_column_type_raw, :add_column_type + def add_column_type(name, type) + add_column_type_raw(name, DataType.resolve(type)) end end end diff --git a/ruby/red-arrow/lib/arrow/csv-reader.rb b/ruby/red-arrow/lib/arrow/data-type.rb similarity index 53% rename from ruby/red-arrow/lib/arrow/csv-reader.rb rename to ruby/red-arrow/lib/arrow/data-type.rb index 4a596f0..dad74fb 100644 --- a/ruby/red-arrow/lib/arrow/csv-reader.rb +++ b/ruby/red-arrow/lib/arrow/data-type.rb @@ -15,40 +15,25 @@ # specific language governing permissions and limitations # under the License. 
-require "csv" - module Arrow - class CSVReader - def initialize(csv) - @csv = csv - end - - def read - values_set = [] - @csv.each do |row| - if row.is_a?(CSV::Row) - row = row.collect(&:last) - end - row.each_with_index do |value, i| - values = (values_set[i] ||= []) - values << value + class DataType + class << self + def resolve(data_type) + case data_type + when DataType + data_type + when String, Symbol + data_type_name = data_type.to_s.capitalize.gsub(/\AUint/, "UInt") + data_type_class_name = "#{data_type_name}DataType" + unless Arrow.const_defined?(data_type_class_name) + raise ArgumentError, "invalid data type: #{data_type.inspect}" + end + data_type_class = Arrow.const_get(data_type_class_name) + data_type_class.new + else + raise ArgumentError, "invalid data type: #{data_type.inspect}" end end - return nil if values_set.empty? - - arrays = values_set.collect.with_index do |values, i| - ArrayBuilder.build(values) - end - if @csv.headers - names = @csv.headers - else - names = arrays.size.times.collect(&:to_s) - end - raw_table = {} - names.each_with_index do |name, i| - raw_table[name] = arrays[i] - end - Table.new(raw_table) end end end diff --git a/ruby/red-arrow/lib/arrow/loader.rb b/ruby/red-arrow/lib/arrow/loader.rb index 83555d1..e147113 100644 --- a/ruby/red-arrow/lib/arrow/loader.rb +++ b/ruby/red-arrow/lib/arrow/loader.rb @@ -36,7 +36,8 @@ module Arrow require "arrow/chunked-array" require "arrow/column" require "arrow/csv-loader" - require "arrow/csv-reader" + require "arrow/csv-read-options" + require "arrow/data-type" require "arrow/date32-array" require "arrow/date32-array-builder" require "arrow/date64-array" diff --git a/ruby/red-arrow/test/test-csv-loader.rb b/ruby/red-arrow/test/test-csv-loader.rb index 26ea497..b5b8278 100644 --- a/ruby/red-arrow/test/test-csv-loader.rb +++ b/ruby/red-arrow/test/test-csv-loader.rb @@ -115,4 +115,31 @@ class CSVLoaderTest < Test::Unit::TestCase load_csv(path)[:score].to_a) end end + +
sub_test_case("CSVReader") do + def load_csv(data, options) + Arrow::CSVLoader.load(data, options) + end + + test(":column_types") do + assert_equal(Arrow::Table.new(:count => Arrow::UInt16Array.new([1, 2, 4])), + load_csv(<<-CSV, column_types: {count: :uint16})) +count +1 +2 +4 + CSV + end + + test(":schema") do + table = Arrow::Table.new(:count => Arrow::UInt16Array.new([1, 2, 4])) + assert_equal(table, + load_csv(<<-CSV, schema: table.schema)) +count +1 +2 +4 + CSV + end + end end diff --git a/ruby/red-arrow/test/test-table.rb b/ruby/red-arrow/test/test-table.rb index 3fe6316..3eaaf63 100644 --- a/ruby/red-arrow/test/test-table.rb +++ b/ruby/red-arrow/test/test-table.rb @@ -417,7 +417,10 @@ class TableTest < Test::Unit::TestCase test(":csv") do file = Tempfile.new(["red-arrow", ".csv"]) @table.save(file.path, :format => :csv) - assert_equal(@table, Arrow::Table.load(file.path, :format => :csv)) + assert_equal(@table, + Arrow::Table.load(file.path, + :format => :csv, + :schema => @table.schema)) end sub_test_case("load: auto detect") do