This is an automated email from the ASF dual-hosted git repository.

kou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 7d3f7b3f8b GH-40573: [GLib][Ruby][CSV] Add support for customizing 
timestamp parsers (#40590)
7d3f7b3f8b is described below

commit 7d3f7b3f8b22cd7fbd3a69d6dcab7716dee79202
Author: Sutou Kouhei <[email protected]>
AuthorDate: Mon Mar 18 15:22:46 2024 +0900

    GH-40573: [GLib][Ruby][CSV] Add support for customizing timestamp parsers 
(#40590)
    
    ### Rationale for this change
    
    ISO8601 timestamp values in CSV can be parsed by default but non-ISO8601 
timestamp values can't.
    
    ### What changes are included in this PR?
    
    * Add `garrow_csv_read_options_set_timestamp_parsers()`
    * Add `garrow_csv_read_options_get_timestamp_parsers()`
    * Add `garrow_csv_read_options_add_timestamp_parser()`
    * Add `Arrow::TimestampParser.try_convert` for implicit cast
    
    ### Are these changes tested?
    
    Yes.
    
    ### Are there any user-facing changes?
    
    Yes.
    * GitHub Issue: #40573
    
    Authored-by: Sutou Kouhei <[email protected]>
    Signed-off-by: Sutou Kouhei <[email protected]>
---
 c_glib/arrow-glib/reader.cpp                 | 83 +++++++++++++++++++++++++++-
 c_glib/arrow-glib/reader.h                   | 18 ++++--
 c_glib/test/test-csv-reader.rb               | 15 +++++
 ruby/red-arrow/lib/arrow/loader.rb           |  1 +
 ruby/red-arrow/lib/arrow/timestamp-parser.rb | 33 +++++++++++
 ruby/red-arrow/test/test-csv-loader.rb       | 37 +++++++++++++
 6 files changed, 181 insertions(+), 6 deletions(-)

diff --git a/c_glib/arrow-glib/reader.cpp b/c_glib/arrow-glib/reader.cpp
index 0b388a4ba3..8a1c3722d4 100644
--- a/c_glib/arrow-glib/reader.cpp
+++ b/c_glib/arrow-glib/reader.cpp
@@ -29,6 +29,7 @@
 #include <arrow-glib/record-batch.hpp>
 #include <arrow-glib/schema.hpp>
 #include <arrow-glib/table.hpp>
+#include <arrow-glib/timestamp-parser.hpp>
 
 #include <arrow/c/bridge.h>
 
@@ -872,12 +873,13 @@ 
garrow_feather_file_reader_read_names(GArrowFeatherFileReader *reader,
   }
 }
 
-typedef struct GArrowCSVReadOptionsPrivate_
+struct GArrowCSVReadOptionsPrivate
 {
   arrow::csv::ReadOptions read_options;
   arrow::csv::ParseOptions parse_options;
   arrow::csv::ConvertOptions convert_options;
-} GArrowCSVReadOptionsPrivate;
+  // List of GArrowTimestampParser objects registered on these options.
+  // Each element holds its own reference (taken with g_object_ref() when
+  // the parser is registered and dropped with g_object_unref() in
+  // dispose).
+  GList *timestamp_parsers;
+};
 
 enum {
   PROP_USE_THREADS = 1,
@@ -902,6 +904,17 @@ G_DEFINE_TYPE_WITH_PRIVATE(GArrowCSVReadOptions, 
garrow_csv_read_options, G_TYPE
   static_cast<GArrowCSVReadOptionsPrivate *>(                                  
          \
     
garrow_csv_read_options_get_instance_private(GARROW_CSV_READ_OPTIONS(object)))
 
+static void
+garrow_csv_read_options_dispose(GObject *object)
+{
+  // Drop the reference held by every list element and free the list
+  // nodes, then reset the pointer so that a later call sees an empty
+  // list.
+  auto options_priv = GARROW_CSV_READ_OPTIONS_GET_PRIVATE(object);
+  g_list_free_full(options_priv->timestamp_parsers, g_object_unref);
+  options_priv->timestamp_parsers = nullptr;
+
+  // Chain up to the parent class.
+  auto parent_class = G_OBJECT_CLASS(garrow_csv_read_options_parent_class);
+  parent_class->dispose(object);
+}
+
 static void
 garrow_csv_read_options_set_property(GObject *object,
                                      guint prop_id,
@@ -1032,6 +1045,7 @@ 
garrow_csv_read_options_class_init(GArrowCSVReadOptionsClass *klass)
 
   auto gobject_class = G_OBJECT_CLASS(klass);
 
+  gobject_class->dispose = garrow_csv_read_options_dispose;
   gobject_class->set_property = garrow_csv_read_options_set_property;
   gobject_class->get_property = garrow_csv_read_options_get_property;
 
@@ -1623,6 +1637,71 @@ 
garrow_csv_read_options_add_column_name(GArrowCSVReadOptions *options,
   priv->read_options.column_names.push_back(column_name);
 }
 
+/**
+ * garrow_csv_read_options_set_timestamp_parsers:
+ * @options: A #GArrowCSVReadOptions.
+ * @parsers: (element-type GArrowTimestampParser): The list of
+ *   #GArrowTimestampParser to be set.
+ *
+ * Replaces all timestamp parsers of @options with @parsers. %NULL
+ * elements in @parsers are ignored.
+ *
+ * Since: 16.0.0
+ */
+void
+garrow_csv_read_options_set_timestamp_parsers(GArrowCSVReadOptions *options,
+                                              GList *parsers)
+{
+  auto priv = GARROW_CSV_READ_OPTIONS_GET_PRIVATE(options);
+  // Detach the current list instead of freeing it right away. Freeing
+  // it first would leave priv->timestamp_parsers dangling and the
+  // g_list_prepend()/g_list_reverse() calls below would then operate
+  // on freed nodes.
+  auto old_parsers = priv->timestamp_parsers;
+  priv->timestamp_parsers = nullptr;
+  priv->convert_options.timestamp_parsers.clear();
+  for (auto node = parsers; node; node = g_list_next(node)) {
+    if (!node->data) {
+      continue;
+    }
+    auto parser = GARROW_TIMESTAMP_PARSER(node->data);
+    g_object_ref(parser);
+    // Prepend in the loop and reverse once afterwards to keep the
+    // order of @parsers.
+    priv->timestamp_parsers = g_list_prepend(priv->timestamp_parsers, parser);
+    priv->convert_options.timestamp_parsers.push_back(
+      garrow_timestamp_parser_get_raw(parser));
+  }
+  priv->timestamp_parsers = g_list_reverse(priv->timestamp_parsers);
+  // Release the previous list last: @parsers may be the very list
+  // returned by garrow_csv_read_options_get_timestamp_parsers()
+  // (transfer none), so it must stay valid while it is iterated and
+  // its elements must be referenced again before they are unreferenced.
+  g_list_free_full(old_parsers, g_object_unref);
+}
+
+/**
+ * garrow_csv_read_options_get_timestamp_parsers:
+ * @options: A #GArrowCSVReadOptions.
+ *
+ * Returns: (element-type GArrowTimestampParser) (transfer none):
+ *
+ *   The list of #GArrowTimestampParsers to be used.
+ *
+ * Since: 16.0.0
+ */
+GList *
+garrow_csv_read_options_get_timestamp_parsers(GArrowCSVReadOptions *options)
+{
+  // Hand back the list stored in the private data itself; no copy is
+  // made.
+  return GARROW_CSV_READ_OPTIONS_GET_PRIVATE(options)->timestamp_parsers;
+}
+
+/**
+ * garrow_csv_read_options_add_timestamp_parser:
+ * @options: A #GArrowCSVReadOptions.
+ * @parser: The #GArrowTimestampParser to be added.
+ *
+ * Since: 16.0.0
+ */
+void
+garrow_csv_read_options_add_timestamp_parser(GArrowCSVReadOptions *options,
+                                             GArrowTimestampParser *parser)
+{
+  auto priv = GARROW_CSV_READ_OPTIONS_GET_PRIVATE(options);
+  // A null parser is ignored.
+  if (!parser) {
+    return;
+  }
+
+  // Take a reference for the list element, then register the
+  // underlying C++ parser in the convert options.
+  priv->timestamp_parsers =
+    g_list_append(priv->timestamp_parsers, g_object_ref(parser));
+  auto raw_parser = garrow_timestamp_parser_get_raw(parser);
+  priv->convert_options.timestamp_parsers.push_back(raw_parser);
+}
+
 typedef struct GArrowCSVReaderPrivate_
 {
   std::shared_ptr<arrow::csv::TableReader> reader;
diff --git a/c_glib/arrow-glib/reader.h b/c_glib/arrow-glib/reader.h
index 08faf86cd0..96e4c5bbb5 100644
--- a/c_glib/arrow-glib/reader.h
+++ b/c_glib/arrow-glib/reader.h
@@ -19,13 +19,12 @@
 
 #pragma once
 
+#include <arrow-glib/input-stream.h>
+#include <arrow-glib/metadata-version.h>
 #include <arrow-glib/record-batch.h>
 #include <arrow-glib/schema.h>
 #include <arrow-glib/table.h>
-
-#include <arrow-glib/input-stream.h>
-
-#include <arrow-glib/metadata-version.h>
+#include <arrow-glib/timestamp-parser.h>
 
 G_BEGIN_DECLS
 
@@ -239,6 +238,17 @@ GARROW_AVAILABLE_IN_0_15
 void
 garrow_csv_read_options_add_column_name(GArrowCSVReadOptions *options,
                                         const gchar *column_name);
+GARROW_AVAILABLE_IN_16_0
+void
+garrow_csv_read_options_set_timestamp_parsers(GArrowCSVReadOptions *options,
+                                              GList *parsers);
+GARROW_AVAILABLE_IN_16_0
+GList *
+garrow_csv_read_options_get_timestamp_parsers(GArrowCSVReadOptions *options);
+GARROW_AVAILABLE_IN_16_0
+void
+garrow_csv_read_options_add_timestamp_parser(GArrowCSVReadOptions *options,
+                                             GArrowTimestampParser *parser);
 
 #define GARROW_TYPE_CSV_READER (garrow_csv_reader_get_type())
 G_DECLARE_DERIVABLE_TYPE(GArrowCSVReader, garrow_csv_reader, GARROW, 
CSV_READER, GObject)
diff --git a/c_glib/test/test-csv-reader.rb b/c_glib/test/test-csv-reader.rb
index 018f062ac3..cc102553b1 100644
--- a/c_glib/test/test-csv-reader.rb
+++ b/c_glib/test/test-csv-reader.rb
@@ -236,6 +236,21 @@ message1,message2
         assert_equal(build_table(columns),
                      table.read)
       end
+
+      def test_timestamp_parsers
+        options = Arrow::CSVReadOptions.new
+        assert_equal([], options.timestamp_parsers)
+
+        iso8601_timestamp_parser = Arrow::ISO8601TimestampParser.new
+        options.timestamp_parsers = [iso8601_timestamp_parser]
+        assert_equal([iso8601_timestamp_parser],
+                     options.timestamp_parsers)
+
+        date_timestamp_parser = Arrow::StrptimeTimestampParser.new("%Y-%m-%d")
+        options.add_timestamp_parser(date_timestamp_parser)
+        assert_equal([iso8601_timestamp_parser, date_timestamp_parser],
+                     options.timestamp_parsers)
+      end
     end
   end
 end
diff --git a/ruby/red-arrow/lib/arrow/loader.rb 
b/ruby/red-arrow/lib/arrow/loader.rb
index 9d1432bbfb..bd0d039308 100644
--- a/ruby/red-arrow/lib/arrow/loader.rb
+++ b/ruby/red-arrow/lib/arrow/loader.rb
@@ -138,6 +138,7 @@ module Arrow
       require "arrow/timestamp-array"
       require "arrow/timestamp-array-builder"
       require "arrow/timestamp-data-type"
+      require "arrow/timestamp-parser"
       require "arrow/union-array-builder"
       require "arrow/writable"
     end
diff --git a/ruby/red-arrow/lib/arrow/timestamp-parser.rb 
b/ruby/red-arrow/lib/arrow/timestamp-parser.rb
new file mode 100644
index 0000000000..d50ac5846e
--- /dev/null
+++ b/ruby/red-arrow/lib/arrow/timestamp-parser.rb
@@ -0,0 +1,33 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+  class TimestampParser
+    class << self
+      def try_convert(value)
+        case value
+        when :iso8601
+          ISO8601TimestampParser.new
+        when String
+          StrptimeTimestampParser.new(value)
+        else
+          nil
+        end
+      end
+    end
+  end
+end
diff --git a/ruby/red-arrow/test/test-csv-loader.rb 
b/ruby/red-arrow/test/test-csv-loader.rb
index 7f7f23498d..72bae2fcab 100644
--- a/ruby/red-arrow/test/test-csv-loader.rb
+++ b/ruby/red-arrow/test/test-csv-loader.rb
@@ -246,5 +246,42 @@ count
                             encoding: encoding,
                             compression: :gzip))
     end
+
+    # Tests for the :timestamp_parsers option passed to load_csv.
+    sub_test_case(":timestamp_parsers") do
+      # :iso8601 is given as the parser. The CSV rows are the ISO 8601
+      # strings of the expected times and the whole loaded table is
+      # compared with a table that has a second-resolution UTC
+      # timestamp column.
+      test(":iso8601") do
+        data_type = Arrow::TimestampDataType.new(:second,
+                                                 GLib::TimeZone.new("UTC"))
+        timestamps = [
+          Time.iso8601("2024-03-16T23:54:12Z"),
+          Time.iso8601("2024-03-16T23:54:13Z"),
+          Time.iso8601("2024-03-16T23:54:14Z"),
+        ]
+        values = Arrow::TimestampArray.new(data_type, timestamps)
+        assert_equal(Arrow::Table.new(value: values),
+                     load_csv(<<-CSV, headers: true, timestamp_parsers: 
[:iso8601]))
+value
+#{timestamps[0].iso8601}
+#{timestamps[1].iso8601}
+#{timestamps[2].iso8601}
+                     CSV
+      end
+
+      # A String is given as the parser. The CSV rows are the ISO 8601
+      # strings with the trailing "Z" removed. Only the schemas are
+      # compared here, not the loaded values.
+      test("String") do
+        timestamps = [
+          Time.iso8601("2024-03-16T23:54:12Z"),
+          Time.iso8601("2024-03-16T23:54:13Z"),
+          Time.iso8601("2024-03-16T23:54:14Z"),
+        ]
+        values = Arrow::TimestampArray.new(:second, timestamps)
+        format = "%Y-%m-%dT%H:%M:%S"
+        assert_equal(Arrow::Table.new(value: values).schema,
+                     load_csv(<<-CSV, headers: true, timestamp_parsers: 
[format]).schema)
+value
+#{timestamps[0].iso8601.chomp("Z")}
+#{timestamps[1].iso8601.chomp("Z")}
+#{timestamps[2].iso8601.chomp("Z")}
+                     CSV
+      end
+    end
   end
 end

Reply via email to