This is an automated email from the ASF dual-hosted git repository.
kou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 7d3f7b3f8b GH-40573: [GLib][Ruby][CSV] Add support for customizing
timestamp parsers (#40590)
7d3f7b3f8b is described below
commit 7d3f7b3f8b22cd7fbd3a69d6dcab7716dee79202
Author: Sutou Kouhei <[email protected]>
AuthorDate: Mon Mar 18 15:22:46 2024 +0900
GH-40573: [GLib][Ruby][CSV] Add support for customizing timestamp parsers
(#40590)
### Rationale for this change
ISO8601 timestamp values in CSV can be parsed by default but non-ISO8601
timestamp values can't.
### What changes are included in this PR?
* Add `garrow_csv_read_options_set_timestamp_parsers()`
* Add `garrow_csv_read_options_get_timestamp_parsers()`
* Add `garrow_csv_read_options_add_timestamp_parser()`
* Add `Arrow::TimestampParser.try_convert` for implicit cast
### Are these changes tested?
Yes.
### Are there any user-facing changes?
Yes.
* GitHub Issue: #40573
Authored-by: Sutou Kouhei <[email protected]>
Signed-off-by: Sutou Kouhei <[email protected]>
---
c_glib/arrow-glib/reader.cpp | 83 +++++++++++++++++++++++++++-
c_glib/arrow-glib/reader.h | 18 ++++--
c_glib/test/test-csv-reader.rb | 15 +++++
ruby/red-arrow/lib/arrow/loader.rb | 1 +
ruby/red-arrow/lib/arrow/timestamp-parser.rb | 33 +++++++++++
ruby/red-arrow/test/test-csv-loader.rb | 37 +++++++++++++
6 files changed, 181 insertions(+), 6 deletions(-)
diff --git a/c_glib/arrow-glib/reader.cpp b/c_glib/arrow-glib/reader.cpp
index 0b388a4ba3..8a1c3722d4 100644
--- a/c_glib/arrow-glib/reader.cpp
+++ b/c_glib/arrow-glib/reader.cpp
@@ -29,6 +29,7 @@
#include <arrow-glib/record-batch.hpp>
#include <arrow-glib/schema.hpp>
#include <arrow-glib/table.hpp>
+#include <arrow-glib/timestamp-parser.hpp>
#include <arrow/c/bridge.h>
@@ -872,12 +873,13 @@
garrow_feather_file_reader_read_names(GArrowFeatherFileReader *reader,
}
}
-typedef struct GArrowCSVReadOptionsPrivate_
+struct GArrowCSVReadOptionsPrivate
{
arrow::csv::ReadOptions read_options;
arrow::csv::ParseOptions parse_options;
arrow::csv::ConvertOptions convert_options;
-} GArrowCSVReadOptionsPrivate;
+ GList *timestamp_parsers;
+};
enum {
PROP_USE_THREADS = 1,
@@ -902,6 +904,17 @@ G_DEFINE_TYPE_WITH_PRIVATE(GArrowCSVReadOptions,
garrow_csv_read_options, G_TYPE
static_cast<GArrowCSVReadOptionsPrivate *>(
\
garrow_csv_read_options_get_instance_private(GARROW_CSV_READ_OPTIONS(object)))
+// GObject dispose handler for GArrowCSVReadOptions: releases the references
+// held on the registered timestamp parsers, then chains up to the parent
+// class.
+static void
+garrow_csv_read_options_dispose(GObject *object)
+{
+  auto options_priv = GARROW_CSV_READ_OPTIONS_GET_PRIVATE(object);
+
+  // Unref every parser in the list and reset the pointer so that the list
+  // is seen as empty afterwards.
+  g_list_free_full(options_priv->timestamp_parsers, g_object_unref);
+  options_priv->timestamp_parsers = nullptr;
+
+  auto parent_class = G_OBJECT_CLASS(garrow_csv_read_options_parent_class);
+  parent_class->dispose(object);
+}
+
static void
garrow_csv_read_options_set_property(GObject *object,
guint prop_id,
@@ -1032,6 +1045,7 @@
garrow_csv_read_options_class_init(GArrowCSVReadOptionsClass *klass)
auto gobject_class = G_OBJECT_CLASS(klass);
+ gobject_class->dispose = garrow_csv_read_options_dispose;
gobject_class->set_property = garrow_csv_read_options_set_property;
gobject_class->get_property = garrow_csv_read_options_get_property;
@@ -1623,6 +1637,71 @@
garrow_csv_read_options_add_column_name(GArrowCSVReadOptions *options,
priv->read_options.column_names.push_back(column_name);
}
+/**
+ * garrow_csv_read_options_set_timestamp_parsers:
+ * @options: A #GArrowCSVReadOptions.
+ * @parsers: (element-type GArrowTimestampParser): The list of
+ *   #GArrowTimestampParser to be set. %NULL elements are skipped.
+ *
+ * Replaces the timestamp parsers of @options with @parsers. A reference
+ * is taken on each parser that is kept; the references on the previously
+ * set parsers are released.
+ *
+ * Since: 16.0.0
+ */
+void
+garrow_csv_read_options_set_timestamp_parsers(GArrowCSVReadOptions *options,
+                                              GList *parsers)
+{
+  auto priv = GARROW_CSV_READ_OPTIONS_GET_PRIVATE(options);
+  // Build the replacement list before releasing the current one. Freeing
+  // first would leave priv->timestamp_parsers dangling while new nodes are
+  // prepended to it, and would also break when @parsers is the very list
+  // currently stored in @options.
+  GList *new_parsers = nullptr;
+  priv->convert_options.timestamp_parsers.clear();
+  for (auto node = parsers; node; node = g_list_next(node)) {
+    if (!node->data) {
+      continue;
+    }
+    auto parser = GARROW_TIMESTAMP_PARSER(node->data);
+    g_object_ref(parser);
+    // Prepend while iterating and reverse once at the end to keep the
+    // caller's order without walking the list on every insertion.
+    new_parsers = g_list_prepend(new_parsers, parser);
+    priv->convert_options.timestamp_parsers.push_back(
+      garrow_timestamp_parser_get_raw(parser));
+  }
+  g_list_free_full(priv->timestamp_parsers, g_object_unref);
+  priv->timestamp_parsers = g_list_reverse(new_parsers);
+}
+
+/**
+ * garrow_csv_read_options_get_timestamp_parsers:
+ * @options: A #GArrowCSVReadOptions.
+ *
+ * Returns: (element-type GArrowTimestampParser) (transfer none):
+ *
+ *   The list of #GArrowTimestampParser held by @options.
+ *
+ * Since: 16.0.0
+ */
+GList *
+garrow_csv_read_options_get_timestamp_parsers(GArrowCSVReadOptions *options)
+{
+  // The stored list itself is returned, not a copy.
+  return GARROW_CSV_READ_OPTIONS_GET_PRIVATE(options)->timestamp_parsers;
+}
+
+/**
+ * garrow_csv_read_options_add_timestamp_parser:
+ * @options: A #GArrowCSVReadOptions.
+ * @parser: The #GArrowTimestampParser to be appended. Nothing is done
+ *   when this is %NULL.
+ *
+ * Since: 16.0.0
+ */
+void
+garrow_csv_read_options_add_timestamp_parser(GArrowCSVReadOptions *options,
+                                             GArrowTimestampParser *parser)
+{
+  auto priv = GARROW_CSV_READ_OPTIONS_GET_PRIVATE(options);
+  if (!parser) {
+    return;
+  }
+
+  // Take a reference for the GObject list, then mirror the parser into the
+  // C++ convert options.
+  g_object_ref(parser);
+  priv->timestamp_parsers = g_list_append(priv->timestamp_parsers, parser);
+  auto raw_parser = garrow_timestamp_parser_get_raw(parser);
+  priv->convert_options.timestamp_parsers.push_back(raw_parser);
+}
+
typedef struct GArrowCSVReaderPrivate_
{
std::shared_ptr<arrow::csv::TableReader> reader;
diff --git a/c_glib/arrow-glib/reader.h b/c_glib/arrow-glib/reader.h
index 08faf86cd0..96e4c5bbb5 100644
--- a/c_glib/arrow-glib/reader.h
+++ b/c_glib/arrow-glib/reader.h
@@ -19,13 +19,12 @@
#pragma once
+#include <arrow-glib/input-stream.h>
+#include <arrow-glib/metadata-version.h>
#include <arrow-glib/record-batch.h>
#include <arrow-glib/schema.h>
#include <arrow-glib/table.h>
-
-#include <arrow-glib/input-stream.h>
-
-#include <arrow-glib/metadata-version.h>
+#include <arrow-glib/timestamp-parser.h>
G_BEGIN_DECLS
@@ -239,6 +238,17 @@ GARROW_AVAILABLE_IN_0_15
void
garrow_csv_read_options_add_column_name(GArrowCSVReadOptions *options,
const gchar *column_name);
+GARROW_AVAILABLE_IN_16_0
+void
+garrow_csv_read_options_set_timestamp_parsers(GArrowCSVReadOptions *options,
+ GList *parsers);
+GARROW_AVAILABLE_IN_16_0
+GList *
+garrow_csv_read_options_get_timestamp_parsers(GArrowCSVReadOptions *options);
+GARROW_AVAILABLE_IN_16_0
+void
+garrow_csv_read_options_add_timestamp_parser(GArrowCSVReadOptions *options,
+ GArrowTimestampParser *parser);
#define GARROW_TYPE_CSV_READER (garrow_csv_reader_get_type())
G_DECLARE_DERIVABLE_TYPE(GArrowCSVReader, garrow_csv_reader, GARROW,
CSV_READER, GObject)
diff --git a/c_glib/test/test-csv-reader.rb b/c_glib/test/test-csv-reader.rb
index 018f062ac3..cc102553b1 100644
--- a/c_glib/test/test-csv-reader.rb
+++ b/c_glib/test/test-csv-reader.rb
@@ -236,6 +236,21 @@ message1,message2
assert_equal(build_table(columns),
table.read)
end
+
+ def test_timestamp_parsers
+ options = Arrow::CSVReadOptions.new
+ assert_equal([], options.timestamp_parsers)
+
+ iso8601_timestamp_parser = Arrow::ISO8601TimestampParser.new
+ options.timestamp_parsers = [iso8601_timestamp_parser]
+ assert_equal([iso8601_timestamp_parser],
+ options.timestamp_parsers)
+
+ date_timestamp_parser = Arrow::StrptimeTimestampParser.new("%Y-%m-%d")
+ options.add_timestamp_parser(date_timestamp_parser)
+ assert_equal([iso8601_timestamp_parser, date_timestamp_parser],
+ options.timestamp_parsers)
+ end
end
end
end
diff --git a/ruby/red-arrow/lib/arrow/loader.rb
b/ruby/red-arrow/lib/arrow/loader.rb
index 9d1432bbfb..bd0d039308 100644
--- a/ruby/red-arrow/lib/arrow/loader.rb
+++ b/ruby/red-arrow/lib/arrow/loader.rb
@@ -138,6 +138,7 @@ module Arrow
require "arrow/timestamp-array"
require "arrow/timestamp-array-builder"
require "arrow/timestamp-data-type"
+ require "arrow/timestamp-parser"
require "arrow/union-array-builder"
require "arrow/writable"
end
diff --git a/ruby/red-arrow/lib/arrow/timestamp-parser.rb
b/ruby/red-arrow/lib/arrow/timestamp-parser.rb
new file mode 100644
index 0000000000..d50ac5846e
--- /dev/null
+++ b/ruby/red-arrow/lib/arrow/timestamp-parser.rb
@@ -0,0 +1,33 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ class TimestampParser
+ class << self
+ def try_convert(value)
+ case value
+ when :iso8601
+ ISO8601TimestampParser.new
+ when String
+ StrptimeTimestampParser.new(value)
+ else
+ nil
+ end
+ end
+ end
+ end
+end
diff --git a/ruby/red-arrow/test/test-csv-loader.rb
b/ruby/red-arrow/test/test-csv-loader.rb
index 7f7f23498d..72bae2fcab 100644
--- a/ruby/red-arrow/test/test-csv-loader.rb
+++ b/ruby/red-arrow/test/test-csv-loader.rb
@@ -246,5 +246,42 @@ count
encoding: encoding,
compression: :gzip))
end
+
+ sub_test_case(":timestamp_parsers") do
+ test(":iso8601") do
+ data_type = Arrow::TimestampDataType.new(:second,
+ GLib::TimeZone.new("UTC"))
+ timestamps = [
+ Time.iso8601("2024-03-16T23:54:12Z"),
+ Time.iso8601("2024-03-16T23:54:13Z"),
+ Time.iso8601("2024-03-16T23:54:14Z"),
+ ]
+ values = Arrow::TimestampArray.new(data_type, timestamps)
+ assert_equal(Arrow::Table.new(value: values),
+ load_csv(<<-CSV, headers: true, timestamp_parsers:
[:iso8601]))
+value
+#{timestamps[0].iso8601}
+#{timestamps[1].iso8601}
+#{timestamps[2].iso8601}
+ CSV
+ end
+
+ test("String") do
+ timestamps = [
+ Time.iso8601("2024-03-16T23:54:12Z"),
+ Time.iso8601("2024-03-16T23:54:13Z"),
+ Time.iso8601("2024-03-16T23:54:14Z"),
+ ]
+ values = Arrow::TimestampArray.new(:second, timestamps)
+ format = "%Y-%m-%dT%H:%M:%S"
+ assert_equal(Arrow::Table.new(value: values).schema,
+ load_csv(<<-CSV, headers: true, timestamp_parsers:
[format]).schema)
+value
+#{timestamps[0].iso8601.chomp("Z")}
+#{timestamps[1].iso8601.chomp("Z")}
+#{timestamps[2].iso8601.chomp("Z")}
+ CSV
+ end
+ end
end
end