kou commented on code in PR #48681: URL: https://github.com/apache/arrow/pull/48681#discussion_r2655096759
########## c_glib/test/test-csv-writer.rb: ########## @@ -0,0 +1,190 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestCSVWriter < Test::Unit::TestCase + include Helper::Buildable + + def test_write_record_batch + message_data = ["Start", "Shutdown"] + count_data = [2, 9] + message_field = Arrow::Field.new("message", Arrow::StringDataType.new) + count_field = Arrow::Field.new("count", Arrow::Int64DataType.new) + schema = Arrow::Schema.new([message_field, count_field]) + + buffer = Arrow::ResizableBuffer.new(0) + output = Arrow::BufferOutputStream.new(buffer) + begin + csv_writer = Arrow::CSVWriter.new(output, schema, nil) + begin + record_batch = Arrow::RecordBatch.new(schema, + message_data.size, + [ + build_string_array(message_data), + build_int64_array(count_data), + ]) + csv_writer.write_record_batch(record_batch) + ensure + csv_writer.close + assert do + csv_writer.closed? 
+ end + end + ensure + output.close + end + + csv_output = buffer.data.to_s + expected = <<~CSV + "message","count" + "Start",2 + "Shutdown",9 + CSV + assert_equal(expected, csv_output) + end + + def test_write_table + message_data = ["Start", "Shutdown", "Reboot"] + count_data = [2, 9, 5] + message_field = Arrow::Field.new("message", Arrow::StringDataType.new) + count_field = Arrow::Field.new("count", Arrow::Int64DataType.new) + schema = Arrow::Schema.new([message_field, count_field]) + + buffer = Arrow::ResizableBuffer.new(0) + output = Arrow::BufferOutputStream.new(buffer) + begin + csv_writer = Arrow::CSVWriter.new(output, schema, nil) Review Comment: ```suggestion csv_writer = Arrow::CSVWriter.new(output, schema) ``` ########## c_glib/test/test-csv-writer.rb: ########## @@ -0,0 +1,190 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +class TestCSVWriter < Test::Unit::TestCase + include Helper::Buildable + + def test_write_record_batch + message_data = ["Start", "Shutdown"] + count_data = [2, 9] + message_field = Arrow::Field.new("message", Arrow::StringDataType.new) + count_field = Arrow::Field.new("count", Arrow::Int64DataType.new) + schema = Arrow::Schema.new([message_field, count_field]) + + buffer = Arrow::ResizableBuffer.new(0) + output = Arrow::BufferOutputStream.new(buffer) + begin + csv_writer = Arrow::CSVWriter.new(output, schema, nil) + begin + record_batch = Arrow::RecordBatch.new(schema, + message_data.size, + [ + build_string_array(message_data), + build_int64_array(count_data), + ]) + csv_writer.write_record_batch(record_batch) + ensure + csv_writer.close + assert do + csv_writer.closed? + end + end + ensure + output.close + end + + csv_output = buffer.data.to_s + expected = <<~CSV + "message","count" + "Start",2 + "Shutdown",9 + CSV + assert_equal(expected, csv_output) + end + + def test_write_table + message_data = ["Start", "Shutdown", "Reboot"] + count_data = [2, 9, 5] + message_field = Arrow::Field.new("message", Arrow::StringDataType.new) + count_field = Arrow::Field.new("count", Arrow::Int64DataType.new) + schema = Arrow::Schema.new([message_field, count_field]) + + buffer = Arrow::ResizableBuffer.new(0) + output = Arrow::BufferOutputStream.new(buffer) + begin + csv_writer = Arrow::CSVWriter.new(output, schema, nil) + begin + table = Arrow::Table.new(schema, + [ + build_string_array(message_data), + build_int64_array(count_data), + ]) + csv_writer.write_table(table) + ensure + csv_writer.close + assert do + csv_writer.closed? 
+ end + end + ensure + output.close + end + + csv_output = buffer.data.to_s + expected = <<~CSV + "message","count" + "Start",2 + "Shutdown",9 + "Reboot",5 + CSV + assert_equal(expected, csv_output) + end + + + sub_test_case("options") do + def setup + @options = Arrow::CSVWriteOptions.new + end + + def test_include_header + assert do + @options.include_header? + end + @options.include_header = false + assert do + not @options.include_header? + end + end + + def test_batch_size + assert_equal(1024, @options.batch_size) + @options.batch_size = 2048 + assert_equal(2048, @options.batch_size) + end + + def test_delimiter + assert_equal(44, @options.delimiter) # 44 is the ASCII code for comma + @options.delimiter = ";".ord + assert_equal(59, @options.delimiter) # 59 is the ASCII code for semicolon + end + + def test_null_string + assert_equal("", @options.null_string) + @options.null_string = "NULL" + assert_equal("NULL", @options.null_string) + end + + def test_eol + assert_equal("\n", @options.eol) + @options.eol = "\r\n" + assert_equal("\r\n", @options.eol) + end + + def test_quoting_style + assert_equal(Arrow::CSVQuotingStyle::NEEDED, @options.quoting_style) + @options.quoting_style = Arrow::CSVQuotingStyle::ALL_VALID + assert_equal(Arrow::CSVQuotingStyle::ALL_VALID, @options.quoting_style) + end + + def test_quoting_header + assert_equal(Arrow::CSVQuotingStyle::NEEDED, @options.quoting_header) + @options.quoting_header = Arrow::CSVQuotingStyle::NONE + assert_equal(Arrow::CSVQuotingStyle::NONE, @options.quoting_header) + end + + def test_write_with_options + message_data = ["Start", nil, "Reboot"] + count_data = [2, 9, 5] + message_field = Arrow::Field.new("message", Arrow::StringDataType.new) + count_field = Arrow::Field.new("count", Arrow::Int64DataType.new) + schema = Arrow::Schema.new([message_field, count_field]) + + options = Arrow::CSVWriteOptions.new + options.include_header = false + options.delimiter = ";".ord + options.quoting_style = 
Arrow::CSVQuotingStyle::NONE + options.null_string = "NULL" Review Comment: We can use ```ruby options = { include_header: false, delimiter: ";", quoting_style: :none, null_string: "NULL", } ``` by defining `Arrow::CSVWriteOptions.try_convert`: ```ruby module Arrow class CSVWriteOptions class << self def try_convert(value) case value when Hash options = new value.each do |k, v| options.public_send("#{k}=", v) end options else nil end end end end end ``` ########## c_glib/arrow-glib/writer.cpp: ########## @@ -300,6 +301,320 @@ garrow_record_batch_file_writer_new(GArrowOutputStream *sink, } } +struct GArrowCSVWriteOptionsPrivate +{ + arrow::csv::WriteOptions write_options; +}; + +enum { + PROP_CSV_WRITE_OPTIONS_INCLUDE_HEADER = 1, + PROP_CSV_WRITE_OPTIONS_BATCH_SIZE, + PROP_CSV_WRITE_OPTIONS_DELIMITER, + PROP_CSV_WRITE_OPTIONS_NULL_STRING, + PROP_CSV_WRITE_OPTIONS_EOL, + PROP_CSV_WRITE_OPTIONS_QUOTING_STYLE, + PROP_CSV_WRITE_OPTIONS_QUOTING_HEADER, +}; + +G_DEFINE_TYPE_WITH_PRIVATE(GArrowCSVWriteOptions, garrow_csv_write_options, G_TYPE_OBJECT) + +#define GARROW_CSV_WRITE_OPTIONS_GET_PRIVATE(object) \ + static_cast<GArrowCSVWriteOptionsPrivate *>( \ + garrow_csv_write_options_get_instance_private(GARROW_CSV_WRITE_OPTIONS(object))) + +static void +garrow_csv_write_options_finalize(GObject *object) +{ + auto priv = GARROW_CSV_WRITE_OPTIONS_GET_PRIVATE(object); + + priv->write_options.~WriteOptions(); + + G_OBJECT_CLASS(garrow_csv_write_options_parent_class)->finalize(object); +} + +static void +garrow_csv_write_options_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GARROW_CSV_WRITE_OPTIONS_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_CSV_WRITE_OPTIONS_INCLUDE_HEADER: + priv->write_options.include_header = g_value_get_boolean(value); + break; + case PROP_CSV_WRITE_OPTIONS_BATCH_SIZE: + priv->write_options.batch_size = g_value_get_int(value); + break; + case PROP_CSV_WRITE_OPTIONS_DELIMITER: + 
priv->write_options.delimiter = g_value_get_schar(value); + break; + case PROP_CSV_WRITE_OPTIONS_NULL_STRING: + priv->write_options.null_string = g_value_get_string(value); + break; + case PROP_CSV_WRITE_OPTIONS_EOL: + priv->write_options.eol = g_value_get_string(value); + break; + case PROP_CSV_WRITE_OPTIONS_QUOTING_STYLE: + priv->write_options.quoting_style = + static_cast<arrow::csv::QuotingStyle>(g_value_get_enum(value)); + break; + case PROP_CSV_WRITE_OPTIONS_QUOTING_HEADER: + priv->write_options.quoting_header = + static_cast<arrow::csv::QuotingStyle>(g_value_get_enum(value)); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +garrow_csv_write_options_get_property(GObject *object, + guint prop_id, + GValue *value, + GParamSpec *pspec) +{ + auto priv = GARROW_CSV_WRITE_OPTIONS_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_CSV_WRITE_OPTIONS_INCLUDE_HEADER: + g_value_set_boolean(value, priv->write_options.include_header); + break; + case PROP_CSV_WRITE_OPTIONS_BATCH_SIZE: + g_value_set_int(value, priv->write_options.batch_size); + break; + case PROP_CSV_WRITE_OPTIONS_DELIMITER: + g_value_set_schar(value, priv->write_options.delimiter); + break; + case PROP_CSV_WRITE_OPTIONS_NULL_STRING: + g_value_set_string(value, priv->write_options.null_string.c_str()); + break; + case PROP_CSV_WRITE_OPTIONS_EOL: + g_value_set_string(value, priv->write_options.eol.c_str()); + break; + case PROP_CSV_WRITE_OPTIONS_QUOTING_STYLE: + g_value_set_enum( + value, + static_cast<GArrowCSVQuotingStyle>(priv->write_options.quoting_style)); + break; + case PROP_CSV_WRITE_OPTIONS_QUOTING_HEADER: + g_value_set_enum( + value, + static_cast<GArrowCSVQuotingStyle>(priv->write_options.quoting_header)); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +garrow_csv_write_options_init(GArrowCSVWriteOptions *object) +{ + auto priv = 
GARROW_CSV_WRITE_OPTIONS_GET_PRIVATE(object); + new (&priv->write_options) arrow::csv::WriteOptions; + priv->write_options = arrow::csv::WriteOptions::Defaults(); +} + +static void +garrow_csv_write_options_class_init(GArrowCSVWriteOptionsClass *klass) +{ + GParamSpec *spec; + + auto gobject_class = G_OBJECT_CLASS(klass); + + gobject_class->finalize = garrow_csv_write_options_finalize; + gobject_class->set_property = garrow_csv_write_options_set_property; + gobject_class->get_property = garrow_csv_write_options_get_property; + + auto write_options = arrow::csv::WriteOptions::Defaults(); + + /** + * GArrowCSVWriteOptions:include-header: + * + * Whether to write an initial header line with column names. + * + * Since: 23.0.0 + */ + spec = g_param_spec_boolean("include-header", + "Include header", + "Whether to write an initial header line with column names", + write_options.include_header, + static_cast<GParamFlags>(G_PARAM_READWRITE)); + g_object_class_install_property(gobject_class, + PROP_CSV_WRITE_OPTIONS_INCLUDE_HEADER, + spec); + + /** + * GArrowCSVWriteOptions:batch-size: + * + * Maximum number of rows processed at a time. + * + * The CSV writer converts and writes data in batches of N rows. This number can impact + * performance. + * + * Since: 23.0.0 + */ + spec = g_param_spec_int("batch-size", + "Batch size", + "Maximum number of rows processed at a time", + 1, + G_MAXINT32, + write_options.batch_size, + static_cast<GParamFlags>(G_PARAM_READWRITE)); + g_object_class_install_property(gobject_class, PROP_CSV_WRITE_OPTIONS_BATCH_SIZE, spec); + + /** + * GArrowCSVWriteOptions:delimiter: + * + * Field delimiter. 
+ * + * Since: 23.0.0 + */ + spec = g_param_spec_char("delimiter", + "Delimiter", + "Field delimiter", + 0, + G_MAXINT8, + write_options.delimiter, + static_cast<GParamFlags>(G_PARAM_READWRITE)); + g_object_class_install_property(gobject_class, PROP_CSV_WRITE_OPTIONS_DELIMITER, spec); + + /** + * GArrowCSVWriteOptions:null-string: + * + * The string to write for null values. Quotes are not allowed in this string. + * + * Since: 23.0.0 + */ + spec = g_param_spec_string("null-string", + "Null string", + "The string to write for null values", + "", Review Comment: ```suggestion write_options.null_string.c_str(), ``` ########## c_glib/test/test-csv-writer.rb: ########## @@ -0,0 +1,190 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +class TestCSVWriter < Test::Unit::TestCase + include Helper::Buildable + + def test_write_record_batch + message_data = ["Start", "Shutdown"] + count_data = [2, 9] + message_field = Arrow::Field.new("message", Arrow::StringDataType.new) + count_field = Arrow::Field.new("count", Arrow::Int64DataType.new) + schema = Arrow::Schema.new([message_field, count_field]) + + buffer = Arrow::ResizableBuffer.new(0) + output = Arrow::BufferOutputStream.new(buffer) + begin + csv_writer = Arrow::CSVWriter.new(output, schema, nil) + begin + record_batch = Arrow::RecordBatch.new(schema, + message_data.size, + [ + build_string_array(message_data), + build_int64_array(count_data), + ]) + csv_writer.write_record_batch(record_batch) + ensure + csv_writer.close + assert do + csv_writer.closed? + end + end + ensure + output.close + end + + csv_output = buffer.data.to_s + expected = <<~CSV + "message","count" + "Start",2 + "Shutdown",9 + CSV + assert_equal(expected, csv_output) + end + + def test_write_table + message_data = ["Start", "Shutdown", "Reboot"] + count_data = [2, 9, 5] + message_field = Arrow::Field.new("message", Arrow::StringDataType.new) + count_field = Arrow::Field.new("count", Arrow::Int64DataType.new) + schema = Arrow::Schema.new([message_field, count_field]) + + buffer = Arrow::ResizableBuffer.new(0) + output = Arrow::BufferOutputStream.new(buffer) + begin + csv_writer = Arrow::CSVWriter.new(output, schema, nil) + begin + table = Arrow::Table.new(schema, + [ + build_string_array(message_data), + build_int64_array(count_data), + ]) + csv_writer.write_table(table) + ensure + csv_writer.close + assert do + csv_writer.closed? 
+ end + end + ensure + output.close + end + + csv_output = buffer.data.to_s + expected = <<~CSV + "message","count" + "Start",2 + "Shutdown",9 + "Reboot",5 + CSV + assert_equal(expected, csv_output) + end + + + sub_test_case("options") do + def setup + @options = Arrow::CSVWriteOptions.new + end + + def test_include_header + assert do + @options.include_header? + end + @options.include_header = false + assert do + not @options.include_header? + end + end + + def test_batch_size + assert_equal(1024, @options.batch_size) + @options.batch_size = 2048 + assert_equal(2048, @options.batch_size) + end + + def test_delimiter + assert_equal(44, @options.delimiter) # 44 is the ASCII code for comma + @options.delimiter = ";".ord Review Comment: We can use `String`, right? ```suggestion @options.delimiter = ";" ``` ########## c_glib/test/test-csv-writer.rb: ########## @@ -0,0 +1,190 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +class TestCSVWriter < Test::Unit::TestCase + include Helper::Buildable + + def test_write_record_batch + message_data = ["Start", "Shutdown"] + count_data = [2, 9] + message_field = Arrow::Field.new("message", Arrow::StringDataType.new) + count_field = Arrow::Field.new("count", Arrow::Int64DataType.new) + schema = Arrow::Schema.new([message_field, count_field]) + + buffer = Arrow::ResizableBuffer.new(0) + output = Arrow::BufferOutputStream.new(buffer) + begin + csv_writer = Arrow::CSVWriter.new(output, schema, nil) Review Comment: ```suggestion csv_writer = Arrow::CSVWriter.new(output, schema) ``` ########## c_glib/test/test-csv-writer.rb: ########## @@ -0,0 +1,190 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +class TestCSVWriter < Test::Unit::TestCase + include Helper::Buildable + + def test_write_record_batch + message_data = ["Start", "Shutdown"] + count_data = [2, 9] + message_field = Arrow::Field.new("message", Arrow::StringDataType.new) + count_field = Arrow::Field.new("count", Arrow::Int64DataType.new) + schema = Arrow::Schema.new([message_field, count_field]) + + buffer = Arrow::ResizableBuffer.new(0) + output = Arrow::BufferOutputStream.new(buffer) + begin + csv_writer = Arrow::CSVWriter.new(output, schema, nil) + begin + record_batch = Arrow::RecordBatch.new(schema, + message_data.size, + [ + build_string_array(message_data), + build_int64_array(count_data), + ]) + csv_writer.write_record_batch(record_batch) + ensure + csv_writer.close + assert do + csv_writer.closed? + end + end + ensure + output.close + end + + csv_output = buffer.data.to_s + expected = <<~CSV + "message","count" + "Start",2 + "Shutdown",9 + CSV + assert_equal(expected, csv_output) + end + + def test_write_table + message_data = ["Start", "Shutdown", "Reboot"] + count_data = [2, 9, 5] + message_field = Arrow::Field.new("message", Arrow::StringDataType.new) + count_field = Arrow::Field.new("count", Arrow::Int64DataType.new) + schema = Arrow::Schema.new([message_field, count_field]) + + buffer = Arrow::ResizableBuffer.new(0) + output = Arrow::BufferOutputStream.new(buffer) + begin + csv_writer = Arrow::CSVWriter.new(output, schema, nil) + begin + table = Arrow::Table.new(schema, + [ + build_string_array(message_data), + build_int64_array(count_data), + ]) + csv_writer.write_table(table) + ensure + csv_writer.close + assert do + csv_writer.closed? 
+ end + end + ensure + output.close + end + + csv_output = buffer.data.to_s + expected = <<~CSV + "message","count" + "Start",2 + "Shutdown",9 + "Reboot",5 + CSV + assert_equal(expected, csv_output) + end + + + sub_test_case("options") do + def setup + @options = Arrow::CSVWriteOptions.new + end + + def test_include_header + assert do + @options.include_header? + end + @options.include_header = false + assert do + not @options.include_header? + end + end + + def test_batch_size + assert_equal(1024, @options.batch_size) + @options.batch_size = 2048 + assert_equal(2048, @options.batch_size) + end + + def test_delimiter + assert_equal(44, @options.delimiter) # 44 is the ASCII code for comma Review Comment: How about using descriptive code instead of a comment? ```suggestion assert_equal(",".ord, @options.delimiter) ``` ########## c_glib/test/test-csv-writer.rb: ########## @@ -0,0 +1,190 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +class TestCSVWriter < Test::Unit::TestCase + include Helper::Buildable + + def test_write_record_batch + message_data = ["Start", "Shutdown"] + count_data = [2, 9] + message_field = Arrow::Field.new("message", Arrow::StringDataType.new) + count_field = Arrow::Field.new("count", Arrow::Int64DataType.new) + schema = Arrow::Schema.new([message_field, count_field]) + + buffer = Arrow::ResizableBuffer.new(0) + output = Arrow::BufferOutputStream.new(buffer) + begin + csv_writer = Arrow::CSVWriter.new(output, schema, nil) + begin + record_batch = Arrow::RecordBatch.new(schema, + message_data.size, + [ + build_string_array(message_data), + build_int64_array(count_data), + ]) + csv_writer.write_record_batch(record_batch) + ensure + csv_writer.close + assert do + csv_writer.closed? + end + end + ensure + output.close + end + + csv_output = buffer.data.to_s + expected = <<~CSV + "message","count" + "Start",2 + "Shutdown",9 + CSV + assert_equal(expected, csv_output) + end + + def test_write_table + message_data = ["Start", "Shutdown", "Reboot"] + count_data = [2, 9, 5] + message_field = Arrow::Field.new("message", Arrow::StringDataType.new) + count_field = Arrow::Field.new("count", Arrow::Int64DataType.new) + schema = Arrow::Schema.new([message_field, count_field]) + + buffer = Arrow::ResizableBuffer.new(0) + output = Arrow::BufferOutputStream.new(buffer) + begin + csv_writer = Arrow::CSVWriter.new(output, schema, nil) + begin + table = Arrow::Table.new(schema, + [ + build_string_array(message_data), + build_int64_array(count_data), + ]) + csv_writer.write_table(table) + ensure + csv_writer.close + assert do + csv_writer.closed? 
+ end + end + ensure + output.close + end + + csv_output = buffer.data.to_s + expected = <<~CSV + "message","count" + "Start",2 + "Shutdown",9 + "Reboot",5 + CSV + assert_equal(expected, csv_output) + end + + + sub_test_case("options") do + def setup + @options = Arrow::CSVWriteOptions.new + end + + def test_include_header + assert do + @options.include_header? + end + @options.include_header = false + assert do + not @options.include_header? + end + end + + def test_batch_size + assert_equal(1024, @options.batch_size) + @options.batch_size = 2048 + assert_equal(2048, @options.batch_size) + end + + def test_delimiter + assert_equal(44, @options.delimiter) # 44 is the ASCII code for comma + @options.delimiter = ";".ord + assert_equal(59, @options.delimiter) # 59 is the ASCII code for semicolon + end + + def test_null_string + assert_equal("", @options.null_string) + @options.null_string = "NULL" + assert_equal("NULL", @options.null_string) + end + + def test_eol + assert_equal("\n", @options.eol) + @options.eol = "\r\n" + assert_equal("\r\n", @options.eol) + end + + def test_quoting_style + assert_equal(Arrow::CSVQuotingStyle::NEEDED, @options.quoting_style) + @options.quoting_style = Arrow::CSVQuotingStyle::ALL_VALID Review Comment: ```suggestion @options.quoting_style = :all_valid ``` ########## c_glib/test/test-csv-writer.rb: ########## @@ -0,0 +1,190 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestCSVWriter < Test::Unit::TestCase + include Helper::Buildable + + def test_write_record_batch + message_data = ["Start", "Shutdown"] + count_data = [2, 9] + message_field = Arrow::Field.new("message", Arrow::StringDataType.new) + count_field = Arrow::Field.new("count", Arrow::Int64DataType.new) + schema = Arrow::Schema.new([message_field, count_field]) + + buffer = Arrow::ResizableBuffer.new(0) + output = Arrow::BufferOutputStream.new(buffer) + begin + csv_writer = Arrow::CSVWriter.new(output, schema, nil) + begin + record_batch = Arrow::RecordBatch.new(schema, + message_data.size, + [ + build_string_array(message_data), + build_int64_array(count_data), + ]) + csv_writer.write_record_batch(record_batch) + ensure + csv_writer.close + assert do + csv_writer.closed? 
+ end + end + ensure + output.close + end + + csv_output = buffer.data.to_s + expected = <<~CSV + "message","count" + "Start",2 + "Shutdown",9 + CSV + assert_equal(expected, csv_output) + end + + def test_write_table + message_data = ["Start", "Shutdown", "Reboot"] + count_data = [2, 9, 5] + message_field = Arrow::Field.new("message", Arrow::StringDataType.new) + count_field = Arrow::Field.new("count", Arrow::Int64DataType.new) + schema = Arrow::Schema.new([message_field, count_field]) + + buffer = Arrow::ResizableBuffer.new(0) + output = Arrow::BufferOutputStream.new(buffer) + begin + csv_writer = Arrow::CSVWriter.new(output, schema, nil) + begin + table = Arrow::Table.new(schema, + [ + build_string_array(message_data), + build_int64_array(count_data), + ]) + csv_writer.write_table(table) + ensure + csv_writer.close + assert do + csv_writer.closed? + end + end + ensure + output.close + end + + csv_output = buffer.data.to_s + expected = <<~CSV + "message","count" + "Start",2 + "Shutdown",9 + "Reboot",5 + CSV + assert_equal(expected, csv_output) + end + + + sub_test_case("options") do + def setup + @options = Arrow::CSVWriteOptions.new + end + + def test_include_header + assert do + @options.include_header? + end + @options.include_header = false + assert do + not @options.include_header? 
+ end + end + + def test_batch_size + assert_equal(1024, @options.batch_size) + @options.batch_size = 2048 + assert_equal(2048, @options.batch_size) + end + + def test_delimiter + assert_equal(44, @options.delimiter) # 44 is the ASCII code for comma + @options.delimiter = ";".ord + assert_equal(59, @options.delimiter) # 59 is the ASCII code for semicolon + end + + def test_null_string + assert_equal("", @options.null_string) + @options.null_string = "NULL" + assert_equal("NULL", @options.null_string) + end + + def test_eol + assert_equal("\n", @options.eol) + @options.eol = "\r\n" + assert_equal("\r\n", @options.eol) + end + + def test_quoting_style + assert_equal(Arrow::CSVQuotingStyle::NEEDED, @options.quoting_style) + @options.quoting_style = Arrow::CSVQuotingStyle::ALL_VALID + assert_equal(Arrow::CSVQuotingStyle::ALL_VALID, @options.quoting_style) + end + + def test_quoting_header + assert_equal(Arrow::CSVQuotingStyle::NEEDED, @options.quoting_header) + @options.quoting_header = Arrow::CSVQuotingStyle::NONE Review Comment: ```suggestion @options.quoting_header = :none ``` ########## c_glib/arrow-glib/writer.cpp: ########## @@ -300,6 +301,320 @@ garrow_record_batch_file_writer_new(GArrowOutputStream *sink, } } +struct GArrowCSVWriteOptionsPrivate +{ + arrow::csv::WriteOptions write_options; +}; + +enum { + PROP_CSV_WRITE_OPTIONS_INCLUDE_HEADER = 1, + PROP_CSV_WRITE_OPTIONS_BATCH_SIZE, + PROP_CSV_WRITE_OPTIONS_DELIMITER, + PROP_CSV_WRITE_OPTIONS_NULL_STRING, + PROP_CSV_WRITE_OPTIONS_EOL, + PROP_CSV_WRITE_OPTIONS_QUOTING_STYLE, + PROP_CSV_WRITE_OPTIONS_QUOTING_HEADER, +}; + +G_DEFINE_TYPE_WITH_PRIVATE(GArrowCSVWriteOptions, garrow_csv_write_options, G_TYPE_OBJECT) + +#define GARROW_CSV_WRITE_OPTIONS_GET_PRIVATE(object) \ + static_cast<GArrowCSVWriteOptionsPrivate *>( \ + garrow_csv_write_options_get_instance_private(GARROW_CSV_WRITE_OPTIONS(object))) + +static void +garrow_csv_write_options_finalize(GObject *object) +{ + auto priv = 
GARROW_CSV_WRITE_OPTIONS_GET_PRIVATE(object); + + priv->write_options.~WriteOptions(); + + G_OBJECT_CLASS(garrow_csv_write_options_parent_class)->finalize(object); +} + +static void +garrow_csv_write_options_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GARROW_CSV_WRITE_OPTIONS_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_CSV_WRITE_OPTIONS_INCLUDE_HEADER: + priv->write_options.include_header = g_value_get_boolean(value); + break; + case PROP_CSV_WRITE_OPTIONS_BATCH_SIZE: + priv->write_options.batch_size = g_value_get_int(value); + break; + case PROP_CSV_WRITE_OPTIONS_DELIMITER: + priv->write_options.delimiter = g_value_get_schar(value); + break; + case PROP_CSV_WRITE_OPTIONS_NULL_STRING: + priv->write_options.null_string = g_value_get_string(value); + break; + case PROP_CSV_WRITE_OPTIONS_EOL: + priv->write_options.eol = g_value_get_string(value); + break; + case PROP_CSV_WRITE_OPTIONS_QUOTING_STYLE: + priv->write_options.quoting_style = + static_cast<arrow::csv::QuotingStyle>(g_value_get_enum(value)); + break; + case PROP_CSV_WRITE_OPTIONS_QUOTING_HEADER: + priv->write_options.quoting_header = + static_cast<arrow::csv::QuotingStyle>(g_value_get_enum(value)); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +garrow_csv_write_options_get_property(GObject *object, + guint prop_id, + GValue *value, + GParamSpec *pspec) +{ + auto priv = GARROW_CSV_WRITE_OPTIONS_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_CSV_WRITE_OPTIONS_INCLUDE_HEADER: + g_value_set_boolean(value, priv->write_options.include_header); + break; + case PROP_CSV_WRITE_OPTIONS_BATCH_SIZE: + g_value_set_int(value, priv->write_options.batch_size); + break; + case PROP_CSV_WRITE_OPTIONS_DELIMITER: + g_value_set_schar(value, priv->write_options.delimiter); + break; + case PROP_CSV_WRITE_OPTIONS_NULL_STRING: + g_value_set_string(value, 
priv->write_options.null_string.c_str()); + break; + case PROP_CSV_WRITE_OPTIONS_EOL: + g_value_set_string(value, priv->write_options.eol.c_str()); + break; + case PROP_CSV_WRITE_OPTIONS_QUOTING_STYLE: + g_value_set_enum( + value, + static_cast<GArrowCSVQuotingStyle>(priv->write_options.quoting_style)); + break; + case PROP_CSV_WRITE_OPTIONS_QUOTING_HEADER: + g_value_set_enum( + value, + static_cast<GArrowCSVQuotingStyle>(priv->write_options.quoting_header)); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +garrow_csv_write_options_init(GArrowCSVWriteOptions *object) +{ + auto priv = GARROW_CSV_WRITE_OPTIONS_GET_PRIVATE(object); + new (&priv->write_options) arrow::csv::WriteOptions; + priv->write_options = arrow::csv::WriteOptions::Defaults(); +} + +static void +garrow_csv_write_options_class_init(GArrowCSVWriteOptionsClass *klass) +{ + GParamSpec *spec; + + auto gobject_class = G_OBJECT_CLASS(klass); + + gobject_class->finalize = garrow_csv_write_options_finalize; + gobject_class->set_property = garrow_csv_write_options_set_property; + gobject_class->get_property = garrow_csv_write_options_get_property; + + auto write_options = arrow::csv::WriteOptions::Defaults(); + + /** + * GArrowCSVWriteOptions:include-header: + * + * Whether to write an initial header line with column names. + * + * Since: 23.0.0 + */ + spec = g_param_spec_boolean("include-header", + "Include header", + "Whether to write an initial header line with column names", + write_options.include_header, + static_cast<GParamFlags>(G_PARAM_READWRITE)); + g_object_class_install_property(gobject_class, + PROP_CSV_WRITE_OPTIONS_INCLUDE_HEADER, + spec); + + /** + * GArrowCSVWriteOptions:batch-size: + * + * Maximum number of rows processed at a time. + * + * The CSV writer converts and writes data in batches of N rows. This number can impact + * performance. 
+ * + * Since: 23.0.0 + */ + spec = g_param_spec_int("batch-size", + "Batch size", + "Maximum number of rows processed at a time", + 1, + G_MAXINT32, + write_options.batch_size, + static_cast<GParamFlags>(G_PARAM_READWRITE)); + g_object_class_install_property(gobject_class, PROP_CSV_WRITE_OPTIONS_BATCH_SIZE, spec); + + /** + * GArrowCSVWriteOptions:delimiter: + * + * Field delimiter. + * + * Since: 23.0.0 + */ + spec = g_param_spec_char("delimiter", + "Delimiter", + "Field delimiter", + 0, + G_MAXINT8, + write_options.delimiter, + static_cast<GParamFlags>(G_PARAM_READWRITE)); + g_object_class_install_property(gobject_class, PROP_CSV_WRITE_OPTIONS_DELIMITER, spec); + + /** + * GArrowCSVWriteOptions:null-string: + * + * The string to write for null values. Quotes are not allowed in this string. + * + * Since: 23.0.0 + */ + spec = g_param_spec_string("null-string", + "Null string", + "The string to write for null values", + "", + static_cast<GParamFlags>(G_PARAM_READWRITE)); + g_object_class_install_property(gobject_class, + PROP_CSV_WRITE_OPTIONS_NULL_STRING, + spec); + + /** + * GArrowCSVWriteOptions:eol: + * + * The end of line character to use for ending rows. + * + * Since: 23.0.0 + */ + spec = g_param_spec_string("eol", + "EOL", + "The end of line character to use for ending rows", + "\n", Review Comment: ```suggestion write_options.eol.c_str(), ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
