lidavidm commented on code in PR #43482:
URL: https://github.com/apache/arrow/pull/43482#discussion_r1701060717


##########
java/dataset/src/main/cpp/jni_wrapper.cc:
##########
@@ -368,28 +368,105 @@ std::shared_ptr<arrow::Buffer> 
LoadArrowBufferFromByteBuffer(JNIEnv* env, jobjec
 
 inline bool ParseBool(const std::string& value) { return value == "true" ? 
true : false; }
 
+inline bool ParseChar(const std::string& value) {
+  if (value.size() != 1) {
+    JniThrow("Csv convert option " + value + " should be a char");

Review Comment:
   ```suggestion
       JniThrow("CSV convert option " + value + " should be a char");
   ```



##########
java/dataset/src/main/cpp/jni_wrapper.cc:
##########
@@ -368,28 +368,105 @@ std::shared_ptr<arrow::Buffer> 
LoadArrowBufferFromByteBuffer(JNIEnv* env, jobjec
 
 inline bool ParseBool(const std::string& value) { return value == "true" ? 
true : false; }
 
+inline bool ParseChar(const std::string& value) {
+  if (value.size() != 1) {
+    JniThrow("Csv convert option " + value + " should be a char");
+  }
+  return value.at(0);
+}
+
 /// \brief Construct FragmentScanOptions from config map
 #ifdef ARROW_CSV
+
+bool SetCsvConvertOptions(arrow::csv::ConvertOptions& options, const 
std::string& key,
+                          const std::string& value) {
+  if (key == "column_types") {
+    int64_t schema_address = std::stol(value);
+    ArrowSchema* c_schema = reinterpret_cast<ArrowSchema*>(schema_address);
+    auto schema = JniGetOrThrow(arrow::ImportSchema(c_schema));
+    auto& column_types = options.column_types;
+    for (auto field : schema->fields()) {
+      column_types[field->name()] = field->type();
+    }
+  } else if (key == "strings_can_be_null") {
+    options.strings_can_be_null = ParseBool(value);
+  } else if (key == "check_utf8") {
+    options.check_utf8 = ParseBool(value);
+  } else if (key == "null_values") {
+    options.null_values = {value};
+  } else if (key == "true_values") {
+    options.true_values = {value};
+  } else if (key == "false_values") {
+    options.false_values = {value};
+  } else if (key == "quoted_strings_can_be_null") {
+    options.quoted_strings_can_be_null = ParseBool(value);
+  } else if (key == "auto_dict_encode") {
+    options.auto_dict_encode = ParseBool(value);
+  } else if (key == "auto_dict_max_cardinality") {
+    options.auto_dict_max_cardinality = std::stoi(value);
+  } else if (key == "decimal_point") {
+    options.decimal_point = ParseChar(value);
+  } else if (key == "include_missing_columns") {
+    options.include_missing_columns = ParseBool(value);
+  } else {
+    return false;
+  }
+  return true;
+}
+
+bool SetCsvParseOptions(arrow::csv::ParseOptions& options, const std::string& 
key,
+                        const std::string& value) {
+  if (key == "delimiter") {
+    options.delimiter = ParseChar(value);
+  } else if (key == "quoting") {
+    options.quoting = ParseBool(value);
+  } else if (key == "quote_char") {
+    options.quote_char = ParseChar(value);
+  } else if (key == "double_quote") {
+    options.double_quote = ParseBool(value);
+  } else if (key == "escaping") {
+    options.escaping = ParseBool(value);
+  } else if (key == "escape_char") {
+    options.escape_char = ParseChar(value);
+  } else if (key == "newlines_in_values") {
+    options.newlines_in_values = ParseBool(value);
+  } else if (key == "ignore_empty_lines") {
+    options.ignore_empty_lines = ParseBool(value);
+  } else {
+    return false;
+  }
+  return true;
+}
+
+bool SetCsvReadOptions(arrow::csv::ReadOptions& options, const std::string& 
key,
+                       const std::string& value) {
+  if (key == "use_threads") {
+    options.use_threads = ParseBool(value);
+  } else if (key == "block_size") {
+    options.block_size = std::stoi(value);
+  } else if (key == "skip_rows") {
+    options.skip_rows = std::stoi(value);
+  } else if (key == "skip_rows_after_names") {
+    options.skip_rows_after_names = std::stoi(value);
+  } else if (key == "autogenerate_column_names") {
+    options.autogenerate_column_names = ParseBool(value);
+  } else {
+    return false;
+  }
+  return true;
+}
+
 arrow::Result<std::shared_ptr<arrow::dataset::FragmentScanOptions>>
 ToCsvFragmentScanOptions(const std::unordered_map<std::string, std::string>& 
configs) {
   std::shared_ptr<arrow::dataset::CsvFragmentScanOptions> options =
       std::make_shared<arrow::dataset::CsvFragmentScanOptions>();
-  for (auto const& [key, value] : configs) {
-    if (key == "delimiter") {
-      options->parse_options.delimiter = value.data()[0];
-    } else if (key == "quoting") {
-      options->parse_options.quoting = ParseBool(value);
-    } else if (key == "column_types") {
-      int64_t schema_address = std::stol(value);
-      ArrowSchema* c_schema = reinterpret_cast<ArrowSchema*>(schema_address);
-      ARROW_ASSIGN_OR_RAISE(auto schema, arrow::ImportSchema(c_schema));
-      auto& column_types = options->convert_options.column_types;
-      for (auto field : schema->fields()) {
-        column_types[field->name()] = field->type();
-      }
-    } else if (key == "strings_can_be_null") {
-      options->convert_options.strings_can_be_null = ParseBool(value);
-    } else {
+  for (const auto& it : configs) {
+    const auto& key = it.first;
+    const auto& value = it.second;
+    bool setValid = SetCsvParseOptions(options->parse_options, key, value) ||

Review Comment:
   Since we already use JniThrow, we can just use that consistently instead of 
mixing error handling routines here.



##########
java/dataset/src/main/java/org/apache/arrow/dataset/scanner/csv/CsvFragmentScanOptions.java:
##########
@@ -32,6 +32,9 @@ public class CsvFragmentScanOptions implements 
FragmentScanOptions {
    * CSV scan options, map to CPP struct CsvFragmentScanOptions. The key in 
config map is the field
    * name of mapping cpp struct
    *
+   * <p>If the option type is a std::vector in the CPP code, only support for 
setting one value. For
+   * example, for convert option null_values, only support set one string as 
null value.
+   *

Review Comment:
   ```suggestion
      * <p>Currently, multi-valued options (which are std::vector values in 
C++) only support having a single value set. For
      * example, for the null_values option, only one string can be set as the 
null value.
      *
   ```



##########
java/dataset/src/test/java/org/apache/arrow/dataset/TestFragmentScanOptions.java:
##########
@@ -165,4 +165,44 @@ public void testCsvConvertOptionsNoOption() throws 
Exception {
       assertEquals(3, rowCount);
     }
   }
+

Review Comment:
   Ideally we'd have a brief test of each option, I think



##########
java/dataset/src/main/cpp/jni_wrapper.cc:
##########
@@ -368,28 +368,105 @@ std::shared_ptr<arrow::Buffer> 
LoadArrowBufferFromByteBuffer(JNIEnv* env, jobjec
 
 inline bool ParseBool(const std::string& value) { return value == "true" ? 
true : false; }
 
+inline bool ParseChar(const std::string& value) {
+  if (value.size() != 1) {
+    JniThrow("Csv convert option " + value + " should be a char");

Review Comment:
   It would be good to show the invalid key as well



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to