vibhatha commented on code in PR #41646:
URL: https://github.com/apache/arrow/pull/41646#discussion_r1645283246


##########
cpp/src/arrow/engine/substrait/serde.h:
##########
@@ -183,6 +184,13 @@ ARROW_ENGINE_EXPORT Result<BoundExpressions> 
DeserializeExpressions(
     const ConversionOptions& conversion_options = {},
     ExtensionSet* ext_set_out = NULLPTR);
 
+/// \brief Deserialize a Substrait Literal message to the map
+///
+/// \param[in] buf a buffer containing the protobuf serialization of a 
Substrait Literal
+/// \param[out] out deserialize to this map.
+ARROW_ENGINE_EXPORT Status
+DeserializeMap(const Buffer& buf, std::unordered_map<std::string, 
std::string>& out);

Review Comment:
   Can't we just pass the Java Map and reconstruct that using JNI utils in C++? 



##########
java/dataset/src/main/cpp/jni_wrapper.cc:
##########
@@ -363,6 +364,64 @@ std::shared_ptr<arrow::Buffer> 
LoadArrowBufferFromByteBuffer(JNIEnv* env, jobjec
   return buffer;
 }
 
+inline bool ParseBool(const std::string& value) { return value == "true" ? 
true : false; }
+
+/// \brief Construct FragmentScanOptions from config map
+#ifdef ARROW_CSV
+arrow::Result<std::shared_ptr<arrow::dataset::FragmentScanOptions>>
+ToCsvFragmentScanOptions(const std::unordered_map<std::string, std::string>& 
configs) {
+  std::shared_ptr<arrow::dataset::CsvFragmentScanOptions> options =
+      std::make_shared<arrow::dataset::CsvFragmentScanOptions>();
+  for (auto const& it : configs) {
+    auto& key = it.first;
+    auto& value = it.second;
+    if (key == "delimiter") {
+      options->parse_options.delimiter = value.data()[0];
+    } else if (key == "quoting") {
+      options->parse_options.quoting = ParseBool(value);
+    } else if (key == "column_types") {
+      int64_t schema_address = std::stol(value);
+      ArrowSchema* cSchema = reinterpret_cast<ArrowSchema*>(schema_address);
+      ARROW_ASSIGN_OR_RAISE(auto schema, arrow::ImportSchema(cSchema));
+      auto& column_types = options->convert_options.column_types;
+      for (auto field : schema->fields()) {
+        column_types[field->name()] = field->type();
+      }
+    } else if (key == "strings_can_be_null") {
+      options->convert_options.strings_can_be_null = ParseBool(value);
+    } else {
+      return arrow::Status::Invalid("Config " + it.first + " is not 
supported.");
+    }
+  }
+  return options;
+}
+#endif
+
+arrow::Result<std::shared_ptr<arrow::dataset::FragmentScanOptions>>
+GetFragmentScanOptions(jint file_format_id,
+                       const std::unordered_map<std::string, std::string>& 
configs) {
+  switch (file_format_id) {
+#ifdef ARROW_CSV
+    case 3:
+      return ToCsvFragmentScanOptions(configs);
+#endif
+    default:
+      return arrow::Status::Invalid("Illegal file format id: ", 
file_format_id);
+  }
+}
+
+std::unordered_map<std::string, std::string> ToStringMap(JNIEnv* env,

Review Comment:
   Maybe we can just take the Java Map and create the C++ Map using JNI, rather 
than using String[]? 
   cc @lidavidm 



##########
java/dataset/src/main/java/org/apache/arrow/dataset/scanner/csv/CsvFragmentScanOptions.java:
##########
@@ -0,0 +1,95 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.arrow.dataset.scanner.csv;
+
+import java.io.Serializable;
+import java.util.Locale;
+import java.util.Map;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+import org.apache.arrow.dataset.file.FileFormat;
+import org.apache.arrow.dataset.scanner.FragmentScanOptions;
+
+public class CsvFragmentScanOptions implements Serializable, 
FragmentScanOptions {
+  private final CsvConvertOptions convertOptions;
+  private final Map<String, String> readOptions;
+  private final Map<String, String> parseOptions;
+
+  /**
+   * csv scan options, map to CPP struct CsvFragmentScanOptions.
+   *
+   * @param convertOptions same struct in CPP

Review Comment:
   Maybe we should use different wording than `same struct in CPP` but describe 
the variable? 



##########
java/dataset/src/main/java/org/apache/arrow/dataset/scanner/csv/CsvFragmentScanOptions.java:
##########
@@ -0,0 +1,95 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.arrow.dataset.scanner.csv;
+
+import java.io.Serializable;
+import java.util.Locale;
+import java.util.Map;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+import org.apache.arrow.dataset.file.FileFormat;
+import org.apache.arrow.dataset.scanner.FragmentScanOptions;
+
+public class CsvFragmentScanOptions implements Serializable, 
FragmentScanOptions {
+  private final CsvConvertOptions convertOptions;
+  private final Map<String, String> readOptions;
+  private final Map<String, String> parseOptions;
+
+  /**
+   * csv scan options, map to CPP struct CsvFragmentScanOptions.
+   *
+   * @param convertOptions same struct in CPP
+   * @param readOptions same struct in CPP
+   * @param parseOptions same struct in CPP
+   */
+  public CsvFragmentScanOptions(
+      CsvConvertOptions convertOptions,
+      Map<String, String> readOptions,
+      Map<String, String> parseOptions) {
+    this.convertOptions = convertOptions;
+    this.readOptions = readOptions;
+    this.parseOptions = parseOptions;
+  }
+
+  public String typeName() {
+    return FileFormat.CSV.name().toLowerCase(Locale.ROOT);
+  }
+
+  /**
+   * File format id.
+   *
+   * @return id
+   */
+  public int fileFormatId() {
+    return FileFormat.CSV.id();
+  }
+
+  /**
+   * Serialize this class to ByteBuffer and then called by jni call.

Review Comment:
   ```suggestion
      * Serialize this class to ByteBuffer and then called by JNI call.
   ```



##########
java/dataset/src/main/java/org/apache/arrow/dataset/scanner/csv/CsvFragmentScanOptions.java:
##########
@@ -0,0 +1,95 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.arrow.dataset.scanner.csv;
+
+import java.io.Serializable;
+import java.util.Locale;
+import java.util.Map;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+import org.apache.arrow.dataset.file.FileFormat;
+import org.apache.arrow.dataset.scanner.FragmentScanOptions;
+
+public class CsvFragmentScanOptions implements Serializable, 
FragmentScanOptions {
+  private final CsvConvertOptions convertOptions;
+  private final Map<String, String> readOptions;
+  private final Map<String, String> parseOptions;
+
+  /**
+   * csv scan options, map to CPP struct CsvFragmentScanOptions.
+   *
+   * @param convertOptions same struct in CPP
+   * @param readOptions same struct in CPP
+   * @param parseOptions same struct in CPP
+   */
+  public CsvFragmentScanOptions(
+      CsvConvertOptions convertOptions,
+      Map<String, String> readOptions,
+      Map<String, String> parseOptions) {
+    this.convertOptions = convertOptions;
+    this.readOptions = readOptions;
+    this.parseOptions = parseOptions;
+  }
+
+  public String typeName() {
+    return FileFormat.CSV.name().toLowerCase(Locale.ROOT);
+  }
+
+  /**
+   * File format id.
+   *
+   * @return id
+   */
+  public int fileFormatId() {
+    return FileFormat.CSV.id();
+  }
+
+  /**
+   * Serialize this class to ByteBuffer and then called by jni call.
+   *
+   * @return DirectByteBuffer

Review Comment:
   We are returning a `String[]` right?



##########
java/dataset/src/main/java/org/apache/arrow/dataset/scanner/FragmentScanOptions.java:
##########
@@ -0,0 +1,46 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.arrow.dataset.scanner;
+
+import java.util.Map;
+
+public interface FragmentScanOptions {
+  String typeName();
+
+  int fileFormatId();
+
+  String[] serialize();
+
+  /**
+   * Serialize the map to string array.
+   *
+   * @param config config map
+   * @return string array for serialization
+   */
+  default String[] serializeMap(Map<String, String> config) {

Review Comment:
   Then, maybe `convertMapToStringArray` or something? 



##########
java/dataset/src/test/java/org/apache/arrow/dataset/TestFragmentScanOptions.java:
##########
@@ -0,0 +1,85 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.arrow.dataset;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+import com.google.common.collect.ImmutableMap;
+import java.util.Arrays;
+import java.util.Optional;
+import org.apache.arrow.c.ArrowSchema;
+import org.apache.arrow.c.CDataDictionaryProvider;
+import org.apache.arrow.c.Data;
+import org.apache.arrow.dataset.file.FileFormat;
+import org.apache.arrow.dataset.file.FileSystemDatasetFactory;
+import org.apache.arrow.dataset.jni.NativeMemoryPool;
+import org.apache.arrow.dataset.scanner.ScanOptions;
+import org.apache.arrow.dataset.scanner.Scanner;
+import org.apache.arrow.dataset.scanner.csv.CsvConvertOptions;
+import org.apache.arrow.dataset.scanner.csv.CsvFragmentScanOptions;
+import org.apache.arrow.dataset.source.Dataset;
+import org.apache.arrow.dataset.source.DatasetFactory;
+import org.apache.arrow.memory.BufferAllocator;
+import org.apache.arrow.memory.RootAllocator;
+import org.apache.arrow.vector.ipc.ArrowReader;
+import org.apache.arrow.vector.types.pojo.ArrowType;
+import org.apache.arrow.vector.types.pojo.Field;
+import org.apache.arrow.vector.types.pojo.Schema;
+import org.junit.jupiter.api.Test;
+
+public class TestFragmentScanOptions {
+
+  @Test
+  public void testCsvConvertOptions() throws Exception {
+    final Schema schema =
+        new Schema(
+            Arrays.asList(
+                Field.nullable("Id", new ArrowType.Int(32, true)),
+                Field.nullable("Name", new ArrowType.Utf8()),
+                Field.nullable("Language", new ArrowType.Utf8())),
+            null);
+    String path = "file://" + getClass().getResource("/").getPath() + 
"/data/student.csv";
+    BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE);
+    try (ArrowSchema cSchema = ArrowSchema.allocateNew(allocator);
+        CDataDictionaryProvider provider = new CDataDictionaryProvider()) {
+      Data.exportSchema(allocator, schema, provider, cSchema);
+      CsvConvertOptions convertOptions = new 
CsvConvertOptions(ImmutableMap.of("delimiter", ";"));
+      convertOptions.setArrowSchema(cSchema);
+      CsvFragmentScanOptions fragmentScanOptions =

Review Comment:
   should we also test with some configs for `CsvFragmentScanOptions`?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to