lidavidm commented on code in PR #13859:
URL: https://github.com/apache/arrow/pull/13859#discussion_r956298142


##########
cpp/examples/tutorial_examples/dataset_example.cc:
##########
@@ -0,0 +1,177 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <arrow/api.h>
+#include <arrow/dataset/api.h>
+#include <parquet/arrow/reader.h>
+#include <parquet/arrow/writer.h>
+
+#include <iostream>
+
+// Build the small in-memory table that the rest of this example works on:
+// three int64 columns named "a", "b" and "c", ten rows each.
+arrow::Result<std::shared_ptr<arrow::Table>> CreateTable() {
+  // Nothing here is specific to datasets: it is the same builder pattern as in
+  // the basic Arrow example, and exists only so that we have data to work on.
+  arrow::NumericBuilder<arrow::Int64Type> int64_builder;
+
+  // Column "a": 0 through 9, ascending.
+  std::shared_ptr<arrow::Array> col_a;
+  ARROW_RETURN_NOT_OK(int64_builder.AppendValues({0, 1, 2, 3, 4, 5, 6, 7, 8, 9}));
+  ARROW_RETURN_NOT_OK(int64_builder.Finish(&col_a));
+  int64_builder.Reset();
+
+  // Column "b": 9 down to 0.
+  std::shared_ptr<arrow::Array> col_b;
+  ARROW_RETURN_NOT_OK(int64_builder.AppendValues({9, 8, 7, 6, 5, 4, 3, 2, 1, 0}));
+  ARROW_RETURN_NOT_OK(int64_builder.Finish(&col_b));
+  int64_builder.Reset();
+
+  // Column "c": alternating 1 and 2.
+  std::shared_ptr<arrow::Array> col_c;
+  ARROW_RETURN_NOT_OK(int64_builder.AppendValues({1, 2, 1, 2, 1, 2, 1, 2, 1, 2}));
+  ARROW_RETURN_NOT_OK(int64_builder.Finish(&col_c));
+
+  // Describe the columns, then assemble the table from the finished arrays.
+  const auto table_schema = arrow::schema({arrow::field("a", arrow::int64()),
+                                           arrow::field("b", arrow::int64()),
+                                           arrow::field("c", arrow::int64())});
+  return arrow::Table::Make(table_schema, {col_a, col_b, col_c});
+}
+
+// Set up a dataset by writing two Parquet files.
+//
+// Creates the directory "<root_path>/parquet_dataset" on `filesystem` (just
+// "parquet_dataset" when `root_path` is empty), writes the table produced by
+// CreateTable() into it as data1.parquet (the first 5 rows) and data2.parquet
+// (the remaining rows), and returns the directory path. Any error reported by
+// the filesystem or the Parquet writer is returned to the caller instead.
+arrow::Result<std::string> CreateExampleParquetDataset(
+    const std::shared_ptr<arrow::fs::FileSystem>& filesystem,
+    const std::string& root_path) {
+  // Much like CreateTable(), this is a utility that gets us the dataset we'll be
+  // reading from. Don't worry, we also write a dataset in the example proper.
+  //
+  // Add a separator unless root_path is empty or already ends in one; plain
+  // concatenation would turn "/tmp" into "/tmpparquet_dataset".
+  std::string base_path = root_path;
+  if (!base_path.empty() && base_path.back() != '/') {
+    base_path += '/';
+  }
+  base_path += "parquet_dataset";
+  ARROW_RETURN_NOT_OK(filesystem->CreateDir(base_path));
+  // Create an Arrow Table
+  ARROW_ASSIGN_OR_RAISE(auto table, CreateTable());
+  // Write it into two Parquet files
+  ARROW_ASSIGN_OR_RAISE(auto output,
+                        filesystem->OpenOutputStream(base_path + "/data1.parquet"));
+  ARROW_RETURN_NOT_OK(parquet::arrow::WriteTable(
+      *table->Slice(0, 5), arrow::default_memory_pool(), output, 2048));
+  ARROW_ASSIGN_OR_RAISE(output,
+                        filesystem->OpenOutputStream(base_path + "/data2.parquet"));
+  ARROW_RETURN_NOT_OK(parquet::arrow::WriteTable(
+      *table->Slice(5), arrow::default_memory_pool(), output, 2048));
+  return base_path;
+}
+
+// Get our environment prepared for reading, by setting up some quick writing:
+// the sample Parquet dataset is written through a filesystem obtained from the
+// current working directory. Returns a non-OK Status if the working directory
+// cannot be determined or if creating the dataset fails.
+arrow::Status PrepareEnv() {
+  // Note this operates in the current working directory of the process.
+  char setup_path[256];
+  // getcwd() returns a null pointer on failure (for example when the path does
+  // not fit in the buffer). The buffer is unusable in that case, so report the
+  // error instead of handing it to the filesystem factory.
+  if (getcwd(setup_path, sizeof(setup_path)) == nullptr) {
+    return arrow::Status::IOError("Could not get the current working directory");
+  }
+  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::fs::FileSystem> setup_fs,
+                        arrow::fs::FileSystemFromUriOrPath(setup_path));
+  // CreateExampleParquetDataset() builds its own table, and only its success or
+  // failure matters here; the returned dataset path is not needed.
+  ARROW_RETURN_NOT_OK(CreateExampleParquetDataset(setup_fs, "").status());
+
+  return arrow::Status::OK();
+}
+
+arrow::Status RunMain() {
+  ARROW_RETURN_NOT_OK(PrepareEnv());
+
+  // First, we need a filesystem object, which lets us interact with our local
+  // filesystem starting at a given path. For the sake of simplicity, that'll be
+  // the current directory.
+  std::shared_ptr<arrow::fs::FileSystem> fs;
+  // This feels pretty bad, but I wasn't finding great solutions that're
+  // system-generic -- could use some advice on how to set this up.

Review Comment:
   I think that's fine unless someone else comes up with something better



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to