westonpace commented on code in PR #13859:
URL: https://github.com/apache/arrow/pull/13859#discussion_r947363500


##########
cpp/examples/tutorial_examples/dataset_example.cc:
##########
@@ -0,0 +1,169 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <arrow/api.h>
+#include <arrow/dataset/api.h>
+#include <parquet/arrow/reader.h>
+#include <parquet/arrow/writer.h>
+
+#include <unistd.h>  // needed for getcwd() below
+
+#include <iostream>
+
+// Generate some data for the rest of this example.
+arrow::Result<std::shared_ptr<arrow::Table>> CreateTable() {
+    // This code should look familiar from the basic Arrow example, and is not the
+    // focus of this example. However, we need data to work with, and this makes it!
+    auto schema =
+            arrow::schema({arrow::field("a", arrow::int64()),
+                           arrow::field("b", arrow::int64()),
+                           arrow::field("c", arrow::int64())});
+    std::shared_ptr<arrow::Array> array_a;
+    std::shared_ptr<arrow::Array> array_b;
+    std::shared_ptr<arrow::Array> array_c;
+    arrow::NumericBuilder<arrow::Int64Type> builder;
+    ARROW_RETURN_NOT_OK(builder.AppendValues({0, 1, 2, 3, 4, 5, 6, 7, 8, 9}));
+    ARROW_RETURN_NOT_OK(builder.Finish(&array_a));
+    builder.Reset();
+    ARROW_RETURN_NOT_OK(builder.AppendValues({9, 8, 7, 6, 5, 4, 3, 2, 1, 0}));
+    ARROW_RETURN_NOT_OK(builder.Finish(&array_b));
+    builder.Reset();
+    ARROW_RETURN_NOT_OK(builder.AppendValues({1, 2, 1, 2, 1, 2, 1, 2, 1, 2}));
+    ARROW_RETURN_NOT_OK(builder.Finish(&array_c));
+    return arrow::Table::Make(schema, {array_a, array_b, array_c});
+}
+
+// Set up a dataset by writing two Parquet files.
+arrow::Result<std::string> CreateExampleParquetDataset(
+        const std::shared_ptr<arrow::fs::FileSystem>& filesystem,
+        const std::string& root_path) {
+    // Much like CreateTable(), this is a utility that gets us the dataset we'll
+    // be reading from. Don't worry, we also write a dataset in the example proper.
+    auto base_path = root_path + "parquet_dataset";
+    ARROW_RETURN_NOT_OK(filesystem->CreateDir(base_path));
+    // Create an Arrow Table
+    ARROW_ASSIGN_OR_RAISE(auto table, CreateTable());
+    // Write it into two Parquet files
+    ARROW_ASSIGN_OR_RAISE(auto output,
+                          filesystem->OpenOutputStream(base_path + "/data1.parquet"));
+    ARROW_RETURN_NOT_OK(parquet::arrow::WriteTable(
+            *table->Slice(0, 5), arrow::default_memory_pool(), output, 2048));
+    ARROW_ASSIGN_OR_RAISE(output,
+                          filesystem->OpenOutputStream(base_path + "/data2.parquet"));
+    ARROW_RETURN_NOT_OK(parquet::arrow::WriteTable(
+            *table->Slice(5), arrow::default_memory_pool(), output, 2048));
+    return base_path;
+}
+
+arrow::Status RunMain() {
+
+    // Get our environment prepared for reading by setting up some quick writing.
+    ARROW_ASSIGN_OR_RAISE(auto src_table, CreateTable());
+    std::shared_ptr<arrow::fs::FileSystem> setup_fs;
+    // Note this operates in the current working directory of the executable.
+    char setup_path[256];
+    getcwd(setup_path, 256);
+    ARROW_ASSIGN_OR_RAISE(setup_fs, arrow::fs::FileSystemFromUriOrPath(setup_path));
+    ARROW_ASSIGN_OR_RAISE(auto dset_path, CreateExampleParquetDataset(setup_fs,
+                                                                      ""));
+
+    // First, we need a filesystem object, which lets us interact with our local
+    // filesystem starting at a given path. For the sake of simplicity, that'll be
+    // the current directory.
+    std::shared_ptr<arrow::fs::FileSystem> fs;
+    // This feels pretty bad, but I wasn't finding great system-generic solutions
+    // -- could use some advice on how to set this up.
+    char init_path[256];
+    getcwd(init_path, 256);
+    ARROW_ASSIGN_OR_RAISE(fs, arrow::fs::FileSystemFromUriOrPath(init_path));
+
+    // A file selector lets us actually traverse a multi-file dataset.
+    arrow::fs::FileSelector selector;
+    selector.base_dir = "parquet_dataset";
+    // Recursive is a safe bet if you don't know the nesting of your dataset.
+    selector.recursive = true;
+    // Making an options object lets us configure our dataset reading.
+    arrow::dataset::FileSystemFactoryOptions options;
+    // We'll use Hive-style partitioning, and we'll let Arrow Datasets infer the
+    // partition schema. We won't set any other options; the defaults are fine.
+    options.partitioning = arrow::dataset::HivePartitioning::MakeFactory();
+    auto read_format = std::make_shared<arrow::dataset::ParquetFileFormat>();
+    // Now, we get a factory that will let us get our dataset -- we don't have
+    // the dataset yet!
+    auto factory = arrow::dataset::FileSystemDatasetFactory::Make(fs,
+                                                                  selector,
+                                                                  read_format,
+                                                                  options)
+            .ValueOrDie();
+    // Now we read into our dataset from the factory.
+    auto read_dataset = factory->Finish().ValueOrDie();
+    // Print out the fragments
+    for (const auto& fragment : read_dataset->GetFragments().ValueOrDie()) {
+        std::cout << "Found fragment: " << (*fragment)->ToString() << std::endl;
+    }
+
+    // Scan the dataset into a Table -- once this is done, you can do normal
+    // table things with it, like computation and printing. However, now you're
+    // also committed to fitting everything in memory.
+    auto read_scan_builder = read_dataset->NewScan().ValueOrDie();
+    auto read_scanner = read_scan_builder->Finish().ValueOrDie();
+    std::shared_ptr<arrow::Table> table = read_scanner->ToTable().ValueOrDie();
+    std::cout << table->ToString();
+
+    // Now, let's write a table back out as a dataset!

Review Comment:
   At this point we don't actually "scan" the scanner.  We simply need it for the options :grimacing:  It would be pretty easy (and rather useful) to change:
   
   ```
   Status FileSystemDataset::Write(const FileSystemDatasetWriteOptions& write_options,
                                   std::shared_ptr<Scanner> scanner)
   ```
   
   to:
   
   ```
   Status FileSystemDataset::Write(const FileSystemDatasetWriteOptions& write_options,
                                   std::shared_ptr<ScanOptions> options,
                                   std::shared_ptr<Dataset> dataset)
   ```
   
   Even better, we should create (untested but should work):
   
   ```
   Status FileSystemDataset::Write(const FileSystemDatasetWriteOptions& write_options,
                                   std::shared_ptr<Table> table) {
     std::shared_ptr<compute::ExecContext> exec_context =
         std::make_shared<compute::ExecContext>();
   
     ARROW_ASSIGN_OR_RAISE(auto plan, compute::ExecPlan::Make(exec_context.get()));
   
     RETURN_NOT_OK(
         compute::Declaration::Sequence(
             {
                 {"table_source", 
compute::TableSourceNodeOptions{std::move(table)}},
                 {"write", WriteNodeOptions{write_options}},
             })
             .AddToPlan(plan.get()));
   
     RETURN_NOT_OK(plan->StartProducing());
     return plan->finished().status();
   }
   ```
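
   With that overload, the tutorial's write step would collapse to a single call. Here is a minimal sketch of the call site (hypothetical, since the overload above does not exist yet; `write_options` and `table` are assumed to be set up as they already are in the tutorial):

   ```
   // Hypothetical call site -- depends on the Table overload proposed above.
   // write_options: a configured arrow::dataset::FileSystemDatasetWriteOptions
   // table: the std::shared_ptr<arrow::Table> read earlier in the example
   ARROW_RETURN_NOT_OK(FileSystemDataset::Write(write_options, table));
   ```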
   
   Creating a dataset from a scanner (or rather, scan_options) makes a lot more sense when you consider that the source might be a larger-than-memory dataset (this is very common when re-partitioning a dataset, or converting a dataset from CSV to Parquet, for example).
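
   For comparison, here is roughly what the streaming path looks like with the existing scanner-based signature (a sketch; it assumes `source_dataset` is a std::shared_ptr<Dataset> pointing at the source and `write_options` is already configured). Because the scanner pulls batches lazily, the source never has to fit in memory:

   ```
   // Sketch: re-partition a (possibly larger-than-memory) dataset by streaming
   // batches from a scanner straight into FileSystemDataset::Write().
   ARROW_ASSIGN_OR_RAISE(auto scan_builder, source_dataset->NewScan());
   ARROW_ASSIGN_OR_RAISE(auto scanner, scan_builder->Finish());
   ARROW_RETURN_NOT_OK(FileSystemDataset::Write(write_options, scanner));
   ```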
   
   However, we definitely do not need to be involving something as complex as a scan node to write an in-memory table to a dataset.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]
