lidavidm commented on code in PR #13859: URL: https://github.com/apache/arrow/pull/13859#discussion_r956296511
########## cpp/examples/tutorial_examples/dataset_example.cc: ########## @@ -0,0 +1,177 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include <arrow/api.h> +#include <arrow/dataset/api.h> +#include <parquet/arrow/reader.h> +#include <parquet/arrow/writer.h> + +#include <iostream> + +// Generate some data for the rest of this example. +arrow::Result<std::shared_ptr<arrow::Table>> CreateTable() { + // This code should look familiar from the basic Arrow example, and is not the + // focus of this example. However, we need data to work on, and this makes that! 
+ auto schema = + arrow::schema({arrow::field("a", arrow::int64()), arrow::field("b", arrow::int64()), + arrow::field("c", arrow::int64())}); + std::shared_ptr<arrow::Array> array_a; + std::shared_ptr<arrow::Array> array_b; + std::shared_ptr<arrow::Array> array_c; + arrow::NumericBuilder<arrow::Int64Type> builder; + ARROW_RETURN_NOT_OK(builder.AppendValues({0, 1, 2, 3, 4, 5, 6, 7, 8, 9})); + ARROW_RETURN_NOT_OK(builder.Finish(&array_a)); + builder.Reset(); + ARROW_RETURN_NOT_OK(builder.AppendValues({9, 8, 7, 6, 5, 4, 3, 2, 1, 0})); + ARROW_RETURN_NOT_OK(builder.Finish(&array_b)); + builder.Reset(); + ARROW_RETURN_NOT_OK(builder.AppendValues({1, 2, 1, 2, 1, 2, 1, 2, 1, 2})); + ARROW_RETURN_NOT_OK(builder.Finish(&array_c)); + return arrow::Table::Make(schema, {array_a, array_b, array_c}); +} + +// Set up a dataset by writing two Parquet files. +arrow::Result<std::string> CreateExampleParquetDataset( + const std::shared_ptr<arrow::fs::FileSystem>& filesystem, + const std::string& root_path) { + // Much like CreateTable(), this is a utility that gets us the dataset we'll be reading + // from. Don't worry, we also write a dataset in the example proper. + auto base_path = root_path + "parquet_dataset"; + ARROW_RETURN_NOT_OK(filesystem->CreateDir(base_path)); + // Create an Arrow Table + ARROW_ASSIGN_OR_RAISE(auto table, CreateTable()); + // Write it into two Parquet files + ARROW_ASSIGN_OR_RAISE(auto output, + filesystem->OpenOutputStream(base_path + "/data1.parquet")); + ARROW_RETURN_NOT_OK(parquet::arrow::WriteTable( + *table->Slice(0, 5), arrow::default_memory_pool(), output, 2048)); + ARROW_ASSIGN_OR_RAISE(output, + filesystem->OpenOutputStream(base_path + "/data2.parquet")); + ARROW_RETURN_NOT_OK(parquet::arrow::WriteTable( + *table->Slice(5), arrow::default_memory_pool(), output, 2048)); + return base_path; +} + +arrow::Status PrepareEnv() { + // Get our environment prepared for reading, by setting up some quick writing. 
+ ARROW_ASSIGN_OR_RAISE(auto src_table, CreateTable()) + std::shared_ptr<arrow::fs::FileSystem> setup_fs; + // Note this operates in the directory the executable is built in. + char setup_path[256]; + getcwd(setup_path, 256); + ARROW_ASSIGN_OR_RAISE(setup_fs, arrow::fs::FileSystemFromUriOrPath(setup_path)); + ARROW_ASSIGN_OR_RAISE(auto dset_path, CreateExampleParquetDataset(setup_fs, "")); + + return arrow::Status::OK(); +} + +arrow::Status RunMain() { + ARROW_RETURN_NOT_OK(PrepareEnv()); + + // First, we need a filesystem object, which lets us interact with our local + // filesystem starting at a given path. For the sake of simplicity, that'll be + // the current directory. + std::shared_ptr<arrow::fs::FileSystem> fs; + // This feels pretty bad, but I wasn't finding great solutions that're + // system-generic -- could use some advice on how to set this up. + char init_path[256]; + getcwd(init_path, 256); + ARROW_ASSIGN_OR_RAISE(fs, arrow::fs::FileSystemFromUriOrPath(init_path)); + + // A file selector lets us actually traverse a multi-file dataset. + arrow::fs::FileSelector selector; + selector.base_dir = "parquet_dataset"; + // Recursive is a safe bet if you don't know the nesting of your dataset. + selector.recursive = true; + // Making an options object lets us configure our dataset reading. + arrow::dataset::FileSystemFactoryOptions options; + // We'll use Hive-style partitioning. We'll let Arrow Datasets infer the partition + // schema. We won't set any other options, defaults are fine. + options.partitioning = arrow::dataset::HivePartitioning::MakeFactory(); + auto read_format = std::make_shared<arrow::dataset::ParquetFileFormat>(); + // Now, we get a factory that will let us get our dataset -- we don't have the + // dataset yet! + ARROW_ASSIGN_OR_RAISE(auto factory, arrow::dataset::FileSystemDatasetFactory::Make( + fs, selector, read_format, options)); + // Now we read into our dataset from the factory. 
Review Comment: Construct, build, or make I suppose -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
