lidavidm commented on code in PR #13859: URL: https://github.com/apache/arrow/pull/13859#discussion_r956266188
########## cpp/examples/tutorial_examples/dataset_example.cc: ########## @@ -0,0 +1,177 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include <arrow/api.h> +#include <arrow/dataset/api.h> +#include <parquet/arrow/reader.h> +#include <parquet/arrow/writer.h> + +#include <iostream> + +// Generate some data for the rest of this example. +arrow::Result<std::shared_ptr<arrow::Table>> CreateTable() { + // This code should look familiar from the basic Arrow example, and is not the + // focus of this example. However, we need data to work on it, and this makes that! + auto schema = + arrow::schema({arrow::field("a", arrow::int64()), arrow::field("b", arrow::int64()), + arrow::field("c", arrow::int64())}); + std::shared_ptr<arrow::Array> array_a; + std::shared_ptr<arrow::Array> array_b; + std::shared_ptr<arrow::Array> array_c; + arrow::NumericBuilder<arrow::Int64Type> builder; + ARROW_RETURN_NOT_OK(builder.AppendValues({0, 1, 2, 3, 4, 5, 6, 7, 8, 9})); + ARROW_RETURN_NOT_OK(builder.Finish(&array_a)); + builder.Reset(); + ARROW_RETURN_NOT_OK(builder.AppendValues({9, 8, 7, 6, 5, 4, 3, 2, 1, 0})); + ARROW_RETURN_NOT_OK(builder.Finish(&array_b)); + builder.Reset(); + ARROW_RETURN_NOT_OK(builder.AppendValues({1, 2, 1, 2, 1, 2, 1, 2, 1, 2})); + ARROW_RETURN_NOT_OK(builder.Finish(&array_c)); + return arrow::Table::Make(schema, {array_a, array_b, array_c}); +} + +// Set up a dataset by writing two Parquet files. +arrow::Result<std::string> CreateExampleParquetDataset( + const std::shared_ptr<arrow::fs::FileSystem>& filesystem, + const std::string& root_path) { + // Much like CreateTable(), this is utility that gets us the dataset we'll be reading + // from. Don't worry, we also write a dataset in the example proper. + auto base_path = root_path + "parquet_dataset"; + ARROW_RETURN_NOT_OK(filesystem->CreateDir(base_path)); + // Create an Arrow Table + ARROW_ASSIGN_OR_RAISE(auto table, CreateTable()); + // Write it into two Parquet files + ARROW_ASSIGN_OR_RAISE(auto output, + filesystem->OpenOutputStream(base_path + "/data1.parquet")); + ARROW_RETURN_NOT_OK(parquet::arrow::WriteTable( + *table->Slice(0, 5), arrow::default_memory_pool(), output, 2048)); + ARROW_ASSIGN_OR_RAISE(output, + filesystem->OpenOutputStream(base_path + "/data2.parquet")); + ARROW_RETURN_NOT_OK(parquet::arrow::WriteTable( + *table->Slice(5), arrow::default_memory_pool(), output, 2048)); + return base_path; +} + +arrow::Status PrepareEnv() { + // Get our environment prepared for reading, by setting up some quick writing. + ARROW_ASSIGN_OR_RAISE(auto src_table, CreateTable()) + std::shared_ptr<arrow::fs::FileSystem> setup_fs; + // Note this operates in the directory the executable is built in. + char setup_path[256]; + getcwd(setup_path, 256); + ARROW_ASSIGN_OR_RAISE(setup_fs, arrow::fs::FileSystemFromUriOrPath(setup_path)); + ARROW_ASSIGN_OR_RAISE(auto dset_path, CreateExampleParquetDataset(setup_fs, "")); + + return arrow::Status::OK(); +} + +arrow::Status RunMain() { + ARROW_RETURN_NOT_OK(PrepareEnv()); + + // First, we need a filesystem object, which lets us interact with our local + // filesystem starting at a given path. For the sake of simplicity, that'll be + // the current directory. + std::shared_ptr<arrow::fs::FileSystem> fs; + // This feels pretty bad, but I wasn't finding great solutions that're + // system-generic -- could use some advice on how to set this up. Review Comment: just to make sure - does it not work to use '.'? ########## cpp/examples/tutorial_examples/file_access_example.cc: ########## @@ -0,0 +1,166 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include <arrow/api.h> +#include <arrow/csv/api.h> +#include <arrow/io/api.h> +#include <arrow/ipc/api.h> +#include <parquet/arrow/reader.h> +#include <parquet/arrow/writer.h> + +#include <iostream> + +arrow::Status GenInitialFile() { + // Make a couple 8-bit integer arrays and a 16-bit integer array -- just like + // basic Arrow example. + arrow::Int8Builder int8builder; + int8_t days_raw[5] = {1, 12, 17, 23, 28}; + ARROW_RETURN_NOT_OK(int8builder.AppendValues(days_raw, 5)); + std::shared_ptr<arrow::Array> days; + ARROW_ASSIGN_OR_RAISE(days, int8builder.Finish()); + + int8_t months_raw[5] = {1, 3, 5, 7, 1}; + ARROW_RETURN_NOT_OK(int8builder.AppendValues(months_raw, 5)); + std::shared_ptr<arrow::Array> months; + ARROW_ASSIGN_OR_RAISE(months, int8builder.Finish()); + + arrow::Int16Builder int16builder; + int16_t years_raw[5] = {1990, 2000, 1995, 2000, 1995}; + ARROW_RETURN_NOT_OK(int16builder.AppendValues(years_raw, 5)); + std::shared_ptr<arrow::Array> years; + ARROW_ASSIGN_OR_RAISE(years, int16builder.Finish()); + + // Get a vector of our Arrays + std::vector<std::shared_ptr<arrow::Array>> columns = {days, months, years}; + + // Make a schema to initialize the Table with + std::shared_ptr<arrow::Field> field_day, field_month, field_year; + std::shared_ptr<arrow::Schema> schema; + + field_day = arrow::field("Day", arrow::int8()); + field_month = arrow::field("Month", arrow::int8()); + field_year = arrow::field("Year", arrow::int16()); + + schema = arrow::schema({field_day, field_month, field_year}); + // With the schema and data, create a Table + std::shared_ptr<arrow::Table> table; + table = arrow::Table::Make(schema, columns); + + // Write out test files in IPC, CSV, and Parquet for the example to use. + std::shared_ptr<arrow::io::FileOutputStream> outfile; + ARROW_ASSIGN_OR_RAISE(outfile, arrow::io::FileOutputStream::Open("test_in.ipc")); + ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::ipc::RecordBatchWriter> ipc_writer, + arrow::ipc::MakeFileWriter(outfile, schema)); + ARROW_RETURN_NOT_OK(ipc_writer->WriteTable(*table)); + ARROW_RETURN_NOT_OK(ipc_writer->Close()); + + ARROW_ASSIGN_OR_RAISE(outfile, arrow::io::FileOutputStream::Open("test_in.csv")); + ARROW_ASSIGN_OR_RAISE(auto csv_writer, + arrow::csv::MakeCSVWriter(outfile, table->schema())); + ARROW_RETURN_NOT_OK(csv_writer->WriteTable(*table)); + ARROW_RETURN_NOT_OK(csv_writer->Close()); + + ARROW_ASSIGN_OR_RAISE(outfile, arrow::io::FileOutputStream::Open("test_in.parquet")); + PARQUET_THROW_NOT_OK( + parquet::arrow::WriteTable(*table, arrow::default_memory_pool(), outfile, 5)); + + return arrow::Status::OK(); +} + +arrow::Status RunMain() { + // Generate initial files for each format with a helper function -- don't worry, + // we'll also write a table in this example. + ARROW_RETURN_NOT_OK(GenInitialFile()); + + // Reading and writing from files + + // First, we have to set up a ReadableFile object, which just lets us point our + // readers to the right data on disk. We'll be reusing this object, and rebinding + // it to multiple files throughout the example. + std::shared_ptr<arrow::io::ReadableFile> infile; + // Get "test_in.ipc" into our file pointer + ARROW_ASSIGN_OR_RAISE( + infile, arrow::io::ReadableFile::Open("test_in.ipc", arrow::default_memory_pool())); + // Open up the file with the IPC features of the library, gives us a reader object. + ARROW_ASSIGN_OR_RAISE(auto ipc_reader, arrow::ipc::RecordBatchFileReader::Open(infile)); + // Using the reader, we can read Record Batches. Note that this is specific to IPC; + // for other formats, we focus on Tables, but here, RecordBatches are used. + std::shared_ptr<arrow::RecordBatch> rbatch; + ARROW_ASSIGN_OR_RAISE(rbatch, ipc_reader->ReadRecordBatch(0)); + + // Just like with input, we get an object for the output file. + std::shared_ptr<arrow::io::FileOutputStream> outfile; + // Bind it to "test_out.ipc" Review Comment: The official extension is `.arrow`, FWIW ########## cpp/examples/tutorial_examples/compute_example.cc: ########## @@ -0,0 +1,99 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include <arrow/api.h> +#include <arrow/compute/api.h> + +#include <iostream> + +arrow::Status RunMain() { + // Create a couple 32-bit integer arrays. + arrow::Int32Builder int32builder; + int32_t some_nums_raw[5] = {34, 624, 2223, 5654, 4356}; + ARROW_RETURN_NOT_OK(int32builder.AppendValues(some_nums_raw, 5)); + std::shared_ptr<arrow::Array> some_nums; + ARROW_ASSIGN_OR_RAISE(some_nums, int32builder.Finish()); + + int32_t more_nums_raw[5] = {75342, 23, 64, 17, 736}; + ARROW_RETURN_NOT_OK(int32builder.AppendValues(more_nums_raw, 5)); + std::shared_ptr<arrow::Array> more_nums; + ARROW_ASSIGN_OR_RAISE(more_nums, int32builder.Finish()); + + // Make a table out of our pair of arrays. + std::shared_ptr<arrow::Field> field_a, field_b; + std::shared_ptr<arrow::Schema> schema; + + field_a = arrow::field("A", arrow::int32()); + field_b = arrow::field("B", arrow::int32()); + + schema = arrow::schema({field_a, field_b}); + + std::shared_ptr<arrow::Table> table; Review Comment: nit, but is there any value to having the table in this example? ########## cpp/examples/tutorial_examples/dataset_example.cc: ########## @@ -0,0 +1,177 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include <arrow/api.h> +#include <arrow/dataset/api.h> +#include <parquet/arrow/reader.h> +#include <parquet/arrow/writer.h> + +#include <iostream> + +// Generate some data for the rest of this example. +arrow::Result<std::shared_ptr<arrow::Table>> CreateTable() { + // This code should look familiar from the basic Arrow example, and is not the + // focus of this example. However, we need data to work on it, and this makes that! + auto schema = + arrow::schema({arrow::field("a", arrow::int64()), arrow::field("b", arrow::int64()), + arrow::field("c", arrow::int64())}); + std::shared_ptr<arrow::Array> array_a; + std::shared_ptr<arrow::Array> array_b; + std::shared_ptr<arrow::Array> array_c; + arrow::NumericBuilder<arrow::Int64Type> builder; + ARROW_RETURN_NOT_OK(builder.AppendValues({0, 1, 2, 3, 4, 5, 6, 7, 8, 9})); + ARROW_RETURN_NOT_OK(builder.Finish(&array_a)); + builder.Reset(); + ARROW_RETURN_NOT_OK(builder.AppendValues({9, 8, 7, 6, 5, 4, 3, 2, 1, 0})); + ARROW_RETURN_NOT_OK(builder.Finish(&array_b)); + builder.Reset(); + ARROW_RETURN_NOT_OK(builder.AppendValues({1, 2, 1, 2, 1, 2, 1, 2, 1, 2})); + ARROW_RETURN_NOT_OK(builder.Finish(&array_c)); + return arrow::Table::Make(schema, {array_a, array_b, array_c}); +} + +// Set up a dataset by writing two Parquet files. +arrow::Result<std::string> CreateExampleParquetDataset( + const std::shared_ptr<arrow::fs::FileSystem>& filesystem, + const std::string& root_path) { + // Much like CreateTable(), this is utility that gets us the dataset we'll be reading + // from. Don't worry, we also write a dataset in the example proper. + auto base_path = root_path + "parquet_dataset"; + ARROW_RETURN_NOT_OK(filesystem->CreateDir(base_path)); + // Create an Arrow Table + ARROW_ASSIGN_OR_RAISE(auto table, CreateTable()); + // Write it into two Parquet files + ARROW_ASSIGN_OR_RAISE(auto output, + filesystem->OpenOutputStream(base_path + "/data1.parquet")); + ARROW_RETURN_NOT_OK(parquet::arrow::WriteTable( + *table->Slice(0, 5), arrow::default_memory_pool(), output, 2048)); + ARROW_ASSIGN_OR_RAISE(output, + filesystem->OpenOutputStream(base_path + "/data2.parquet")); + ARROW_RETURN_NOT_OK(parquet::arrow::WriteTable( + *table->Slice(5), arrow::default_memory_pool(), output, 2048)); + return base_path; +} + +arrow::Status PrepareEnv() { + // Get our environment prepared for reading, by setting up some quick writing. + ARROW_ASSIGN_OR_RAISE(auto src_table, CreateTable()) + std::shared_ptr<arrow::fs::FileSystem> setup_fs; + // Note this operates in the directory the executable is built in. + char setup_path[256]; + getcwd(setup_path, 256); + ARROW_ASSIGN_OR_RAISE(setup_fs, arrow::fs::FileSystemFromUriOrPath(setup_path)); + ARROW_ASSIGN_OR_RAISE(auto dset_path, CreateExampleParquetDataset(setup_fs, "")); + + return arrow::Status::OK(); +} + +arrow::Status RunMain() { + ARROW_RETURN_NOT_OK(PrepareEnv()); + + // First, we need a filesystem object, which lets us interact with our local + // filesystem starting at a given path. For the sake of simplicity, that'll be + // the current directory. + std::shared_ptr<arrow::fs::FileSystem> fs; + // This feels pretty bad, but I wasn't finding great solutions that're + // system-generic -- could use some advice on how to set this up. + char init_path[256]; + getcwd(init_path, 256); + ARROW_ASSIGN_OR_RAISE(fs, arrow::fs::FileSystemFromUriOrPath(init_path)); + + // A file selector lets us actually traverse a multi-file dataset. + arrow::fs::FileSelector selector; + selector.base_dir = "parquet_dataset"; + // Recursive is a safe bet if you don't know the nesting of your dataset. + selector.recursive = true; + // Making an options object lets us configure our dataset reading. + arrow::dataset::FileSystemFactoryOptions options; + // We'll use Hive-style partitioning. We'll let Arrow Datasets infer the partition + // schema. We won't set any other options, defaults are fine. + options.partitioning = arrow::dataset::HivePartitioning::MakeFactory(); + auto read_format = std::make_shared<arrow::dataset::ParquetFileFormat>(); + // Now, we get a factory that will let us get our dataset -- we don't have the + // dataset yet! + ARROW_ASSIGN_OR_RAISE(auto factory, arrow::dataset::FileSystemDatasetFactory::Make( + fs, selector, read_format, options)); + // Now we read into our dataset from the factory. Review Comment: The wording is a bit odd to me here since there isn't any data being read - the factory is a factory/builder in Java/OOP parlance -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
