This is an automated email from the ASF dual-hosted git repository. lixueclaire pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/incubator-graphar.git
The following commit(s) were added to refs/heads/main by this push: new 883d7eb0 feat(c++): support read and write multi-property (#719) 883d7eb0 is described below commit 883d7eb05b71e7b349c8f735c911bc9079bbe313 Author: Xiaokang Yang <81174897+yang...@users.noreply.github.com> AuthorDate: Mon Jul 21 13:41:06 2025 +0800 feat(c++): support read and write multi-property (#719) --- cpp/CMakeLists.txt | 1 + cpp/src/graphar/arrow/chunk_reader.cc | 7 +- cpp/src/graphar/arrow/chunk_writer.cc | 13 +-- cpp/src/graphar/fwd.h | 2 + cpp/src/graphar/graph_info.cc | 43 +++++++- cpp/src/graphar/graph_info.h | 10 +- cpp/src/graphar/types.h | 39 +++++++ cpp/test/test_info.cc | 116 ++++++++++++++++---- cpp/test/test_multi_label.cc | 20 ++-- cpp/test/test_multi_property.cc | 161 ++++++++++++++++++++++++++++ docs/specification/implementation-status.md | 2 + 11 files changed, 368 insertions(+), 46 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 9d9f8736..5fb6c142 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -566,6 +566,7 @@ if (BUILD_TESTS) add_test(test_arrow_chunk_reader SRCS test/test_arrow_chunk_reader.cc) add_test(test_graph SRCS test/test_graph.cc) add_test(test_multi_label SRCS test/test_multi_label.cc) + add_test(test_multi_property SRCS test/test_multi_property.cc) # enable_testing() endif() diff --git a/cpp/src/graphar/arrow/chunk_reader.cc b/cpp/src/graphar/arrow/chunk_reader.cc index a5c77cd2..1b5200fc 100644 --- a/cpp/src/graphar/arrow/chunk_reader.cc +++ b/cpp/src/graphar/arrow/chunk_reader.cc @@ -49,8 +49,11 @@ Result<std::shared_ptr<arrow::Schema>> PropertyGroupToSchema( GeneralParams::kVertexIndexCol, arrow::int64())); } for (const auto& prop : pg->GetProperties()) { - fields.push_back(std::make_shared<arrow::Field>( - prop.name, DataType::DataTypeToArrowDataType(prop.type))); + auto dataType = DataType::DataTypeToArrowDataType(prop.type); + if (prop.cardinality != Cardinality::SINGLE) { + dataType = arrow::list(dataType); + } + fields.push_back(std::make_shared<arrow::Field>(prop.name, dataType)); } return arrow::schema(fields); } diff --git a/cpp/src/graphar/arrow/chunk_writer.cc b/cpp/src/graphar/arrow/chunk_writer.cc index 234bbfb3..c8612383 100644 --- a/cpp/src/graphar/arrow/chunk_writer.cc +++ b/cpp/src/graphar/arrow/chunk_writer.cc @@ -321,17 +321,6 @@ Status VertexPropertyWriter::WriteTable( return Status::OK(); } -// Helper function to split a string by a delimiter -std::vector<std::string> SplitString(const std::string& str, char delimiter) { - std::vector<std::string> tokens; - std::string token; - std::istringstream tokenStream(str); - while (std::getline(tokenStream, token, delimiter)) { - tokens.push_back(token); - } - return tokens; -} - Status VertexPropertyWriter::WriteLabelTable( const std::shared_ptr<arrow::Table>& input_table, IdType start_chunk_index, FileType file_type, ValidateLevel validate_level) const { @@ -379,6 +368,8 @@ Result<std::shared_ptr<arrow::Table>> VertexPropertyWriter::GetLabelTable( auto label_column = std::static_pointer_cast<arrow::StringArray>(chunk); // Populate the matrix based on :LABEL column values + // TODO(@yangxk): store array in the label_column, split the string when + // reading file for (int64_t row = 0; row < label_column->length(); ++row) { if (label_column->IsValid(row)) { std::string labels_string = label_column->GetString(row); diff --git a/cpp/src/graphar/fwd.h b/cpp/src/graphar/fwd.h index f7b5d34a..e7feefd0 100644 --- a/cpp/src/graphar/fwd.h +++ b/cpp/src/graphar/fwd.h @@ -72,6 +72,8 @@ class FileSystem; using IdType = int64_t; enum class Type; class DataType; +/** Defines how multiple values are handled for a given property key */ +enum Cardinality { SINGLE, LIST, SET }; /** Type of file format */ enum FileType { CSV = 0, PARQUET = 1, ORC = 2, JSON = 3 }; enum SelectType { PROPERTIES = 0, LABELS = 1 }; diff --git a/cpp/src/graphar/graph_info.cc b/cpp/src/graphar/graph_info.cc index 81d1bba7..9959f194 100644 --- a/cpp/src/graphar/graph_info.cc +++ b/cpp/src/graphar/graph_info.cc @@ -20,6 +20,7 @@ #include <unordered_set> #include <utility> +#include "graphar/status.h" #include "mini-yaml/yaml/Yaml.hpp" #include "graphar/filesystem.h" @@ -86,7 +87,8 @@ std::string BuildPath(const std::vector<std::string>& paths) { bool operator==(const Property& lhs, const Property& rhs) { return (lhs.name == rhs.name) && (lhs.type == rhs.type) && (lhs.is_primary == rhs.is_primary) && - (lhs.is_nullable == rhs.is_nullable); + (lhs.is_nullable == rhs.is_nullable) && + (lhs.cardinality == rhs.cardinality); } PropertyGroup::PropertyGroup(const std::vector<Property>& properties, @@ -138,6 +140,11 @@ bool PropertyGroup::IsValidated() const { // list type is not supported in csv file return false; } + // TODO(@yangxk): support cardinality in csv file + if (p.cardinality != Cardinality::SINGLE && file_type_ == FileType::CSV) { + // list cardinality is not supported in csv file + return false; + } } return true; } @@ -212,6 +219,7 @@ class VertexInfo::Impl { property_name_to_primary_.emplace(p.name, p.is_primary); property_name_to_nullable_.emplace(p.name, p.is_nullable); property_name_to_type_.emplace(p.name, p.type); + property_name_to_cardinality_.emplace(p.name, p.cardinality); } } } @@ -251,6 +259,7 @@ class VertexInfo::Impl { std::unordered_map<std::string, bool> property_name_to_nullable_; std::unordered_map<std::string, std::shared_ptr<DataType>> property_name_to_type_; + std::unordered_map<std::string, Cardinality> property_name_to_cardinality_; }; VertexInfo::VertexInfo(const std::string& type, IdType chunk_size, @@ -363,6 +372,15 @@ Result<std::shared_ptr<DataType>> VertexInfo::GetPropertyType( return it->second; } +Result<Cardinality> VertexInfo::GetPropertyCardinality( + const std::string& property_name) const { + auto it = impl_->property_name_to_cardinality_.find(property_name); + if (it == impl_->property_name_to_cardinality_.end()) { + return Status::Invalid("property name not found: ", property_name); + } + return it->second; +} + Result<std::shared_ptr<VertexInfo>> VertexInfo::AddPropertyGroup( std::shared_ptr<PropertyGroup> property_group) const { if (property_group == nullptr) { @@ -440,8 +458,13 @@ Result<std::shared_ptr<VertexInfo>> VertexInfo::Load( bool is_primary = p_node["is_primary"].As<bool>(); bool is_nullable = p_node["is_nullable"].IsNone() || p_node["is_nullable"].As<bool>(); + Cardinality cardinality = Cardinality::SINGLE; + if (!p_node["cardinality"].IsNone()) { + cardinality = + StringToCardinality(p_node["cardinality"].As<std::string>()); + } property_vec.emplace_back(property_name, property_type, is_primary, - is_nullable); + is_nullable, cardinality); } property_groups.push_back( std::make_shared<PropertyGroup>(property_vec, file_type, pg_prefix)); @@ -485,6 +508,9 @@ Result<std::string> VertexInfo::Dump() const noexcept { p_node["data_type"] = p.type->ToTypeName(); p_node["is_primary"] = p.is_primary ? "true" : "false"; p_node["is_nullable"] = p.is_nullable ? "true" : "false"; + if (p.cardinality != Cardinality::SINGLE) { + p_node["cardinality"] = CardinalityToString(p.cardinality); + } pg_node["properties"].PushBack(); pg_node["properties"][pg_node["properties"].Size() - 1] = p_node; } @@ -574,6 +600,13 @@ class EdgeInfo::Impl { } // check if property name is unique in all property groups for (const auto& p : pg->GetProperties()) { + if (p.cardinality != Cardinality::SINGLE) { + // edge property only supports single cardinality + std::cout + << "Edge property only supports single cardinality, but got: " + << CardinalityToString(p.cardinality) << std::endl; + return false; + } if (check_property_unique_set.find(p.name) != check_property_unique_set.end()) { return false; @@ -910,6 +943,12 @@ Result<std::shared_ptr<EdgeInfo>> EdgeInfo::Load(std::shared_ptr<Yaml> yaml) { auto property_name = p_node["name"].As<std::string>(); auto property_type = DataType::TypeNameToDataType(p_node["data_type"].As<std::string>()); + if (!p_node["cardinality"].IsNone() && + StringToCardinality(p_node["cardinality"].As<std::string>()) != + Cardinality::SINGLE) { + return Status::YamlError( + "Unsupported set cardinality for edge property."); + } bool is_primary = p_node["is_primary"].As<bool>(); bool is_nullable = p_node["is_nullable"].IsNone() || p_node["is_nullable"].As<bool>(); diff --git a/cpp/src/graphar/graph_info.h b/cpp/src/graphar/graph_info.h index c067810c..27a487d4 100644 --- a/cpp/src/graphar/graph_info.h +++ b/cpp/src/graphar/graph_info.h @@ -37,16 +37,20 @@ class Property { std::shared_ptr<DataType> type; // property data type bool is_primary; // primary key tag bool is_nullable; // nullable tag for non-primary key + Cardinality + cardinality; // cardinality of the property, only use in vertex info Property() = default; explicit Property(const std::string& name, const std::shared_ptr<DataType>& type = nullptr, - bool is_primary = false, bool is_nullable = true) + bool is_primary = false, bool is_nullable = true, + Cardinality cardinality = Cardinality::SINGLE) : name(name), type(type), is_primary(is_primary), - is_nullable(!is_primary && is_nullable) {} + is_nullable(!is_primary && is_nullable), + cardinality(cardinality) {} }; bool operator==(const Property& lhs, const Property& rhs); @@ -276,6 +280,8 @@ class VertexInfo { Result<std::shared_ptr<DataType>> GetPropertyType( const std::string& property_name) const; + Result<Cardinality> GetPropertyCardinality( + const std::string& property_name) const; /** * Get whether the vertex info contains the specified property. * diff --git a/cpp/src/graphar/types.h b/cpp/src/graphar/types.h index cb468a20..ac318ca4 100644 --- a/cpp/src/graphar/types.h +++ b/cpp/src/graphar/types.h @@ -23,6 +23,7 @@ #include <memory> #include <string> #include <utility> +#include <vector> #include "graphar/fwd.h" #include "graphar/macros.h" @@ -242,4 +243,42 @@ static inline const char* FileTypeToString(FileType file_type) { return file_type2string.at(file_type); } +static inline Cardinality StringToCardinality(const std::string& str) { + static const std::map<std::string, Cardinality> str2cardinality{ + {"single", Cardinality::SINGLE}, + {"list", Cardinality::LIST}, + {"set", Cardinality::SET}, + }; + try { + return str2cardinality.at(str.c_str()); + } catch (const std::exception& e) { + throw std::runtime_error("KeyError: " + str); + } +} + +static inline const char* CardinalityToString(Cardinality cardinality) { + static const std::map<Cardinality, const char*> cardinality2string{ + {Cardinality::SINGLE, "single"}, + {Cardinality::LIST, "list"}, + {Cardinality::SET, "set"}, + }; + try { + return cardinality2string.at(cardinality); + } catch (const std::exception& e) { + throw std::runtime_error("KeyError: " + + std::to_string(static_cast<int>(cardinality))); + } +} + +// Helper function to split a string by a delimiter +inline std::vector<std::string> SplitString(const std::string& str, + char delimiter) { + std::vector<std::string> tokens; + std::string token; + std::istringstream tokenStream(str); + while (std::getline(tokenStream, token, delimiter)) { + tokens.push_back(token); + } + return tokens; +} } // namespace graphar diff --git a/cpp/test/test_info.cc b/cpp/test/test_info.cc index 82d839e3..9901ea47 100644 --- a/cpp/test/test_info.cc +++ b/cpp/test/test_info.cc @@ -25,6 +25,8 @@ #include "./util.h" #include "graphar/api/info.h" +#include "graphar/fwd.h" +#include "graphar/status.h" #include <catch2/catch_test_macros.hpp> @@ -75,9 +77,9 @@ TEST_CASE_METHOD(GlobalFixture, "Property") { TEST_CASE_METHOD(GlobalFixture, "PropertyGroup") { Property p0("p0", int32(), true); - Property p1("p1", int32(), false); - Property p2("p2", string(), false); - Property p3("p3", float32(), false); + Property p1("p1", int32(), false, true, Cardinality::SINGLE); + Property p2("p2", string(), false, true, Cardinality::LIST); + Property p3("p3", float32(), false, true, Cardinality::SET); Property p4("p4", float64(), false); PropertyGroup pg0({p0, p1}, FileType::CSV, "p0_and_p1/"); @@ -94,6 +96,11 @@ TEST_CASE_METHOD(GlobalFixture, "PropertyGroup") { REQUIRE(p.type->ToTypeName() == int32()->ToTypeName()); REQUIRE(p.is_primary == true); REQUIRE(p.is_nullable == false); + // cardinality + REQUIRE(p0.cardinality == Cardinality::SINGLE); + REQUIRE(p1.cardinality == Cardinality::SINGLE); + REQUIRE(p2.cardinality == Cardinality::LIST); + REQUIRE(p3.cardinality == Cardinality::SET); } SECTION("FileType") { @@ -180,8 +187,10 @@ TEST_CASE_METHOD(GlobalFixture, "VertexInfo") { int chunk_size = 100; auto version = std::make_shared<InfoVersion>(1); auto pg = CreatePropertyGroup( - {Property("p0", int32(), true), Property("p1", string(), false)}, - FileType::CSV, "p0_p1/"); + {Property("p0", int32(), true), + Property("p1", string(), false, true, Cardinality::LIST), + Property("p2", string(), false, true, Cardinality::SET)}, + FileType::PARQUET, "p0_p1/"); auto vertex_info = CreateVertexInfo(type, chunk_size, {pg}, {}, "test_vertex", version); @@ -206,6 +215,12 @@ TEST_CASE_METHOD(GlobalFixture, "VertexInfo") { REQUIRE(vertex_info->IsPrimaryKey("p1") == false); REQUIRE(vertex_info->IsNullableKey("p0") == false); REQUIRE(vertex_info->IsNullableKey("p1") == true); + REQUIRE(vertex_info->GetPropertyCardinality("p0").value() == + Cardinality::SINGLE); + REQUIRE(vertex_info->GetPropertyCardinality("p1").value() == + Cardinality::LIST); + REQUIRE(vertex_info->GetPropertyCardinality("p2").value() == + Cardinality::SET); REQUIRE(vertex_info->HasProperty("not_exist") == false); REQUIRE(vertex_info->IsPrimaryKey("not_exist") == false); REQUIRE(vertex_info->HasPropertyGroup(nullptr) == false); @@ -221,11 +236,17 @@ TEST_CASE_METHOD(GlobalFixture, "VertexInfo") { SECTION("IsValidate") { REQUIRE(vertex_info->IsValidated() == true); - auto invalid_pg = - CreatePropertyGroup({Property("p0", nullptr, true)}, FileType::CSV); + auto invalid_pg = CreatePropertyGroup( + {Property("p0", list(string()), true, false, Cardinality::SET)}, + FileType::CSV); auto invalid_vertex_info0 = CreateVertexInfo(type, chunk_size, {invalid_pg}, {}, "test_vertex/", version); REQUIRE(invalid_vertex_info0->IsValidated() == false); + invalid_pg = + CreatePropertyGroup({Property("p0", nullptr, true)}, FileType::CSV); + invalid_vertex_info0 = CreateVertexInfo(type, chunk_size, {invalid_pg}, {}, + "test_vertex/", version); + REQUIRE(invalid_vertex_info0->IsValidated() == false); VertexInfo invalid_vertex_info1("", chunk_size, {pg}, {}, "test_vertex/", version); REQUIRE(invalid_vertex_info1.IsValidated() == false); @@ -252,17 +273,23 @@ TEST_CASE_METHOD(GlobalFixture, "VertexInfo") { std::string expected = R"(chunk_size: 100 prefix: test_vertex property_groups: - - file_type: csv + - file_type: parquet prefix: p0_p1/ properties: - data_type: int32 is_nullable: false is_primary: true name: p0 - - data_type: string + - cardinality: list + data_type: string is_nullable: true is_primary: false name: p1 + - cardinality: set + data_type: string + is_nullable: true + is_primary: false + name: p2 type: test_vertex version: gar/v1 )"; @@ -280,22 +307,23 @@ version: gar/v1 } SECTION("AddPropertyGroup") { - auto pg2 = CreatePropertyGroup({Property("p2", int32(), false)}, - FileType::CSV, "p2/"); - auto maybe_extend_info = vertex_info->AddPropertyGroup(pg2); + auto pg3 = CreatePropertyGroup({Property("p3", int32(), false)}, + FileType::CSV, "p3/"); + auto maybe_extend_info = vertex_info->AddPropertyGroup(pg3); REQUIRE(maybe_extend_info.status().ok()); auto extend_info = maybe_extend_info.value(); REQUIRE(extend_info->PropertyGroupNum() == 2); REQUIRE(extend_info->HasProperty("p2") == true); - REQUIRE(extend_info->HasPropertyGroup(pg2) == true); + REQUIRE(extend_info->HasPropertyGroup(pg3) == true); REQUIRE(extend_info->GetPropertyGroups().size() == 2); - REQUIRE(*(extend_info->GetPropertyGroups()[1]) == *pg2); - REQUIRE(extend_info->GetPropertyType("p2").value()->ToTypeName() == + REQUIRE(*(extend_info->GetPropertyGroups()[1]) == *pg3); + REQUIRE(extend_info->GetPropertyType("p3").value()->ToTypeName() == int32()->ToTypeName()); - REQUIRE(extend_info->IsPrimaryKey("p2") == false); - REQUIRE(extend_info->IsNullableKey("p2") == true); - auto extend_info2 = extend_info->AddPropertyGroup(pg2); - REQUIRE(!extend_info2.status().ok()); + REQUIRE(extend_info->IsPrimaryKey("p3") == false); + REQUIRE(extend_info->IsNullableKey("p3") == true); + REQUIRE(extend_info->GetPropertyCardinality("p3") == Cardinality::SINGLE); + auto extend_info3 = extend_info->AddPropertyGroup(pg3); + REQUIRE(!extend_info3.status().ok()); } } @@ -388,6 +416,15 @@ TEST_CASE_METHOD(GlobalFixture, "EdgeInfo") { src_chunk_size, dst_chunk_size, directed, {adj_list}, {invalid_pg}, "test_edge/", version); REQUIRE(invalid_edge_info0->IsValidated() == false); + // edgeInfo does not support list/set cardinality + invalid_pg = CreatePropertyGroup( + {Property("p_cardinality", string(), false, true, Cardinality::LIST)}, + FileType::PARQUET); + auto cardinality_invalid_edge_info0 = + CreateEdgeInfo(src_type, edge_type, dst_type, chunk_size, + src_chunk_size, dst_chunk_size, directed, {adj_list}, + {invalid_pg}, "test_edge/", version); + REQUIRE(invalid_edge_info0->IsValidated() == false); for (int i = 0; i < 3; i++) { std::vector<std::string> types = {src_type, edge_type, dst_type}; types[i] = ""; @@ -691,6 +728,13 @@ property_groups: is_primary: true is_nullable: false file_type: parquet + - properties: + - name: email + data_type: string + is_primary: false + is_nullable: true + cardinality: list + file_type: parquet - properties: - name: firstName data_type: string @@ -750,6 +794,10 @@ extra_info: REQUIRE(vertex_info->GetChunkSize() == 100); REQUIRE(vertex_info->GetPrefix() == "vertex/person/"); REQUIRE(vertex_info->version()->ToString() == "gar/v1"); + REQUIRE(vertex_info->GetPropertyCardinality("id").value() == + Cardinality::SINGLE); + REQUIRE(vertex_info->GetPropertyCardinality("email").value() == + Cardinality::LIST); } SECTION("EdgeInfo::Load") { @@ -761,6 +809,36 @@ extra_info: REQUIRE(edge_info->GetDstType() == "person"); } + edge_info_yaml = R"(src_type: person +edge_type: knows +dst_type: person +chunk_size: 1024 +src_chunk_size: 100 +dst_chunk_size: 100 +directed: false +prefix: edge/person_knows_person/ +adj_lists: + - ordered: false + aligned_by: src + file_type: parquet +property_groups: + - file_type: parquet + properties: + - name: creationDate + data_type: string + is_primary: false + is_nullable: true + cardinality: list +version: gar/v1 +)"; + + SECTION("EdgeInfo::Load with cardinality") { + auto maybe_edge_info = EdgeInfo::Load(edge_info_yaml); + REQUIRE(maybe_edge_info.has_error()); + REQUIRE(maybe_edge_info.status().code() == StatusCode::kYamlError); + std::cout << maybe_edge_info.status().message() << std::endl; + } + SECTION("GraphInfo::Load") { auto maybe_graph_info = GraphInfo::Load(graph_info_yaml, "/"); std::cout << maybe_graph_info.status().message() << std::endl; diff --git a/cpp/test/test_multi_label.cc b/cpp/test/test_multi_label.cc index a77b9ff0..1795b211 100644 --- a/cpp/test/test_multi_label.cc +++ b/cpp/test/test_multi_label.cc @@ -17,19 +17,19 @@ * under the License. */ -#include <fstream> +#include <arrow/compute/api.h> +#include <cstddef> #include <iostream> -#include <unordered_map> +#include <memory> +#include <ostream> +#include <string> #include "arrow/api.h" -#include "arrow/csv/api.h" -#include "arrow/filesystem/api.h" -#include "arrow/io/api.h" -#include "graphar/fwd.h" +#include "graphar/arrow/chunk_reader.h" +#include "graphar/arrow/chunk_writer.h" +#include "graphar/graph_info.h" #include "parquet/arrow/writer.h" #include "./util.h" -#include "graphar/api/arrow_reader.h" -#include "graphar/api/high_level_writer.h" #include <catch2/catch_test_macros.hpp> @@ -79,7 +79,7 @@ TEST_CASE_METHOD(GlobalFixture, "test_multi_label_builder") { // write arrow table as parquet chunk auto maybe_writer = - VertexPropertyWriter::Make(vertex_info, test_data_dir + "/ldbc/parquet/"); + VertexPropertyWriter::Make(vertex_info, "/tmp/ldbc/parquet/"); REQUIRE(!maybe_writer.has_error()); auto writer = maybe_writer.value(); REQUIRE(writer->WriteTable(table, 0).ok()); @@ -87,7 +87,7 @@ TEST_CASE_METHOD(GlobalFixture, "test_multi_label_builder") { // read label chunk as arrow table auto maybe_reader = VertexPropertyArrowChunkReader::Make( - graph_info, "organisation", labels, SelectType::LABELS); + vertex_info, labels, "/tmp/ldbc/parquet/"); assert(maybe_reader.status().ok()); auto reader = maybe_reader.value(); assert(reader->seek(0).ok()); diff --git a/cpp/test/test_multi_property.cc b/cpp/test/test_multi_property.cc new file mode 100644 index 00000000..f82fc608 --- /dev/null +++ b/cpp/test/test_multi_property.cc @@ -0,0 +1,161 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include <arrow/compute/api.h> +#include <cstddef> +#include <iostream> +#include <memory> +#include <ostream> +#include <string> +#include "arrow/api.h" +#include "examples/config.h" +#include "graphar/arrow/chunk_reader.h" +#include "graphar/arrow/chunk_writer.h" +#include "graphar/graph_info.h" +#include "graphar/types.h" +#include "parquet/arrow/writer.h" + +#include "./util.h" + +#include <catch2/catch_test_macros.hpp> + +std::shared_ptr<arrow::Table> read_csv_to_table(const std::string& filename) { + arrow::csv::ReadOptions read_options{}; + arrow::csv::ParseOptions parse_options{}; + arrow::csv::ConvertOptions convert_options{}; + + parse_options.delimiter = '|'; + + auto input = + arrow::io::ReadableFile::Open(filename, arrow::default_memory_pool()) + .ValueOrDie(); + + auto reader = arrow::csv::TableReader::Make(arrow::io::default_io_context(), + input, read_options, + parse_options, convert_options) + .ValueOrDie(); + + std::shared_ptr<arrow::Table> table; + table = reader->Read().ValueOrDie(); + + return table; +} + +namespace graphar { +TEST_CASE_METHOD(GlobalFixture, "read from csv file") { + // read labels csv file as arrow table + auto person_table = read_csv_to_table(test_data_dir + "/ldbc/person_0_0.csv"); + auto seed = static_cast<unsigned int>(time(NULL)); + int expected_row = rand_r(&seed) % person_table->num_rows(); + auto person_schema = person_table->schema(); + arrow::MemoryPool* pool = arrow::default_memory_pool(); + auto value_builder = std::make_shared<arrow::StringBuilder>(); + arrow::ListBuilder builder(pool, value_builder); + auto email_col_idx = person_table->schema()->GetFieldIndex("emails"); + std::string expected_emails = + std::static_pointer_cast<arrow::StringArray>( + person_table->column(email_col_idx)->chunk(0)) + ->GetString(expected_row); + for (int64_t chunk_idx = 0; + chunk_idx < person_table->column(email_col_idx)->num_chunks(); + ++chunk_idx) { + auto chunk = person_table->column(email_col_idx)->chunk(chunk_idx); + auto email_column = std::static_pointer_cast<arrow::StringArray>(chunk); + for (int64_t row = 0; row < email_column->length(); ++row) { + auto result = builder.Append(); + ASSERT(result.ok()); + if (email_column->IsValid(row)) { + std::string emails_string = email_column->GetString(row); + auto row_emails = SplitString(emails_string, ';'); + for (const auto& email : row_emails) { + ASSERT(value_builder->Append(email).ok()); + } + } + } + } + std::shared_ptr<arrow::Array> array; + builder.Finish(&array); + auto person_emails_chunked_array = std::make_shared<arrow::ChunkedArray>( + std::vector<std::shared_ptr<arrow::Array>>{array}); + int emailFieldIndex = person_schema->GetFieldIndex("emails"); + person_table = person_table->RemoveColumn(emailFieldIndex).ValueOrDie(); + person_schema = person_schema->RemoveField(emailFieldIndex).ValueOrDie(); + person_schema = + person_schema + ->AddField(person_schema->num_fields(), + arrow::field("emails", arrow::list(arrow::utf8()))) + .ValueOrDie(); + person_table = person_table + ->AddColumn(person_table->num_columns(), + person_schema->fields().back(), + person_emails_chunked_array) + .ValueOrDie(); + auto index = person_schema->GetFieldIndex("emails"); + auto emails_col = person_table->column(index)->chunk(0); + auto result = std::static_pointer_cast<arrow::ListArray>( + emails_col->View(arrow::list(arrow::utf8())).ValueOrDie()); + auto values = std::static_pointer_cast<arrow::StringArray>(result->values()); + int64_t start = result->value_offset(expected_row); + int64_t end = result->value_offset(expected_row + 1); + std::string emails = ""; + for (int64_t i = start; i < end; ++i) { + emails += values->GetString(i); + if (i < end - 1) + emails += ";"; + } + std::cout << "random row: " << expected_row << std::endl; + ASSERT(expected_emails == emails); + // write to parquet file + std::string path = test_data_dir + "/ldbc/parquet/" + "ldbc.graph.yml"; + auto graph_info = graphar::GraphInfo::Load(path).value(); + auto vertex_info = graph_info->GetVertexInfo("person"); + auto maybe_writer = + VertexPropertyWriter::Make(vertex_info, "/tmp/ldbc/parquet/"); + REQUIRE(!maybe_writer.has_error()); + auto writer = maybe_writer.value(); + REQUIRE(writer->WriteTable(person_table, 0).ok()); + REQUIRE(writer->WriteVerticesNum(person_table->num_rows()).ok()); + + auto maybe_reader = VertexPropertyArrowChunkReader::Make( + vertex_info, vertex_info->GetPropertyGroup("emails"), + "/tmp/ldbc/parquet/"); + assert(maybe_reader.status().ok()); + auto reader = maybe_reader.value(); + assert(reader->seek(expected_row).ok()); + auto table_result = reader->GetChunk(); + ASSERT(table_result.status().ok()); + auto table = table_result.value(); + index = table->schema()->GetFieldIndex("emails"); + emails_col = table->column(index)->chunk(0); + result = std::static_pointer_cast<arrow::ListArray>( + emails_col->View(arrow::list(arrow::large_utf8())).ValueOrDie()); + expected_row = expected_row % vertex_info->GetChunkSize(); + auto email_result = + std::static_pointer_cast<arrow::LargeStringArray>(result->value_slice(0)); + emails = ""; + end = email_result->length(); + for (int64_t i = 0; i < end; ++i) { + emails += email_result->GetString(i); + if (i < end - 1) + emails += ";"; + } + std::cout << emails << std::endl; + ASSERT(expected_emails == emails); +} +} // namespace graphar diff --git a/docs/specification/implementation-status.md b/docs/specification/implementation-status.md index 6e634714..6d205ae7 100644 --- a/docs/specification/implementation-status.md +++ b/docs/specification/implementation-status.md @@ -117,6 +117,8 @@ Vertex features: | tag | | | | | | chunk based | ✓ | ✓ | ✓ | ✓ | | property group | ✓ | ✓ | ✓ | ✓ | +| multi-label | ✓ | | ✓ | | +| multi-property | ✓ | | | | :::note --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@graphar.apache.org For additional commands, e-mail: commits-h...@graphar.apache.org