This is an automated email from the ASF dual-hosted git repository.
weibin pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-graphar.git
The following commit(s) were added to refs/heads/main by this push:
new a150281c feat(format): Implement the protocol definition of GraphAr
format (#521)
a150281c is described below
commit a150281cc11132a17cd43c04543274fa36468331
Author: Weibin Zeng <[email protected]>
AuthorDate: Fri Jun 14 16:11:43 2024 +0800
feat(format): Implement the protocol definition of GraphAr format (#521)
## Reason for this PR
Now that the GraphAr format definition is implemented and buf has been
integrated for code generation, it's time to merge the format development
branch into main.
## What changes are included in this PR?
- Implement the protocol definition of the GraphAr format. It differs from
the existing libraries in a few places:
- `label` is renamed to `type`, to distinguish it from the multi-label
support planned for the future.
- The definition is extended with metadata, including the number of
vertices/edges, the number of vertex chunks, the number of edges of each
vertex chunk, etc. This metadata was previously recorded in many separate
files.
- Use buf to generate code from protobuf.
- Add related CI
## Are these changes tested?
yes
## Are there any user-facing changes?
No
---------
Signed-off-by: acezen <[email protected]>
Co-authored-by: Semyon <[email protected]>
---
.github/workflows/format.yml | 59 ++++++++++++++++++++++++++++++++++++++++++++
buf.gen.yaml | 35 ++++++++++++++++++++++++++
buf.yaml | 20 +++++++++++++++
format/README.md | 20 +++++++++++++++
format/adjacent_list.proto | 43 ++++++++++++++++++++++++++++++++
format/edge_info.proto | 51 ++++++++++++++++++++++++++++++++++++++
format/enums.proto | 53 +++++++++++++++++++++++++++++++++++++++
format/graph_info.proto | 47 +++++++++++++++++++++++++++++++++++
format/property_group.proto | 40 ++++++++++++++++++++++++++++++
format/vertex_info.proto | 41 ++++++++++++++++++++++++++++++
10 files changed, 409 insertions(+)
diff --git a/.github/workflows/format.yml b/.github/workflows/format.yml
new file mode 100644
index 00000000..0132b98e
--- /dev/null
+++ b/.github/workflows/format.yml
@@ -0,0 +1,59 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+name: GraphAr Format
+
+on:  # run only when the format definitions or the buf/CI configuration change
+  push:
+    branches:
+      - main
+    paths:
+      - 'format/**'
+      - '.github/workflows/format.yml'
+      - 'buf.gen.yaml'
+      - 'buf.yaml'
+  pull_request:
+    branches:
+      - main
+    paths:
+      - 'format/**'
+      - '.github/workflows/format.yml'
+      - 'buf.gen.yaml'
+      - 'buf.yaml'
+
+concurrency:  # one active run per PR/branch/commit; a newer run cancels the older one
+  group: ${{ github.repository }}-${{ github.event.number || github.head_ref || github.sha }}-${{ github.workflow }}
+  cancel-in-progress: true
+
+jobs:
+  generate:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          repository: ${{ github.event.pull_request.head.repo.full_name }}  # PR head repo; empty on push events
+          ref: ${{ github.event.pull_request.head.ref }}  # PR head branch; empty on push events
+          submodules: false
+          fetch-depth: 0
+
+      - uses: bufbuild/buf-setup-action@v1
+        with:
+          version: "1.32.0"  # same as the minimum buf version stated in format/README.md
+
+      - name: Buf Generate
+        run: buf generate
+
\ No newline at end of file
diff --git a/buf.gen.yaml b/buf.gen.yaml
new file mode 100644
index 00000000..6405baa9
--- /dev/null
+++ b/buf.gen.yaml
@@ -0,0 +1,35 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+version: v2
+managed:
+  enabled: true
+  disable:
+    - file_option: java_package  # keep the java_package declared in each .proto file
+plugins:
+  # Python classes
+  - remote: buf.build/protocolbuffers/python:v27.1
+    out: pyspark/graphar_pyspark/proto/
+  # Python headers for IDEs and MyPy (pinned to the same release as the other plugins)
+  - remote: buf.build/protocolbuffers/pyi:v27.1
+    out: pyspark/graphar_pyspark/proto/
+  # Cpp
+  - remote: buf.build/protocolbuffers/cpp:v27.1
+    out: cpp/proto
+  # Java
+  - remote: buf.build/protocolbuffers/java:v27.1
+    out: maven-projects/info/src/main/java/
diff --git a/buf.yaml b/buf.yaml
new file mode 100644
index 00000000..6b16f8b4
--- /dev/null
+++ b/buf.yaml
@@ -0,0 +1,20 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+version: v2
+modules:
+  - path: format  # directory that holds the .proto definitions
diff --git a/format/README.md b/format/README.md
new file mode 100644
index 00000000..5126835b
--- /dev/null
+++ b/format/README.md
@@ -0,0 +1,20 @@
+# GraphAr Format Specification
+
+This folder contains protocol definitions for the GraphAr format.
+
+## How to generate code
+
+### Prerequisites
+
+- [protoc](https://developers.google.com/protocol-buffers/docs/downloads)
+- [buf](https://buf.build/docs/installation) (version >= 1.32.0)
+
+### Build
+
+The build process is managed by `buf`; run it from the root of the repository.
+
+```bash
+buf generate
+```
+
+For documentation about the format, see the [GraphAr documentation](https://graphar.apache.org/docs/specification/format).
diff --git a/format/adjacent_list.proto b/format/adjacent_list.proto
new file mode 100644
index 00000000..da88ef2a
--- /dev/null
+++ b/format/adjacent_list.proto
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+syntax = "proto3";
+
+package graphar;
+option java_multiple_files = true;
+option java_package = "org.apache.graphar.info.proto";
+
+import "enums.proto";
+
+message AdjacentList { // One adjacency-list description; EdgeInfo holds a repeated list of these.
+ AdjListType type = 1; // one of the AdjListType values declared in enums.proto
+ FileType file_type = 2; // one of the FileType values declared in enums.proto
+ string prefix = 3; // NOTE(review): presumably the storage path prefix of this adjacency list -- confirm
+
+ // Statistics message, including
+ // 1. num_vertices: the number of vertices, based on the AdjListType
+ // 2. num_vertex_chunks: the number of vertex chunks, based on the AdjListType
+ // 3. edge_nums_of_vertex_chunks: the number of edges of each vertex chunk
+ message Statistics {
+ int64 num_vertices = 1;
+ int64 num_vertex_chunks = 2;
+ repeated int64 edge_nums_of_vertex_chunks = 3; // NOTE(review): presumably one entry per vertex chunk -- confirm
+ }
+ optional Statistics statistics = 4; // may be left unset when no statistics are recorded
+};
diff --git a/format/edge_info.proto b/format/edge_info.proto
new file mode 100644
index 00000000..12474545
--- /dev/null
+++ b/format/edge_info.proto
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+syntax = "proto3";
+
+package graphar;
+option java_multiple_files = true;
+option java_package = "org.apache.graphar.info.proto";
+
+import "property_group.proto";
+import "adjacent_list.proto";
+
+message EdgeInfo { // Description of one edge type; GraphInfo.edges is a repeated list of these.
+ string type = 1; // name of the edge type
+ string source_vertex_type = 2; // NOTE(review): presumably matches VertexInfo.type of the source vertices -- confirm
+ string destination_vertex_type = 3; // NOTE(review): presumably matches VertexInfo.type of the destination vertices -- confirm
+ int64 chunk_size = 4; // NOTE(review): presumably the number of edges per chunk -- confirm
+ int64 source_vertex_chunk_size = 5;
+ int64 destination_vertex_chunk_size = 6;
+ repeated AdjacentList adjacent_list = 7; // message defined in adjacent_list.proto
+ repeated PropertyGroup properties = 8; // message defined in property_group.proto
+ bool is_directed = 9;
+ string prefix = 10; // NOTE(review): presumably the storage path prefix of this edge type -- confirm
+
+ // Statistics message of the edge, including
+ // 1. num_edges: the number of edges
+ // 2. num_source_vertices: the number of source vertices
+ // 3. num_destination_vertices: the number of destination vertices
+ message Statistics {
+ int64 num_edges = 1;
+ int64 num_source_vertices = 2;
+ int64 num_destination_vertices = 3;
+ }
+ optional Statistics statistics = 11; // may be left unset when no statistics are recorded
+};
diff --git a/format/enums.proto b/format/enums.proto
new file mode 100644
index 00000000..234b9e86
--- /dev/null
+++ b/format/enums.proto
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+syntax = "proto3";
+
+package graphar;
+option java_multiple_files = true;
+option java_package = "org.apache.graphar.info.proto";
+
+enum DataType { // Value type of a property; used by Property.type in property_group.proto.
+ BOOL = 0; // NOTE(review): 0 is the proto3 default, so an unset DataType field reads back as BOOL
+ INT32 = 1;
+ INT64 = 2;
+ FLOAT = 3;
+ DOUBLE = 4;
+ STRING = 5;
+ LIST = 6;
+ DATE = 7;
+ TIMESTAMP = 8;
+ TIME = 9;
+};
+
+enum FileType { // File format; used by PropertyGroup.file_type and AdjacentList.file_type.
+ CSV = 0; // NOTE(review): 0 is the proto3 default, so an unset FileType field reads back as CSV
+ PARQUET = 1;
+ ORC = 2;
+ JSON = 3;
+ AVRO = 4;
+ HDF5 = 5;
+};
+
+enum AdjListType { // Kind of adjacency list; used by AdjacentList.type in adjacent_list.proto.
+ UNORDERED_BY_SOURCE = 0; // NOTE(review): 0 is the proto3 default, so an unset AdjListType field reads back as this value
+ UNORDERED_BY_TARGET = 1;
+ ORDERED_BY_SOURCE = 2;
+ ORDERED_BY_TARGET = 3;
+};
diff --git a/format/graph_info.proto b/format/graph_info.proto
new file mode 100644
index 00000000..7d11c956
--- /dev/null
+++ b/format/graph_info.proto
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+syntax = "proto3";
+
+package graphar;
+option java_multiple_files = true;
+option java_package = "org.apache.graphar.info.proto";
+
+import "vertex_info.proto";
+import "edge_info.proto";
+
+message GraphInfo { // Top-level description of a graph: its name plus its vertex and edge type descriptions.
+ string name = 1;
+ repeated VertexInfo vertices = 2; // message defined in vertex_info.proto
+ repeated EdgeInfo edges = 3; // message defined in edge_info.proto
+ string prefix = 4; // NOTE(review): presumably the storage path prefix of the graph -- confirm
+
+ // Statistics of the graph, including the number of vertices and edges
+ message Statistics {
+ int64 num_vertices = 1;
+ int64 num_edges = 2;
+ }
+ optional Statistics statistics = 5; // may be left unset when no statistics are recorded
+
+ message KeyValue { // one free-form string key/value pair
+ string key = 1;
+ string value = 2;
+ }
+ repeated KeyValue key_value_metadata = 6; // list of KeyValue pairs; NOTE(review): key uniqueness is not enforced by the schema
+};
diff --git a/format/property_group.proto b/format/property_group.proto
new file mode 100644
index 00000000..23c28bcc
--- /dev/null
+++ b/format/property_group.proto
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+syntax = "proto3";
+
+package graphar;
+option java_multiple_files = true;
+option java_package = "org.apache.graphar.info.proto";
+
+import "enums.proto";
+
+message Property { // A single named, typed property; PropertyGroup holds a repeated list of these.
+ string name = 1;
+ DataType type = 2; // one of the DataType values declared in enums.proto
+ bool is_primary_key = 3;
+ bool is_nullable = 4;
+ string prefix = 5; // NOTE(review): presumably a storage path prefix for this property -- confirm
+};
+
+message PropertyGroup { // A list of properties sharing one file type and prefix; used by VertexInfo and EdgeInfo.
+ repeated Property properties = 1;
+ FileType file_type = 2; // one of the FileType values declared in enums.proto
+ string prefix = 3; // NOTE(review): presumably the storage path prefix of this group -- confirm
+};
diff --git a/format/vertex_info.proto b/format/vertex_info.proto
new file mode 100644
index 00000000..99ec482f
--- /dev/null
+++ b/format/vertex_info.proto
@@ -0,0 +1,41 @@
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+syntax = "proto3";
+
+package graphar;
+option java_multiple_files = true;
+option java_package = "org.apache.graphar.info.proto";
+
+import "property_group.proto";
+
+message VertexInfo { // Description of one vertex type; GraphInfo.vertices is a repeated list of these.
+ string type = 1; // name of the vertex type
+ int64 chunk_size = 2; // NOTE(review): presumably the number of vertices per chunk -- confirm
+ repeated PropertyGroup properties = 3; // message defined in property_group.proto
+ string prefix = 4; // NOTE(review): presumably the storage path prefix of this vertex type -- confirm
+
+ // Statistics message, including the number of vertices and chunks
+ // of this type of vertex
+ message Statistics {
+ int64 num_vertices = 1;
+ int64 num_chunks = 2;
+ }
+ optional Statistics statistics = 5; // may be left unset when no statistics are recorded
+};
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]