This is an automated email from the ASF dual-hosted git repository.
weibin pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-graphar.git
The following commit(s) were added to refs/heads/main by this push:
new d6ce836a fix(spark): Fix the LdbcSample2GraphAr result and use csv as output (#528)
d6ce836a is described below
commit d6ce836a3d53843af1d1098d920a35ffca07880b
Author: Weibin Zeng <[email protected]>
AuthorDate: Thu Jun 20 15:38:56 2024 +0800
fix(spark): Fix the LdbcSample2GraphAr result and use csv as output (#528)
Signed-off-by: acezen <[email protected]>
---
.../org/apache/graphar/example/LdbcSample2GraphAr.scala | 15 +++------------
.../src/main/scala/org/apache/graphar/util/Utils.scala | 11 ++++++-----
maven-projects/spark/scripts/run-ldbc-sample2graphar.sh | 2 +-
3 files changed, 10 insertions(+), 18 deletions(-)
diff --git a/maven-projects/spark/graphar/src/main/scala/org/apache/graphar/example/LdbcSample2GraphAr.scala b/maven-projects/spark/graphar/src/main/scala/org/apache/graphar/example/LdbcSample2GraphAr.scala
index fe4a16b9..eb5a63f0 100644
--- a/maven-projects/spark/graphar/src/main/scala/org/apache/graphar/example/LdbcSample2GraphAr.scala
+++ b/maven-projects/spark/graphar/src/main/scala/org/apache/graphar/example/LdbcSample2GraphAr.scala
@@ -92,23 +92,14 @@ object LdbcSample2GraphAr {
writer.PutVertexData("Person", person_df)
// read edges with type "Person"->"Knows"->"Person" from given path as a DataFrame
- // FIXME(@acezen): the schema should be inferred from the data, but graphar spark
- // library does not support timestamp type yet
- val schema = StructType(
- Array(
- StructField("src", IntegerType, true),
- StructField("dst", IntegerType, true),
- StructField("creationDate", StringType, true)
- )
- )
- val produced_edge_df = spark.read
+ val knows_edge_df = spark.read
.option("delimiter", "|")
.option("header", "true")
- .schema(schema)
+ .option("inferSchema", "true")
.format("csv")
.load(personKnowsPersonInputPath)
// put into writer, source vertex label is "Person", edge label is "Knows"
// target vertex label is "Person"
- writer.PutEdgeData(("Person", "Knows", "Person"), produced_edge_df)
+ writer.PutEdgeData(("Person", "Knows", "Person"), knows_edge_df)
}
}
diff --git a/maven-projects/spark/graphar/src/main/scala/org/apache/graphar/util/Utils.scala b/maven-projects/spark/graphar/src/main/scala/org/apache/graphar/util/Utils.scala
index 2c0b1e8e..a85f7ee5 100644
--- a/maven-projects/spark/graphar/src/main/scala/org/apache/graphar/util/Utils.scala
+++ b/maven-projects/spark/graphar/src/main/scala/org/apache/graphar/util/Utils.scala
@@ -56,11 +56,12 @@ object Utils {
def sparkDataType2GraphArTypeName(dataType: DataType): String = {
val typeName = dataType.typeName
val grapharTypeName = typeName match {
- case "string" => "string"
- case "integer" => "int"
- case "long" => "int64"
- case "double" => "double"
- case "boolean" => "bool"
+ case "string" => "string"
+ case "integer" => "int"
+ case "long" => "int64"
+ case "double" => "double"
+ case "boolean" => "bool"
+ case "timestamp" => "timestamp"
case _ =>
throw new IllegalArgumentException(
"Expected string, integral, double or boolean type, got " + typeName + " type"
diff --git a/maven-projects/spark/scripts/run-ldbc-sample2graphar.sh b/maven-projects/spark/scripts/run-ldbc-sample2graphar.sh
index 42f55552..d6b268f1 100755
--- a/maven-projects/spark/scripts/run-ldbc-sample2graphar.sh
+++ b/maven-projects/spark/scripts/run-ldbc-sample2graphar.sh
@@ -28,6 +28,6 @@ output_dir="/tmp/graphar/ldbc_sample"
vertex_chunk_size=100
edge_chunk_size=1024
-file_type="parquet"
+file_type="csv"
spark-submit --class org.apache.graphar.example.LdbcSample2GraphAr ${jar_file} \
${person_input_file} ${person_knows_person_input_file} ${output_dir} ${vertex_chunk_size} ${edge_chunk_size} ${file_type}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]