(incubator-graphar) branch main updated: fix(spark): Fix the LdbcSample2GraphAr result and use csv as output (#528)

weibin Thu, 20 Jun 2024 00:39:17 -0700

This is an automated email from the ASF dual-hosted git repository.

weibin pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-graphar.git



The following commit(s) were added to refs/heads/main by this push:
     new d6ce836a fix(spark): Fix the LdbcSample2GraphAr result and use csv as output (#528)
d6ce836a is described below

commit d6ce836a3d53843af1d1098d920a35ffca07880b
Author: Weibin Zeng <[email protected]>
AuthorDate: Thu Jun 20 15:38:56 2024 +0800

    fix(spark): Fix the LdbcSample2GraphAr result and use csv as output (#528)
    
    
    
    Signed-off-by: acezen <[email protected]>
---
 .../org/apache/graphar/example/LdbcSample2GraphAr.scala   | 15 +++------------
 .../src/main/scala/org/apache/graphar/util/Utils.scala    | 11 ++++++-----
 maven-projects/spark/scripts/run-ldbc-sample2graphar.sh   |  2 +-
 3 files changed, 10 insertions(+), 18 deletions(-)

diff --git a/maven-projects/spark/graphar/src/main/scala/org/apache/graphar/example/LdbcSample2GraphAr.scala b/maven-projects/spark/graphar/src/main/scala/org/apache/graphar/example/LdbcSample2GraphAr.scala
index fe4a16b9..eb5a63f0 100644
--- a/maven-projects/spark/graphar/src/main/scala/org/apache/graphar/example/LdbcSample2GraphAr.scala
+++ b/maven-projects/spark/graphar/src/main/scala/org/apache/graphar/example/LdbcSample2GraphAr.scala
@@ -92,23 +92,14 @@ object LdbcSample2GraphAr {
     writer.PutVertexData("Person", person_df)
 
     // read edges with type "Person"->"Knows"->"Person" from given path as a DataFrame
-    // FIXME(@acezen): the schema should be inferred from the data, but graphar spark
-    // library does not support timestamp type yet
-    val schema = StructType(
-      Array(
-        StructField("src", IntegerType, true),
-        StructField("dst", IntegerType, true),
-        StructField("creationDate", StringType, true)
-      )
-    )
-    val produced_edge_df = spark.read
+    val knows_edge_df = spark.read
       .option("delimiter", "|")
       .option("header", "true")
-      .schema(schema)
+      .option("inferSchema", "true")
       .format("csv")
       .load(personKnowsPersonInputPath)
     // put into writer, source vertex label is "Person", edge label is "Knows"
     // target vertex label is "Person"
-    writer.PutEdgeData(("Person", "Knows", "Person"), produced_edge_df)
+    writer.PutEdgeData(("Person", "Knows", "Person"), knows_edge_df)
   }
 }
diff --git a/maven-projects/spark/graphar/src/main/scala/org/apache/graphar/util/Utils.scala b/maven-projects/spark/graphar/src/main/scala/org/apache/graphar/util/Utils.scala
index 2c0b1e8e..a85f7ee5 100644
--- a/maven-projects/spark/graphar/src/main/scala/org/apache/graphar/util/Utils.scala
+++ b/maven-projects/spark/graphar/src/main/scala/org/apache/graphar/util/Utils.scala
@@ -56,11 +56,12 @@ object Utils {
   def sparkDataType2GraphArTypeName(dataType: DataType): String = {
     val typeName = dataType.typeName
     val grapharTypeName = typeName match {
-      case "string"  => "string"
-      case "integer" => "int"
-      case "long"    => "int64"
-      case "double"  => "double"
-      case "boolean" => "bool"
+      case "string"    => "string"
+      case "integer"   => "int"
+      case "long"      => "int64"
+      case "double"    => "double"
+      case "boolean"   => "bool"
+      case "timestamp" => "timestamp"
       case _ =>
         throw new IllegalArgumentException(
           "Expected string, integral, double or boolean type, got " + typeName + " type"
diff --git a/maven-projects/spark/scripts/run-ldbc-sample2graphar.sh b/maven-projects/spark/scripts/run-ldbc-sample2graphar.sh
index 42f55552..d6b268f1 100755
--- a/maven-projects/spark/scripts/run-ldbc-sample2graphar.sh
+++ b/maven-projects/spark/scripts/run-ldbc-sample2graphar.sh
@@ -28,6 +28,6 @@ output_dir="/tmp/graphar/ldbc_sample"
 
 vertex_chunk_size=100
 edge_chunk_size=1024
-file_type="parquet"
+file_type="csv"
 spark-submit --class org.apache.graphar.example.LdbcSample2GraphAr ${jar_file} \
     ${person_input_file} ${person_knows_person_input_file} ${output_dir} ${vertex_chunk_size} ${edge_chunk_size} ${file_type}


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(incubator-graphar) branch main updated: fix(spark): Fix the LdbcSample2GraphAr result and use csv as output (#528)

Reply via email to