This is an automated email from the ASF dual-hosted git repository.

valentyn pushed a commit to branch 3.5-dev
in repository https://gitbox.apache.org/repos/asf/tinkerpop.git


The following commit(s) were added to refs/heads/3.5-dev by this push:
     new f27278d24c [TINKERPOP-2941] Stop deleting the output location if it is 
not empty
     new d8cff12eb0 Merge pull request #2053 from ministat/hj_TINKERPOP-2941
f27278d24c is described below

commit f27278d24c6d4868cff70109897e238be1c68a82
Author: Hongjiang Zhang <[email protected]>
AuthorDate: Thu May 4 12:20:22 2023 +0800

    [TINKERPOP-2941] Stop deleting the output location if it is not empty
    
    In a production environment, it is dangerous to delete a folder that is
    not empty. To avoid any accidental destructive operations, it is better
    to ask the user to delete it manually if it is not empty.
---
 CHANGELOG.asciidoc                                 |  1 +
 .../apache/tinkerpop/gremlin/hadoop/Constants.java |  1 +
 .../spark/process/computer/SparkGraphComputer.java | 16 +++++++-
 .../process/computer/SparkHadoopGraphProvider.java | 10 ++++-
 .../SparkCloneVertexProgramInterceptorTest.java    | 46 ++++++++++++++++++++++
 5 files changed, 71 insertions(+), 3 deletions(-)

diff --git a/CHANGELOG.asciidoc b/CHANGELOG.asciidoc
index cd68844925..530b05f107 100644
--- a/CHANGELOG.asciidoc
+++ b/CHANGELOG.asciidoc
@@ -24,6 +24,7 @@ 
image::https://raw.githubusercontent.com/apache/tinkerpop/master/docs/static/ima
 === TinkerPop 3.5.7 (Release Date: NOT OFFICIALLY RELEASED YET)
 
 * Fixed a memory leak in the Gremlin.Net driver that only occurred if a 
CancellationToken was provided.
+* Added `gremlin.spark.dontDeleteNonEmptyOutput` to stop deleting the output 
folder if it is not empty in `spark-gremlin`.
 
 [[release-3-5-6]]
 === TinkerPop 3.5.6 (Release Date: May 1, 2023)
diff --git 
a/hadoop-gremlin/src/main/java/org/apache/tinkerpop/gremlin/hadoop/Constants.java
 
b/hadoop-gremlin/src/main/java/org/apache/tinkerpop/gremlin/hadoop/Constants.java
index 7519b8ffd7..eb2f94cd53 100644
--- 
a/hadoop-gremlin/src/main/java/org/apache/tinkerpop/gremlin/hadoop/Constants.java
+++ 
b/hadoop-gremlin/src/main/java/org/apache/tinkerpop/gremlin/hadoop/Constants.java
@@ -73,6 +73,7 @@ public final class Constants {
     public static final String GREMLIN_SPARK_PERSIST_STORAGE_LEVEL = 
"gremlin.spark.persistStorageLevel";
     public static final String GREMLIN_SPARK_SKIP_PARTITIONER = 
"gremlin.spark.skipPartitioner"; // don't partition the loadedGraphRDD
     public static final String GREMLIN_SPARK_SKIP_GRAPH_CACHE = 
"gremlin.spark.skipGraphCache";  // don't cache the loadedGraphRDD (ignores 
graphStorageLevel)
+    public static final String GREMLIN_SPARK_DONT_DELETE_NON_EMPTY_OUTPUT = 
"gremlin.spark.dontDeleteNonEmptyOutput"; // don't delete the output if it is 
not empty
     public static final String SPARK_SERIALIZER = "spark.serializer";
     public static final String SPARK_KRYO_REGISTRATOR = 
"spark.kryo.registrator";
     public static final String SPARK_KRYO_REGISTRATION_REQUIRED = 
"spark.kryo.registrationRequired";
diff --git 
a/spark-gremlin/src/main/java/org/apache/tinkerpop/gremlin/spark/process/computer/SparkGraphComputer.java
 
b/spark-gremlin/src/main/java/org/apache/tinkerpop/gremlin/spark/process/computer/SparkGraphComputer.java
index 62171aed2d..5f455837eb 100644
--- 
a/spark-gremlin/src/main/java/org/apache/tinkerpop/gremlin/spark/process/computer/SparkGraphComputer.java
+++ 
b/spark-gremlin/src/main/java/org/apache/tinkerpop/gremlin/spark/process/computer/SparkGraphComputer.java
@@ -315,10 +315,22 @@ public final class SparkGraphComputer extends 
AbstractHadoopGraphComputer {
 
             SparkMemory memory = null;
             // delete output location
+            final boolean dontDeleteNonEmptyOutput =
+                    
graphComputerConfiguration.getBoolean(Constants.GREMLIN_SPARK_DONT_DELETE_NON_EMPTY_OUTPUT,
 false);
             final String outputLocation = 
hadoopConfiguration.get(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION, null);
             if (null != outputLocation) {
-                if (outputToHDFS && fileSystemStorage.exists(outputLocation))
-                    fileSystemStorage.rm(outputLocation);
+                if (outputToHDFS && fileSystemStorage.exists(outputLocation)) {
+                    if (dontDeleteNonEmptyOutput) {
+                        // DON'T delete the content if the folder is not empty
+                        if (fileSystemStorage.ls(outputLocation).size() == 0) {
+                            fileSystemStorage.rm(outputLocation);
+                        } else {
+                            throw new IllegalStateException("The output 
location '" + outputLocation + "' is not empty");
+                        }
+                    } else {
+                        fileSystemStorage.rm(outputLocation);
+                    }
+                }
                 if (outputToSpark && 
sparkContextStorage.exists(outputLocation))
                     sparkContextStorage.rm(outputLocation);
             }
diff --git 
a/spark-gremlin/src/test/java/org/apache/tinkerpop/gremlin/spark/process/computer/SparkHadoopGraphProvider.java
 
b/spark-gremlin/src/test/java/org/apache/tinkerpop/gremlin/spark/process/computer/SparkHadoopGraphProvider.java
index 6c1efb1af5..c428081fd3 100644
--- 
a/spark-gremlin/src/test/java/org/apache/tinkerpop/gremlin/spark/process/computer/SparkHadoopGraphProvider.java
+++ 
b/spark-gremlin/src/test/java/org/apache/tinkerpop/gremlin/spark/process/computer/SparkHadoopGraphProvider.java
@@ -30,6 +30,7 @@ import 
org.apache.tinkerpop.gremlin.hadoop.structure.HadoopGraph;
 import org.apache.tinkerpop.gremlin.hadoop.structure.HadoopProperty;
 import org.apache.tinkerpop.gremlin.hadoop.structure.HadoopVertex;
 import org.apache.tinkerpop.gremlin.hadoop.structure.HadoopVertexProperty;
+import org.apache.tinkerpop.gremlin.hadoop.structure.io.FileSystemStorage;
 import org.apache.tinkerpop.gremlin.hadoop.structure.io.FileSystemStorageCheck;
 import org.apache.tinkerpop.gremlin.hadoop.structure.io.HadoopPools;
 import 
org.apache.tinkerpop.gremlin.hadoop.structure.io.graphson.GraphSONInputFormat;
@@ -108,7 +109,14 @@ public class SparkHadoopGraphProvider extends 
AbstractFileGraphProvider {
             put(Graph.GRAPH, HadoopGraph.class.getName());
             put(Constants.GREMLIN_HADOOP_GRAPH_READER, graphSONInput ? 
GraphSONInputFormat.class.getCanonicalName() : 
GryoInputFormat.class.getCanonicalName());
             put(Constants.GREMLIN_HADOOP_GRAPH_WRITER, 
GryoOutputFormat.class.getCanonicalName());
-            put(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION, 
getWorkingDirectory());
+            // clear the output location if it is not empty
+            String outputLocation = getWorkingDirectory();
+            FileSystemStorage fileSystemStorage = FileSystemStorage.open();
+            if (fileSystemStorage.ls(outputLocation).size() > 0) {
+                fileSystemStorage.rm(outputLocation);
+                outputLocation = getWorkingDirectory();
+            }
+            put(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION, outputLocation);
             put(Constants.GREMLIN_HADOOP_JARS_IN_DISTRIBUTED_CACHE, false);
 
             put(Constants.GREMLIN_SPARK_PERSIST_CONTEXT, true);  // this makes 
the test suite go really fast
diff --git 
a/spark-gremlin/src/test/java/org/apache/tinkerpop/gremlin/spark/process/computer/traversal/strategy/optimization/interceptor/SparkCloneVertexProgramInterceptorTest.java
 
b/spark-gremlin/src/test/java/org/apache/tinkerpop/gremlin/spark/process/computer/traversal/strategy/optimization/interceptor/SparkCloneVertexProgramInterceptorTest.java
index cd57d42588..3ac436970e 100644
--- 
a/spark-gremlin/src/test/java/org/apache/tinkerpop/gremlin/spark/process/computer/traversal/strategy/optimization/interceptor/SparkCloneVertexProgramInterceptorTest.java
+++ 
b/spark-gremlin/src/test/java/org/apache/tinkerpop/gremlin/spark/process/computer/traversal/strategy/optimization/interceptor/SparkCloneVertexProgramInterceptorTest.java
@@ -94,4 +94,50 @@ public class SparkCloneVertexProgramInterceptorTest extends 
AbstractSparkTest {
         assertEquals(totalV, totalVRef);
         assertEquals(totalE, totalERef);
     }
+
+    @Test
+    public void shouldStopPurgingOfExistingNonEmptyFolder() throws Exception {
+        // Build the random graph
+        final TinkerGraph randomGraph = TinkerGraph.open();
+        final int totalVertices = 200000;
+        TestHelper.createRandomGraph(randomGraph, totalVertices, 100);
+        final String inputLocation = 
TestHelper.makeTestDataFile(GryoSerializerIntegrateTest.class,
+                UUID.randomUUID().toString(),
+                "random-graph.kryo");
+        randomGraph.io(IoCore.gryo()).writeGraph(inputLocation);
+        randomGraph.clear();
+        randomGraph.close();
+
+        // Serialize the graph to disk by CloneVertexProgram + 
SparkGraphComputer
+        final String outputLocation = 
TestHelper.makeTestDataDirectory(GryoSerializerIntegrateTest.class, 
UUID.randomUUID().toString());
+        Configuration configuration1 = getBaseConfiguration();
+        configuration1.clearProperty(Constants.SPARK_SERIALIZER); // ensure 
proper default to GryoSerializer
+        configuration1.setProperty(Constants.GREMLIN_HADOOP_INPUT_LOCATION, 
inputLocation);
+        configuration1.setProperty(Constants.GREMLIN_HADOOP_GRAPH_READER, 
GryoInputFormat.class.getCanonicalName());
+        configuration1.setProperty(Constants.GREMLIN_HADOOP_GRAPH_WRITER, 
GryoOutputFormat.class.getCanonicalName());
+        configuration1.setProperty(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION, 
outputLocation);
+        configuration1.setProperty(Constants.GREMLIN_SPARK_PERSIST_CONTEXT, 
false);
+        Graph graph = GraphFactory.open(configuration1);
+        
graph.compute(SparkGraphComputer.class).program(CloneVertexProgram.build().create()).submit().get();
+
+        // Read the total Vertex/Edge count for golden reference through the 
original graph
+        Configuration configuration2 = getBaseConfiguration();
+        configuration2.clearProperty(Constants.SPARK_SERIALIZER); // ensure 
proper default to GryoSerializer
+        configuration2.setProperty(Constants.GREMLIN_HADOOP_INPUT_LOCATION, 
inputLocation);
+        configuration2.setProperty(Constants.GREMLIN_HADOOP_GRAPH_READER, 
GryoInputFormat.class.getCanonicalName());
+        configuration2.setProperty(Constants.GREMLIN_HADOOP_GRAPH_WRITER, 
NullOutputFormat.class.getCanonicalName());
+        configuration2.setProperty(Constants.GREMLIN_SPARK_PERSIST_CONTEXT, 
false);
+        
configuration2.setProperty(Constants.GREMLIN_SPARK_DONT_DELETE_NON_EMPTY_OUTPUT,
 true);
+        graph = GraphFactory.open(configuration2);
+        long totalVRef = 
graph.traversal().withComputer(SparkGraphComputer.class).V().count().next().longValue();
+
+        assertEquals(totalVRef, totalVertices);
+        // Should see exception if reuse the previous outputLocation which is 
not empty
+        graph = GraphFactory.open(configuration1);
+        try {
+            
graph.traversal().withComputer(SparkGraphComputer.class).E().count().next().longValue();
+        } catch (Exception e) {
+            assertTrue(e.getMessage().contains("is not empty"));
+        }
+    }
 }

Reply via email to