yunfengzhou-hub commented on a change in pull request #68:
URL: https://github.com/apache/flink-ml/pull/68#discussion_r820343180



##########
File path: 
flink-ml-core/src/test/java/org/apache/flink/ml/util/ReadWriteUtilsTest.java
##########
@@ -0,0 +1,92 @@
+package org.apache.flink.ml.util;
+
+import org.apache.flink.api.common.restartstrategy.RestartStrategies;
+import org.apache.flink.core.fs.FileSystem;
+import org.apache.flink.core.fs.Path;
+import org.apache.flink.ml.api.*;
+import org.apache.flink.ml.builder.PipelineModel;
+import 
org.apache.flink.streaming.api.environment.ExecutionCheckpointingOptions;
+import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
+import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
+import org.apache.flink.test.util.AbstractTestBase;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileUtil;
+import org.apache.hadoop.hdfs.MiniDFSCluster;
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+import java.io.File;
+import java.io.IOException;
+import java.nio.file.Files;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+
+public class ReadWriteUtilsTest extends AbstractTestBase {
+    private StreamExecutionEnvironment env;
+    private StreamTableEnvironment tEnv;
+    private File baseDir;
+    private MiniDFSCluster hdfsCluster;
+
+    @Before
+    public void before() throws IOException {
+        org.apache.flink.configuration.Configuration config = new 
org.apache.flink.configuration.Configuration();
+        
config.set(ExecutionCheckpointingOptions.ENABLE_CHECKPOINTS_AFTER_TASKS_FINISH, 
true);
+        env = StreamExecutionEnvironment.getExecutionEnvironment(config);
+        env.setParallelism(4);
+        env.enableCheckpointing(100);
+        env.setRestartStrategy(RestartStrategies.noRestart());
+        tEnv = StreamTableEnvironment.create(env);
+
+        baseDir = 
Files.createTempDirectory("test_hdfs").toFile().getAbsoluteFile();
+        Configuration conf = new Configuration();
+        conf.set(MiniDFSCluster.HDFS_MINIDFS_BASEDIR, 
baseDir.getAbsolutePath());
+        MiniDFSCluster.Builder builder = new MiniDFSCluster.Builder(conf);
+        hdfsCluster = builder.build();
+
+    }
+
+    @After
+    public void after() {
+        hdfsCluster.shutdown();
+        FileUtil.fullyDelete(baseDir);

Review comment:
       Flink ML unit tests have been using `org.junit.rules.TemporaryFolder` to 
create directories used in tests, because these directories will be 
automatically deleted when the test cases finish. I recommend that we also 
follow that convention in this PR. You can refer to other tests like 
`KMeansTest` for the usage of this class.

##########
File path: 
flink-ml-core/src/test/java/org/apache/flink/ml/util/ReadWriteUtilsTest.java
##########
@@ -0,0 +1,92 @@
+package org.apache.flink.ml.util;
+
+import org.apache.flink.api.common.restartstrategy.RestartStrategies;
+import org.apache.flink.core.fs.FileSystem;

Review comment:
       Dependencies like this are actually unused in this program. It would be 
better if we could remove such dependencies.

##########
File path: 
flink-ml-core/src/main/java/org/apache/flink/ml/util/ReadWriteUtils.java
##########
@@ -186,7 +174,7 @@ private static String getDataPath(String path) {
     private static String getPathForPipelineStage(int stageIdx, int numStages, 
String parentPath) {
         String format = String.format("%%0%dd", 
String.valueOf(numStages).length());
         String fileName = String.format(format, stageIdx);
-        return Paths.get(parentPath, "stages", fileName).toString();
+        return new Path(parentPath, new Path("stages", fileName)).toString();

Review comment:
       I understand that `Paths.get()` might not work in hdfs and thus should 
be replaced. Do we have an even better choice than `new Path(parentPath, new 
Path("stages", fileName))`? It seems that we would need N-1 `new Path()` in 
order to join N strings, which might not be good enough.

##########
File path: 
flink-ml-core/src/test/java/org/apache/flink/ml/util/ReadWriteUtilsTest.java
##########
@@ -0,0 +1,92 @@
+package org.apache.flink.ml.util;
+
+import org.apache.flink.api.common.restartstrategy.RestartStrategies;
+import org.apache.flink.core.fs.FileSystem;
+import org.apache.flink.core.fs.Path;
+import org.apache.flink.ml.api.*;
+import org.apache.flink.ml.builder.PipelineModel;
+import 
org.apache.flink.streaming.api.environment.ExecutionCheckpointingOptions;
+import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
+import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
+import org.apache.flink.test.util.AbstractTestBase;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileUtil;
+import org.apache.hadoop.hdfs.MiniDFSCluster;
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+import java.io.File;
+import java.io.IOException;
+import java.nio.file.Files;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+
+public class ReadWriteUtilsTest extends AbstractTestBase {

Review comment:
       Javadoc for public classes is required. You can refer to other test 
classes for general conventions of unit tests' documents.

##########
File path: flink-ml-core/pom.xml
##########
@@ -31,6 +31,10 @@ under the License.
   <artifactId>flink-ml-core_${scala.binary.version}</artifactId>
   <name>Flink ML : Core</name>
 
+  <properties>
+    <hadoop.version>2.4.1</hadoop.version>

Review comment:
       I noticed that Hadoop's current latest version is 3.3.x. Why do we add 
support for an older version of Hadoop?

##########
File path: flink-ml-core/pom.xml
##########
@@ -108,5 +112,83 @@ under the License.
       <type>jar</type>
       <scope>test</scope>
     </dependency>
+
+    <!-- hdfs is required for the data cache test -->
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-hdfs</artifactId>
+      <scope>test</scope>
+      <version>${hadoop.version}</version>
+      <exclusions>
+        <exclusion>
+          <groupId>log4j</groupId>
+          <artifactId>log4j</artifactId>
+        </exclusion>
+      </exclusions>
+    </dependency>
+
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-common</artifactId>
+      <scope>test</scope>
+      <version>${hadoop.version}</version>
+      <exclusions>
+        <exclusion>
+          <groupId>log4j</groupId>
+          <artifactId>log4j</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.slf4j</groupId>
+          <artifactId>slf4j-log4j12</artifactId>
+        </exclusion>
+        <exclusion>
+          <!-- This dependency is no longer shipped with the JDK since Java 
9.-->
+          <groupId>jdk.tools</groupId>
+          <artifactId>jdk.tools</artifactId>
+        </exclusion>
+      </exclusions>
+    </dependency>
+
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-hdfs</artifactId>
+      <scope>test</scope>
+      <type>test-jar</type>

Review comment:
       I noticed that you added two dependencies for `hadoop-hdfs` and 
`hadoop-common`, and the difference between them is this 
`<type>test-jar</type>`. Could you please explain why we need two 
separate dependencies? I tried removing either of them, and the resulting 
project can still execute the unit tests.

##########
File path: flink-ml-core/pom.xml
##########
@@ -108,5 +112,83 @@ under the License.
       <type>jar</type>
       <scope>test</scope>
     </dependency>
+
+    <!-- hdfs is required for the data cache test -->
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-hdfs</artifactId>
+      <scope>test</scope>
+      <version>${hadoop.version}</version>
+      <exclusions>
+        <exclusion>
+          <groupId>log4j</groupId>
+          <artifactId>log4j</artifactId>
+        </exclusion>
+      </exclusions>
+    </dependency>
+
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-common</artifactId>
+      <scope>test</scope>
+      <version>${hadoop.version}</version>
+      <exclusions>
+        <exclusion>
+          <groupId>log4j</groupId>
+          <artifactId>log4j</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.slf4j</groupId>
+          <artifactId>slf4j-log4j12</artifactId>
+        </exclusion>
+        <exclusion>
+          <!-- This dependency is no longer shipped with the JDK since Java 
9.-->
+          <groupId>jdk.tools</groupId>
+          <artifactId>jdk.tools</artifactId>
+        </exclusion>
+      </exclusions>
+    </dependency>
+
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-hdfs</artifactId>
+      <scope>test</scope>
+      <type>test-jar</type>
+      <version>${hadoop.version}</version>
+      <exclusions>
+        <exclusion>
+          <groupId>log4j</groupId>
+          <artifactId>log4j</artifactId>
+        </exclusion>
+      </exclusions>
+    </dependency>
+
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-common</artifactId>
+      <scope>test</scope>
+      <type>test-jar</type>
+      <version>${hadoop.version}</version>
+      <exclusions>
+        <exclusion>
+          <groupId>log4j</groupId>
+          <artifactId>log4j</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.slf4j</groupId>
+          <artifactId>slf4j-log4j12</artifactId>
+        </exclusion>
+        <exclusion>
+          <!-- This dependency is no longer shipped with the JDK since Java 
9.-->
+          <groupId>jdk.tools</groupId>
+          <artifactId>jdk.tools</artifactId>
+        </exclusion>
+      </exclusions>
+    </dependency>    <dependency>

Review comment:
       nit: split them into different lines.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to