tillrohrmann commented on a change in pull request #11920:
URL: https://github.com/apache/flink/pull/11920#discussion_r426163109



##########
File path: 
flink-external-resources/flink-external-resource-gpu/src/main/java/org/apache/flink/externalresource/gpu/GPUDriver.java
##########
@@ -0,0 +1,143 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.externalresource.gpu;
+
+import org.apache.flink.annotation.VisibleForTesting;
+import org.apache.flink.api.common.externalresource.ExternalResourceDriver;
+import org.apache.flink.configuration.ConfigConstants;
+import org.apache.flink.configuration.ConfigOption;
+import org.apache.flink.configuration.Configuration;
+import org.apache.flink.configuration.ExternalResourceOptions;
+import org.apache.flink.configuration.IllegalConfigurationException;
+import org.apache.flink.util.FlinkException;
+import org.apache.flink.util.Preconditions;
+import org.apache.flink.util.StringUtils;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.InputStreamReader;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+
+import static org.apache.flink.configuration.ConfigOptions.key;
+
+/**
+ * Driver takes the responsibility to discover GPU resources and provide the 
GPU resource information.
+ * It get the GPU information by executing user-defined discovery script.

Review comment:
       ```suggestion
    * It retrieves the GPU information by executing a user-defined discovery 
script.
   ```

##########
File path: flink-dist/src/main/assemblies/opt.xml
##########
@@ -75,6 +75,28 @@
                        <fileMode>0644</fileMode>
                </file>
 
+               <!-- External Resource -->
+               <file>
+                       
<source>../flink-external-resources/flink-external-resource-gpu/target/flink-external-resource-gpu-${project.version}.jar</source>
+                       
<outputDirectory>opt/external-resource-gpu/</outputDirectory>
+                       
<destName>flink-external-resource-gpu-${project.version}.jar</destName>
+                       <fileMode>0644</fileMode>
+               </file>
+
+               <file>
+                       
<source>../flink-external-resources/flink-external-resource-gpu/src/main/resources/gpu-discovery-common.sh</source>
+                       
<outputDirectory>opt/external-resource-gpu/</outputDirectory>
+                       <destName>gpu-discovery-common.sh</destName>
+                       <fileMode>0755</fileMode>
+               </file>
+
+               <file>
+                       
<source>../flink-external-resources/flink-external-resource-gpu/src/main/resources/nvidia-gpu-discovery.sh</source>
+                       
<outputDirectory>opt/external-resource-gpu/</outputDirectory>
+                       <destName>nvidia-gpu-discovery.sh</destName>
+                       <fileMode>0755</fileMode>
+               </file>

Review comment:
       I'm wondering whether we should add the external gpu support right away 
to the Flink binary distribution or whether it would also be ok to download it 
from the Flink web page? How large are the binaries we are adding here?

##########
File path: 
flink-external-resources/flink-external-resource-gpu/src/main/java/org/apache/flink/externalresource/gpu/GPUDriver.java
##########
@@ -0,0 +1,143 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.externalresource.gpu;
+
+import org.apache.flink.annotation.VisibleForTesting;
+import org.apache.flink.api.common.externalresource.ExternalResourceDriver;
+import org.apache.flink.configuration.ConfigConstants;
+import org.apache.flink.configuration.ConfigOption;
+import org.apache.flink.configuration.Configuration;
+import org.apache.flink.configuration.ExternalResourceOptions;
+import org.apache.flink.configuration.IllegalConfigurationException;
+import org.apache.flink.util.FlinkException;
+import org.apache.flink.util.Preconditions;
+import org.apache.flink.util.StringUtils;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.InputStreamReader;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+
+import static org.apache.flink.configuration.ConfigOptions.key;
+
+/**
+ * Driver takes the responsibility to discover GPU resources and provide the 
GPU resource information.
+ * It get the GPU information by executing user-defined discovery script.
+ */
+class GPUDriver implements ExternalResourceDriver {
+
+       private static final Logger LOG = 
LoggerFactory.getLogger(GPUDriver.class);
+
+       private static final long DISCOVERY_SCRIPT_TIMEOUT_MS = 10000;
+
+       @VisibleForTesting
+       static final ConfigOption<String> DISCOVERY_SCRIPT_PATH =
+               key("discovery-script.path")
+                       .stringType()
+                       
.defaultValue(String.format("%s/external-resource-gpu/nvidia-gpu-discovery.sh", 
ConfigConstants.DEFAULT_FLINK_PLUGINS_DIRS));
+
+       @VisibleForTesting
+       static final ConfigOption<String> DISCOVERY_SCRIPT_ARG =
+               key("discovery-script.args")
+                       .stringType()
+                       .noDefaultValue();
+
+       private final File discoveryScriptFile;
+       private final String args;
+
+       GPUDriver(Configuration config) throws Exception {
+               final String discoveryScriptPathStr = 
config.getString(DISCOVERY_SCRIPT_PATH);
+               if (StringUtils.isNullOrWhitespaceOnly(discoveryScriptPathStr)) 
{
+                       throw new IllegalConfigurationException(
+                               String.format("GPU discovery script ('%s') is 
not configured.", 
ExternalResourceOptions.genericKeyWithSuffix(DISCOVERY_SCRIPT_PATH.key())));
+               }
+
+               Path discoveryScriptPath = Paths.get(discoveryScriptPathStr);
+               if (!discoveryScriptPath.isAbsolute()) {
+                       discoveryScriptPath = 
Paths.get(System.getenv().getOrDefault(ConfigConstants.ENV_FLINK_HOME_DIR, 
"."), discoveryScriptPathStr);
+               }
+               discoveryScriptFile = discoveryScriptPath.toFile();
+
+               if (!discoveryScriptFile.exists()) {
+                       throw new FileNotFoundException(String.format("The gpu 
discovery script does not exist in path %s.", 
discoveryScriptFile.getAbsolutePath()));
+               }
+               if (!discoveryScriptFile.canExecute()) {
+                       throw new FlinkException(String.format("The discovery 
script %s is not executable.", discoveryScriptFile.getAbsolutePath()));
+               }
+
+               args = config.getString(DISCOVERY_SCRIPT_ARG);
+       }
+
+       @Override
+       public Set<GPUInfo> retrieveResourceInfo(long gpuAmount) throws 
Exception {
+               Preconditions.checkArgument(gpuAmount > 0, "The gpuAmount 
should be positive while finding " + gpuAmount);
+
+               final Set<GPUInfo> gpuResources = new HashSet<>();
+               String output = executeDiscoveryScript(discoveryScriptFile, 
gpuAmount, args);
+               if (!output.isEmpty()) {
+                       String[] indexes = output.split(",");
+                       for (String index : indexes) {
+                               if (!StringUtils.isNullOrWhitespaceOnly(index)) 
{
+                                       gpuResources.add(new 
GPUInfo(index.trim()));
+                               }
+                       }
+               }
+               LOG.info("Discover GPU resources: {}.", gpuResources);
+               return Collections.unmodifiableSet(gpuResources);
+       }
+
+       private String executeDiscoveryScript(File discoveryScript, long 
gpuAmount, String args) throws Exception {
+               final String cmd = discoveryScript.getAbsolutePath() + " " + 
gpuAmount + " " + args;
+               final Process process = Runtime.getRuntime().exec(cmd);
+               final BufferedReader stdoutReader = new BufferedReader(new 
InputStreamReader(process.getInputStream()));
+               final BufferedReader stderrReader = new BufferedReader(new 
InputStreamReader(process.getErrorStream()));

Review comment:
       I would suggest to use try-with-resources to create the `BufferedReaders`
   
   ```
   try (BufferedReader stdoutReader = new BufferedReader(); BufferedReader 
stderrReader = new BufferedReader()) {
   
   }
   ```

##########
File path: 
flink-external-resources/flink-external-resource-gpu/src/main/java/org/apache/flink/externalresource/gpu/GPUDriver.java
##########
@@ -0,0 +1,143 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.externalresource.gpu;
+
+import org.apache.flink.annotation.VisibleForTesting;
+import org.apache.flink.api.common.externalresource.ExternalResourceDriver;
+import org.apache.flink.configuration.ConfigConstants;
+import org.apache.flink.configuration.ConfigOption;
+import org.apache.flink.configuration.Configuration;
+import org.apache.flink.configuration.ExternalResourceOptions;
+import org.apache.flink.configuration.IllegalConfigurationException;
+import org.apache.flink.util.FlinkException;
+import org.apache.flink.util.Preconditions;
+import org.apache.flink.util.StringUtils;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.InputStreamReader;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+
+import static org.apache.flink.configuration.ConfigOptions.key;
+
+/**
+ * Driver takes the responsibility to discover GPU resources and provide the 
GPU resource information.
+ * It get the GPU information by executing user-defined discovery script.
+ */
+class GPUDriver implements ExternalResourceDriver {
+
+       private static final Logger LOG = 
LoggerFactory.getLogger(GPUDriver.class);
+
+       private static final long DISCOVERY_SCRIPT_TIMEOUT_MS = 10000;
+
+       @VisibleForTesting
+       static final ConfigOption<String> DISCOVERY_SCRIPT_PATH =
+               key("discovery-script.path")
+                       .stringType()
+                       
.defaultValue(String.format("%s/external-resource-gpu/nvidia-gpu-discovery.sh", 
ConfigConstants.DEFAULT_FLINK_PLUGINS_DIRS));
+
+       @VisibleForTesting
+       static final ConfigOption<String> DISCOVERY_SCRIPT_ARG =
+               key("discovery-script.args")
+                       .stringType()
+                       .noDefaultValue();
+
+       private final File discoveryScriptFile;
+       private final String args;
+
+       GPUDriver(Configuration config) throws Exception {
+               final String discoveryScriptPathStr = 
config.getString(DISCOVERY_SCRIPT_PATH);
+               if (StringUtils.isNullOrWhitespaceOnly(discoveryScriptPathStr)) 
{
+                       throw new IllegalConfigurationException(
+                               String.format("GPU discovery script ('%s') is 
not configured.", 
ExternalResourceOptions.genericKeyWithSuffix(DISCOVERY_SCRIPT_PATH.key())));
+               }
+
+               Path discoveryScriptPath = Paths.get(discoveryScriptPathStr);
+               if (!discoveryScriptPath.isAbsolute()) {
+                       discoveryScriptPath = 
Paths.get(System.getenv().getOrDefault(ConfigConstants.ENV_FLINK_HOME_DIR, 
"."), discoveryScriptPathStr);
+               }
+               discoveryScriptFile = discoveryScriptPath.toFile();
+
+               if (!discoveryScriptFile.exists()) {
+                       throw new FileNotFoundException(String.format("The gpu 
discovery script does not exist in path %s.", 
discoveryScriptFile.getAbsolutePath()));
+               }
+               if (!discoveryScriptFile.canExecute()) {
+                       throw new FlinkException(String.format("The discovery 
script %s is not executable.", discoveryScriptFile.getAbsolutePath()));
+               }
+
+               args = config.getString(DISCOVERY_SCRIPT_ARG);
+       }
+
+       @Override
+       public Set<GPUInfo> retrieveResourceInfo(long gpuAmount) throws 
Exception {
+               Preconditions.checkArgument(gpuAmount > 0, "The gpuAmount 
should be positive while finding " + gpuAmount);
+
+               final Set<GPUInfo> gpuResources = new HashSet<>();
+               String output = executeDiscoveryScript(discoveryScriptFile, 
gpuAmount, args);
+               if (!output.isEmpty()) {
+                       String[] indexes = output.split(",");
+                       for (String index : indexes) {
+                               if (!StringUtils.isNullOrWhitespaceOnly(index)) 
{
+                                       gpuResources.add(new 
GPUInfo(index.trim()));
+                               }
+                       }
+               }
+               LOG.info("Discover GPU resources: {}.", gpuResources);
+               return Collections.unmodifiableSet(gpuResources);
+       }
+
+       private String executeDiscoveryScript(File discoveryScript, long 
gpuAmount, String args) throws Exception {
+               final String cmd = discoveryScript.getAbsolutePath() + " " + 
gpuAmount + " " + args;
+               final Process process = Runtime.getRuntime().exec(cmd);

Review comment:
       We should add a finally block to the end of the try-with-resources block 
which create the `BufferedReaders` and make sure that `process` is being 
destroyed. We can do this by calling `Process.destroyForcibly()`.

##########
File path: 
flink-external-resources/flink-external-resource-gpu/src/main/java/org/apache/flink/externalresource/gpu/GPUDriver.java
##########
@@ -0,0 +1,143 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.externalresource.gpu;
+
+import org.apache.flink.annotation.VisibleForTesting;
+import org.apache.flink.api.common.externalresource.ExternalResourceDriver;
+import org.apache.flink.configuration.ConfigConstants;
+import org.apache.flink.configuration.ConfigOption;
+import org.apache.flink.configuration.Configuration;
+import org.apache.flink.configuration.ExternalResourceOptions;
+import org.apache.flink.configuration.IllegalConfigurationException;
+import org.apache.flink.util.FlinkException;
+import org.apache.flink.util.Preconditions;
+import org.apache.flink.util.StringUtils;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.InputStreamReader;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+
+import static org.apache.flink.configuration.ConfigOptions.key;
+
+/**
+ * Driver takes the responsibility to discover GPU resources and provide the 
GPU resource information.
+ * It get the GPU information by executing user-defined discovery script.
+ */
+class GPUDriver implements ExternalResourceDriver {
+
+       private static final Logger LOG = 
LoggerFactory.getLogger(GPUDriver.class);
+
+       private static final long DISCOVERY_SCRIPT_TIMEOUT_MS = 10000;
+
+       @VisibleForTesting
+       static final ConfigOption<String> DISCOVERY_SCRIPT_PATH =
+               key("discovery-script.path")
+                       .stringType()
+                       
.defaultValue(String.format("%s/external-resource-gpu/nvidia-gpu-discovery.sh", 
ConfigConstants.DEFAULT_FLINK_PLUGINS_DIRS));
+
+       @VisibleForTesting
+       static final ConfigOption<String> DISCOVERY_SCRIPT_ARG =
+               key("discovery-script.args")
+                       .stringType()
+                       .noDefaultValue();
+
+       private final File discoveryScriptFile;
+       private final String args;
+
+       GPUDriver(Configuration config) throws Exception {
+               final String discoveryScriptPathStr = 
config.getString(DISCOVERY_SCRIPT_PATH);
+               if (StringUtils.isNullOrWhitespaceOnly(discoveryScriptPathStr)) 
{
+                       throw new IllegalConfigurationException(
+                               String.format("GPU discovery script ('%s') is 
not configured.", 
ExternalResourceOptions.genericKeyWithSuffix(DISCOVERY_SCRIPT_PATH.key())));
+               }
+
+               Path discoveryScriptPath = Paths.get(discoveryScriptPathStr);
+               if (!discoveryScriptPath.isAbsolute()) {
+                       discoveryScriptPath = 
Paths.get(System.getenv().getOrDefault(ConfigConstants.ENV_FLINK_HOME_DIR, 
"."), discoveryScriptPathStr);
+               }
+               discoveryScriptFile = discoveryScriptPath.toFile();
+
+               if (!discoveryScriptFile.exists()) {
+                       throw new FileNotFoundException(String.format("The gpu 
discovery script does not exist in path %s.", 
discoveryScriptFile.getAbsolutePath()));
+               }
+               if (!discoveryScriptFile.canExecute()) {
+                       throw new FlinkException(String.format("The discovery 
script %s is not executable.", discoveryScriptFile.getAbsolutePath()));
+               }
+
+               args = config.getString(DISCOVERY_SCRIPT_ARG);
+       }
+
+       @Override
+       public Set<GPUInfo> retrieveResourceInfo(long gpuAmount) throws 
Exception {
+               Preconditions.checkArgument(gpuAmount > 0, "The gpuAmount 
should be positive while finding " + gpuAmount);

Review comment:
       ```suggestion
                Preconditions.checkArgument(gpuAmount > 0, "The gpuAmount 
should be positive when retrieving the GPU resource information.");
   ```

##########
File path: 
flink-external-resources/flink-external-resource-gpu/src/main/java/org/apache/flink/externalresource/gpu/GPUDriver.java
##########
@@ -0,0 +1,143 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.externalresource.gpu;
+
+import org.apache.flink.annotation.VisibleForTesting;
+import org.apache.flink.api.common.externalresource.ExternalResourceDriver;
+import org.apache.flink.configuration.ConfigConstants;
+import org.apache.flink.configuration.ConfigOption;
+import org.apache.flink.configuration.Configuration;
+import org.apache.flink.configuration.ExternalResourceOptions;
+import org.apache.flink.configuration.IllegalConfigurationException;
+import org.apache.flink.util.FlinkException;
+import org.apache.flink.util.Preconditions;
+import org.apache.flink.util.StringUtils;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.InputStreamReader;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+
+import static org.apache.flink.configuration.ConfigOptions.key;
+
+/**
+ * Driver takes the responsibility to discover GPU resources and provide the 
GPU resource information.
+ * It get the GPU information by executing user-defined discovery script.
+ */
+class GPUDriver implements ExternalResourceDriver {
+
+       private static final Logger LOG = 
LoggerFactory.getLogger(GPUDriver.class);
+
+       private static final long DISCOVERY_SCRIPT_TIMEOUT_MS = 10000;
+
+       @VisibleForTesting
+       static final ConfigOption<String> DISCOVERY_SCRIPT_PATH =
+               key("discovery-script.path")
+                       .stringType()
+                       
.defaultValue(String.format("%s/external-resource-gpu/nvidia-gpu-discovery.sh", 
ConfigConstants.DEFAULT_FLINK_PLUGINS_DIRS));
+
+       @VisibleForTesting
+       static final ConfigOption<String> DISCOVERY_SCRIPT_ARG =
+               key("discovery-script.args")
+                       .stringType()
+                       .noDefaultValue();
+
+       private final File discoveryScriptFile;
+       private final String args;
+
+       GPUDriver(Configuration config) throws Exception {
+               final String discoveryScriptPathStr = 
config.getString(DISCOVERY_SCRIPT_PATH);
+               if (StringUtils.isNullOrWhitespaceOnly(discoveryScriptPathStr)) 
{
+                       throw new IllegalConfigurationException(
+                               String.format("GPU discovery script ('%s') is 
not configured.", 
ExternalResourceOptions.genericKeyWithSuffix(DISCOVERY_SCRIPT_PATH.key())));
+               }
+
+               Path discoveryScriptPath = Paths.get(discoveryScriptPathStr);
+               if (!discoveryScriptPath.isAbsolute()) {
+                       discoveryScriptPath = 
Paths.get(System.getenv().getOrDefault(ConfigConstants.ENV_FLINK_HOME_DIR, 
"."), discoveryScriptPathStr);
+               }
+               discoveryScriptFile = discoveryScriptPath.toFile();
+
+               if (!discoveryScriptFile.exists()) {
+                       throw new FileNotFoundException(String.format("The gpu 
discovery script does not exist in path %s.", 
discoveryScriptFile.getAbsolutePath()));
+               }
+               if (!discoveryScriptFile.canExecute()) {
+                       throw new FlinkException(String.format("The discovery 
script %s is not executable.", discoveryScriptFile.getAbsolutePath()));
+               }
+
+               args = config.getString(DISCOVERY_SCRIPT_ARG);
+       }
+
+       @Override
+       public Set<GPUInfo> retrieveResourceInfo(long gpuAmount) throws 
Exception {
+               Preconditions.checkArgument(gpuAmount > 0, "The gpuAmount 
should be positive while finding " + gpuAmount);
+
+               final Set<GPUInfo> gpuResources = new HashSet<>();
+               String output = executeDiscoveryScript(discoveryScriptFile, 
gpuAmount, args);
+               if (!output.isEmpty()) {
+                       String[] indexes = output.split(",");
+                       for (String index : indexes) {
+                               if (!StringUtils.isNullOrWhitespaceOnly(index)) 
{
+                                       gpuResources.add(new 
GPUInfo(index.trim()));
+                               }
+                       }
+               }
+               LOG.info("Discover GPU resources: {}.", gpuResources);
+               return Collections.unmodifiableSet(gpuResources);
+       }
+
+       private String executeDiscoveryScript(File discoveryScript, long 
gpuAmount, String args) throws Exception {
+               final String cmd = discoveryScript.getAbsolutePath() + " " + 
gpuAmount + " " + args;
+               final Process process = Runtime.getRuntime().exec(cmd);
+               final BufferedReader stdoutReader = new BufferedReader(new 
InputStreamReader(process.getInputStream()));
+               final BufferedReader stderrReader = new BufferedReader(new 
InputStreamReader(process.getErrorStream()));
+               final boolean exitInTime = 
process.waitFor(DISCOVERY_SCRIPT_TIMEOUT_MS, TimeUnit.MILLISECONDS);
+               if (!exitInTime) {
+                       throw new TimeoutException(String.format("The discovery 
script executed for over %d ms.", DISCOVERY_SCRIPT_TIMEOUT_MS));
+               }
+
+               final int exitVal = process.exitValue();
+               if (exitVal != 0) {
+                       final String stdout = 
stdoutReader.lines().collect(StringBuilder::new, StringBuilder::append, 
StringBuilder::append).toString();
+                       final String stderr = 
stderrReader.lines().collect(StringBuilder::new, StringBuilder::append, 
StringBuilder::append).toString();
+                       LOG.error("Discovery script exit with {}. With output 
{} {}", exitVal, stdout, stderr);

Review comment:
       ```suggestion
                        LOG.warn("Discovery script exit with {}.\nSTDOUT: 
{}\nSTDERR: {}", exitVal, stdout, stderr);
   ```

##########
File path: 
flink-external-resources/flink-external-resource-gpu/src/main/java/org/apache/flink/externalresource/gpu/GPUDriver.java
##########
@@ -0,0 +1,143 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.externalresource.gpu;
+
+import org.apache.flink.annotation.VisibleForTesting;
+import org.apache.flink.api.common.externalresource.ExternalResourceDriver;
+import org.apache.flink.configuration.ConfigConstants;
+import org.apache.flink.configuration.ConfigOption;
+import org.apache.flink.configuration.Configuration;
+import org.apache.flink.configuration.ExternalResourceOptions;
+import org.apache.flink.configuration.IllegalConfigurationException;
+import org.apache.flink.util.FlinkException;
+import org.apache.flink.util.Preconditions;
+import org.apache.flink.util.StringUtils;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.InputStreamReader;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+
+import static org.apache.flink.configuration.ConfigOptions.key;
+
+/**
+ * Driver takes the responsibility to discover GPU resources and provide the 
GPU resource information.
+ * It get the GPU information by executing user-defined discovery script.
+ */
+class GPUDriver implements ExternalResourceDriver {
+
+       private static final Logger LOG = 
LoggerFactory.getLogger(GPUDriver.class);
+
+       private static final long DISCOVERY_SCRIPT_TIMEOUT_MS = 10000;
+
+       @VisibleForTesting
+       static final ConfigOption<String> DISCOVERY_SCRIPT_PATH =
+               key("discovery-script.path")
+                       .stringType()
+                       
.defaultValue(String.format("%s/external-resource-gpu/nvidia-gpu-discovery.sh", 
ConfigConstants.DEFAULT_FLINK_PLUGINS_DIRS));
+
+       @VisibleForTesting
+       static final ConfigOption<String> DISCOVERY_SCRIPT_ARG =
+               key("discovery-script.args")
+                       .stringType()
+                       .noDefaultValue();
+
+       private final File discoveryScriptFile;
+       private final String args;
+
+       GPUDriver(Configuration config) throws Exception {
+               final String discoveryScriptPathStr = 
config.getString(DISCOVERY_SCRIPT_PATH);
+               if (StringUtils.isNullOrWhitespaceOnly(discoveryScriptPathStr)) 
{
+                       throw new IllegalConfigurationException(
+                               String.format("GPU discovery script ('%s') is 
not configured.", 
ExternalResourceOptions.genericKeyWithSuffix(DISCOVERY_SCRIPT_PATH.key())));
+               }
+
+               Path discoveryScriptPath = Paths.get(discoveryScriptPathStr);
+               if (!discoveryScriptPath.isAbsolute()) {
+                       discoveryScriptPath = 
Paths.get(System.getenv().getOrDefault(ConfigConstants.ENV_FLINK_HOME_DIR, 
"."), discoveryScriptPathStr);
+               }
+               discoveryScriptFile = discoveryScriptPath.toFile();
+
+               if (!discoveryScriptFile.exists()) {
+                       throw new FileNotFoundException(String.format("The gpu 
discovery script does not exist in path %s.", 
discoveryScriptFile.getAbsolutePath()));
+               }
+               if (!discoveryScriptFile.canExecute()) {
+                       throw new FlinkException(String.format("The discovery 
script %s is not executable.", discoveryScriptFile.getAbsolutePath()));
+               }
+
+               args = config.getString(DISCOVERY_SCRIPT_ARG);
+       }
+
+       @Override
+       public Set<GPUInfo> retrieveResourceInfo(long gpuAmount) throws 
Exception {
+               Preconditions.checkArgument(gpuAmount > 0, "The gpuAmount 
should be positive while finding " + gpuAmount);
+
+               final Set<GPUInfo> gpuResources = new HashSet<>();
+               String output = executeDiscoveryScript(discoveryScriptFile, 
gpuAmount, args);
+               if (!output.isEmpty()) {
+                       String[] indexes = output.split(",");
+                       for (String index : indexes) {
+                               if (!StringUtils.isNullOrWhitespaceOnly(index)) 
{
+                                       gpuResources.add(new 
GPUInfo(index.trim()));
+                               }
+                       }
+               }
+               LOG.info("Discover GPU resources: {}.", gpuResources);
+               return Collections.unmodifiableSet(gpuResources);
+       }
+
+       private String executeDiscoveryScript(File discoveryScript, long 
gpuAmount, String args) throws Exception {
+               final String cmd = discoveryScript.getAbsolutePath() + " " + 
gpuAmount + " " + args;
+               final Process process = Runtime.getRuntime().exec(cmd);
+               final BufferedReader stdoutReader = new BufferedReader(new 
InputStreamReader(process.getInputStream()));
+               final BufferedReader stderrReader = new BufferedReader(new 
InputStreamReader(process.getErrorStream()));
+               final boolean exitInTime = 
process.waitFor(DISCOVERY_SCRIPT_TIMEOUT_MS, TimeUnit.MILLISECONDS);
+               if (!exitInTime) {
+                       throw new TimeoutException(String.format("The discovery 
script executed for over %d ms.", DISCOVERY_SCRIPT_TIMEOUT_MS));
+               }
+
+               final int exitVal = process.exitValue();
+               if (exitVal != 0) {
+                       final String stdout = 
stdoutReader.lines().collect(StringBuilder::new, StringBuilder::append, 
StringBuilder::append).toString();
+                       final String stderr = 
stderrReader.lines().collect(StringBuilder::new, StringBuilder::append, 
StringBuilder::append).toString();
+                       LOG.error("Discovery script exit with {}. With output 
{} {}", exitVal, stdout, stderr);
+                       throw new FlinkException("Discovery script exit with 
non-zero.");

Review comment:
       ```suggestion
                        throw new FlinkException(String.format("Discovery 
script exit with non-zero return code: %s.", exitVal));
   ```

##########
File path: 
flink-external-resources/flink-external-resource-gpu/src/main/java/org/apache/flink/externalresource/gpu/GPUDriver.java
##########
@@ -0,0 +1,143 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.externalresource.gpu;
+
+import org.apache.flink.annotation.VisibleForTesting;
+import org.apache.flink.api.common.externalresource.ExternalResourceDriver;
+import org.apache.flink.configuration.ConfigConstants;
+import org.apache.flink.configuration.ConfigOption;
+import org.apache.flink.configuration.Configuration;
+import org.apache.flink.configuration.ExternalResourceOptions;
+import org.apache.flink.configuration.IllegalConfigurationException;
+import org.apache.flink.util.FlinkException;
+import org.apache.flink.util.Preconditions;
+import org.apache.flink.util.StringUtils;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.InputStreamReader;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+
+import static org.apache.flink.configuration.ConfigOptions.key;
+
+/**
+ * Driver takes the responsibility to discover GPU resources and provide the 
GPU resource information.
+ * It get the GPU information by executing user-defined discovery script.
+ */
+class GPUDriver implements ExternalResourceDriver {
+
+       private static final Logger LOG = 
LoggerFactory.getLogger(GPUDriver.class);
+
+       private static final long DISCOVERY_SCRIPT_TIMEOUT_MS = 10000;
+
+       @VisibleForTesting
+       static final ConfigOption<String> DISCOVERY_SCRIPT_PATH =
+               key("discovery-script.path")
+                       .stringType()
+                       
.defaultValue(String.format("%s/external-resource-gpu/nvidia-gpu-discovery.sh", 
ConfigConstants.DEFAULT_FLINK_PLUGINS_DIRS));
+
+       @VisibleForTesting
+       static final ConfigOption<String> DISCOVERY_SCRIPT_ARG =
+               key("discovery-script.args")
+                       .stringType()
+                       .noDefaultValue();
+
+       private final File discoveryScriptFile;
+       private final String args;
+
+       GPUDriver(Configuration config) throws Exception {
+               final String discoveryScriptPathStr = 
config.getString(DISCOVERY_SCRIPT_PATH);
+               if (StringUtils.isNullOrWhitespaceOnly(discoveryScriptPathStr)) 
{
+                       throw new IllegalConfigurationException(
+                               String.format("GPU discovery script ('%s') is 
not configured.", 
ExternalResourceOptions.genericKeyWithSuffix(DISCOVERY_SCRIPT_PATH.key())));
+               }
+
+               Path discoveryScriptPath = Paths.get(discoveryScriptPathStr);
+               if (!discoveryScriptPath.isAbsolute()) {
+                       discoveryScriptPath = 
Paths.get(System.getenv().getOrDefault(ConfigConstants.ENV_FLINK_HOME_DIR, 
"."), discoveryScriptPathStr);
+               }
+               discoveryScriptFile = discoveryScriptPath.toFile();
+
+               if (!discoveryScriptFile.exists()) {
+                       throw new FileNotFoundException(String.format("The gpu 
discovery script does not exist in path %s.", 
discoveryScriptFile.getAbsolutePath()));
+               }
+               if (!discoveryScriptFile.canExecute()) {
+                       throw new FlinkException(String.format("The discovery 
script %s is not executable.", discoveryScriptFile.getAbsolutePath()));
+               }
+
+               args = config.getString(DISCOVERY_SCRIPT_ARG);
+       }
+
+       @Override
+       public Set<GPUInfo> retrieveResourceInfo(long gpuAmount) throws 
Exception {
+               Preconditions.checkArgument(gpuAmount > 0, "The gpuAmount 
should be positive while finding " + gpuAmount);
+
+               final Set<GPUInfo> gpuResources = new HashSet<>();
+               String output = executeDiscoveryScript(discoveryScriptFile, 
gpuAmount, args);
+               if (!output.isEmpty()) {
+                       String[] indexes = output.split(",");
+                       for (String index : indexes) {
+                               if (!StringUtils.isNullOrWhitespaceOnly(index)) 
{
+                                       gpuResources.add(new 
GPUInfo(index.trim()));
+                               }
+                       }
+               }
+               LOG.info("Discover GPU resources: {}.", gpuResources);
+               return Collections.unmodifiableSet(gpuResources);
+       }
+
+       private String executeDiscoveryScript(File discoveryScript, long 
gpuAmount, String args) throws Exception {
+               final String cmd = discoveryScript.getAbsolutePath() + " " + 
gpuAmount + " " + args;
+               final Process process = Runtime.getRuntime().exec(cmd);
+               final BufferedReader stdoutReader = new BufferedReader(new 
InputStreamReader(process.getInputStream()));
+               final BufferedReader stderrReader = new BufferedReader(new 
InputStreamReader(process.getErrorStream()));
+               final boolean exitInTime = 
process.waitFor(DISCOVERY_SCRIPT_TIMEOUT_MS, TimeUnit.MILLISECONDS);

Review comment:
       ```suggestion
                final boolean hasProcessTerminated = 
process.waitFor(DISCOVERY_SCRIPT_TIMEOUT_MS, TimeUnit.MILLISECONDS);
   ```

##########
File path: 
flink-external-resources/flink-external-resource-gpu/src/main/resources/gpu-discovery-common.sh
##########
@@ -0,0 +1,92 @@
+#!/usr/bin/env bash
+################################################################################
+#  Licensed to the Apache Software Foundation (ASF) under one
+#  or more contributor license agreements.  See the NOTICE file
+#  distributed with this work for additional information
+#  regarding copyright ownership.  The ASF licenses this file
+#  to you under the Apache License, Version 2.0 (the
+#  "License"); you may not use this file except in compliance
+#  with the License.  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+
+non_coordination_allocate() {
+  indexes=($1)
+  amount=$2
+  to_occupy_indexes=(${indexes[@]:0:$amount})
+  if [ $amount -gt ${#to_occupy_indexes[@]} ]; then
+    echo "Could not get enough GPU resources."
+    exit 1
+  fi
+  echo ${to_occupy_indexes[@]} | sed 's/ /,/g'
+}
+
+coordination_allocate() {
+  indexes=($1)
+  amount=$2
+  coordination_file=${3:-/var/tmp/flink-gpu-coordination}
+  (
+    flock -x 200
+    # GPU indexes to be occupied.
+    to_occupy_indexes=()
+    # GPU indexes which are already recorded in the coordination file. These 
indexes should not be occupied unless the associated
+    # processes are no longer alive.
+    recorded_indexes=()
+    for i in ${indexes[@]}
+    do
+      if [ ${#to_occupy_indexes[@]} -eq $amount ]; then
+        break
+      elif [ `grep -c "^$i " $coordination_file` -ne 0 ]; then
+        recorded_indexes[${#recorded_indexes[@]}]=$i
+      else
+        to_occupy_indexes[${#to_occupy_indexes[@]}]=$i
+      fi
+    done
+
+    # If there is no enough indexes, we will try to occupy indexes whose 
associated processes are dead.

Review comment:
       ```suggestion
       # If there are not enough indexes, we will try to occupy indexes whose 
associated processes are dead.
   ```

##########
File path: 
flink-external-resources/flink-external-resource-gpu/src/test/java/org/apache/flink/externalresource/gpu/GPUDiscoveryScriptTest.java
##########
@@ -0,0 +1,74 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.externalresource.gpu;
+
+import org.apache.flink.util.OperatingSystem;
+import org.apache.flink.util.TestLogger;
+
+import org.junit.Test;
+
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+
+import static org.junit.Assume.assumeTrue;
+
+/**
+ * Tests for the gpu-discovery-common.sh.
+ */
+public class GPUDiscoveryScriptTest extends TestLogger {
+
+       private static final String TEST_SCRIPT_PATH = 
"src/test/resources/test-coordination-mode.sh";
+
+       @Test
+       public void testNonCoordinationMode() throws Exception {
+               assumeTrue(OperatingSystem.isLinux());
+               testExistWithNonZero("test_non_coordination_mode");
+       }
+
+       @Test
+       public void testCoordinateIndexes() throws Exception {
+               assumeTrue(OperatingSystem.isLinux());
+               testExistWithNonZero("test_coordinate_indexes");
+       }
+
+       @Test
+       public void testPreemptFromDeadProcesses() throws Exception {
+               assumeTrue(OperatingSystem.isLinux());
+               testExistWithNonZero("test_preempt_from_dead_processes");
+       }
+
+       @Test
+       public void testSetCoordinationFile() throws Exception {
+               assumeTrue(OperatingSystem.isLinux());
+               testExistWithNonZero("test_coordination_file");
+       }
+
+       private void testExistWithNonZero(String cmd) throws Exception {
+               final ProcessBuilder processBuilder = new 
ProcessBuilder(TEST_SCRIPT_PATH, cmd);
+               processBuilder.redirectErrorStream(true);
+               final Process process = processBuilder.start();
+               final int exitValue = process.waitFor();
+               if (exitValue != 0) {
+                       final BufferedReader stdoutReader = new 
BufferedReader(new InputStreamReader(process.getInputStream()));
+                       final String stdout = 
stdoutReader.lines().collect(StringBuilder::new, StringBuilder::append, 
StringBuilder::append).toString();
+                       log.error("Script exit with non-zero {}, output: {}", 
exitValue, stdout);

Review comment:
       I would suggest to add the `exitValue` and `stdout` to the thrown 
`Exception` instead of logging it here yourself. The thrown exception will be 
logged by Junit.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to