tillrohrmann commented on a change in pull request #11920: URL: https://github.com/apache/flink/pull/11920#discussion_r426163109
########## File path: flink-external-resources/flink-external-resource-gpu/src/main/java/org/apache/flink/externalresource/gpu/GPUDriver.java ########## @@ -0,0 +1,143 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.externalresource.gpu; + +import org.apache.flink.annotation.VisibleForTesting; +import org.apache.flink.api.common.externalresource.ExternalResourceDriver; +import org.apache.flink.configuration.ConfigConstants; +import org.apache.flink.configuration.ConfigOption; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.configuration.ExternalResourceOptions; +import org.apache.flink.configuration.IllegalConfigurationException; +import org.apache.flink.util.FlinkException; +import org.apache.flink.util.Preconditions; +import org.apache.flink.util.StringUtils; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.InputStreamReader; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashSet; +import java.util.Set; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; + +import static org.apache.flink.configuration.ConfigOptions.key; + +/** + * Driver takes the responsibility to discover GPU resources and provide the GPU resource information. + * It get the GPU information by executing user-defined discovery script. Review comment: ```suggestion * It retrieves the GPU information by executing a user-defined discovery script. ``` ########## File path: flink-dist/src/main/assemblies/opt.xml ########## @@ -75,6 +75,28 @@ <fileMode>0644</fileMode> </file> + <!-- External Resource --> + <file> + <source>../flink-external-resources/flink-external-resource-gpu/target/flink-external-resource-gpu-${project.version}.jar</source> + <outputDirectory>opt/external-resource-gpu/</outputDirectory> + <destName>flink-external-resource-gpu-${project.version}.jar</destName> + <fileMode>0644</fileMode> + </file> + + <file> + <source>../flink-external-resources/flink-external-resource-gpu/src/main/resources/gpu-discovery-common.sh</source> + <outputDirectory>opt/external-resource-gpu/</outputDirectory> + <destName>gpu-discovery-common.sh</destName> + <fileMode>0755</fileMode> + </file> + + <file> + <source>../flink-external-resources/flink-external-resource-gpu/src/main/resources/nvidia-gpu-discovery.sh</source> + <outputDirectory>opt/external-resource-gpu/</outputDirectory> + <destName>nvidia-gpu-discovery.sh</destName> + <fileMode>0755</fileMode> + </file> Review comment: I'm wondering whether we should add the external gpu support right away to the Flink binary distribution or whether it would also be ok to download it from the Flink web page? How large are the binaries we are adding here? ########## File path: flink-external-resources/flink-external-resource-gpu/src/main/java/org/apache/flink/externalresource/gpu/GPUDriver.java ########## @@ -0,0 +1,143 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.externalresource.gpu; + +import org.apache.flink.annotation.VisibleForTesting; +import org.apache.flink.api.common.externalresource.ExternalResourceDriver; +import org.apache.flink.configuration.ConfigConstants; +import org.apache.flink.configuration.ConfigOption; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.configuration.ExternalResourceOptions; +import org.apache.flink.configuration.IllegalConfigurationException; +import org.apache.flink.util.FlinkException; +import org.apache.flink.util.Preconditions; +import org.apache.flink.util.StringUtils; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.InputStreamReader; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashSet; +import java.util.Set; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; + +import static org.apache.flink.configuration.ConfigOptions.key; + +/** + * Driver takes the responsibility to discover GPU resources and provide the GPU resource information. + * It get the GPU information by executing user-defined discovery script. + */ +class GPUDriver implements ExternalResourceDriver { + + private static final Logger LOG = LoggerFactory.getLogger(GPUDriver.class); + + private static final long DISCOVERY_SCRIPT_TIMEOUT_MS = 10000; + + @VisibleForTesting + static final ConfigOption<String> DISCOVERY_SCRIPT_PATH = + key("discovery-script.path") + .stringType() + .defaultValue(String.format("%s/external-resource-gpu/nvidia-gpu-discovery.sh", ConfigConstants.DEFAULT_FLINK_PLUGINS_DIRS)); + + @VisibleForTesting + static final ConfigOption<String> DISCOVERY_SCRIPT_ARG = + key("discovery-script.args") + .stringType() + .noDefaultValue(); + + private final File discoveryScriptFile; + private final String args; + + GPUDriver(Configuration config) throws Exception { + final String discoveryScriptPathStr = config.getString(DISCOVERY_SCRIPT_PATH); + if (StringUtils.isNullOrWhitespaceOnly(discoveryScriptPathStr)) { + throw new IllegalConfigurationException( + String.format("GPU discovery script ('%s') is not configured.", ExternalResourceOptions.genericKeyWithSuffix(DISCOVERY_SCRIPT_PATH.key()))); + } + + Path discoveryScriptPath = Paths.get(discoveryScriptPathStr); + if (!discoveryScriptPath.isAbsolute()) { + discoveryScriptPath = Paths.get(System.getenv().getOrDefault(ConfigConstants.ENV_FLINK_HOME_DIR, "."), discoveryScriptPathStr); + } + discoveryScriptFile = discoveryScriptPath.toFile(); + + if (!discoveryScriptFile.exists()) { + throw new FileNotFoundException(String.format("The gpu discovery script does not exist in path %s.", discoveryScriptFile.getAbsolutePath())); + } + if (!discoveryScriptFile.canExecute()) { + throw new FlinkException(String.format("The discovery script %s is not executable.", discoveryScriptFile.getAbsolutePath())); + } + + args = config.getString(DISCOVERY_SCRIPT_ARG); + } + + @Override + public Set<GPUInfo> retrieveResourceInfo(long gpuAmount) throws Exception { + Preconditions.checkArgument(gpuAmount > 0, "The gpuAmount should be positive while finding " + gpuAmount); + + final Set<GPUInfo> gpuResources = new HashSet<>(); + String output = executeDiscoveryScript(discoveryScriptFile, gpuAmount, args); + if (!output.isEmpty()) { + String[] indexes = output.split(","); + for (String index : indexes) { + if (!StringUtils.isNullOrWhitespaceOnly(index)) { + gpuResources.add(new GPUInfo(index.trim())); + } + } + } + LOG.info("Discover GPU resources: {}.", gpuResources); + return Collections.unmodifiableSet(gpuResources); + } + + private String executeDiscoveryScript(File discoveryScript, long gpuAmount, String args) throws Exception { + final String cmd = discoveryScript.getAbsolutePath() + " " + gpuAmount + " " + args; + final Process process = Runtime.getRuntime().exec(cmd); + final BufferedReader stdoutReader = new BufferedReader(new InputStreamReader(process.getInputStream())); + final BufferedReader stderrReader = new BufferedReader(new InputStreamReader(process.getErrorStream())); Review comment: I would suggest to use try-with-resources to create the `BufferedReaders` ``` try (BufferedReader stdoutReader = new BufferedReader(); BufferedReader stderrReader = new BufferedReader()) { } ``` ########## File path: flink-external-resources/flink-external-resource-gpu/src/main/java/org/apache/flink/externalresource/gpu/GPUDriver.java ########## @@ -0,0 +1,143 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.externalresource.gpu; + +import org.apache.flink.annotation.VisibleForTesting; +import org.apache.flink.api.common.externalresource.ExternalResourceDriver; +import org.apache.flink.configuration.ConfigConstants; +import org.apache.flink.configuration.ConfigOption; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.configuration.ExternalResourceOptions; +import org.apache.flink.configuration.IllegalConfigurationException; +import org.apache.flink.util.FlinkException; +import org.apache.flink.util.Preconditions; +import org.apache.flink.util.StringUtils; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.InputStreamReader; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashSet; +import java.util.Set; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; + +import static org.apache.flink.configuration.ConfigOptions.key; + +/** + * Driver takes the responsibility to discover GPU resources and provide the GPU resource information. + * It get the GPU information by executing user-defined discovery script. + */ +class GPUDriver implements ExternalResourceDriver { + + private static final Logger LOG = LoggerFactory.getLogger(GPUDriver.class); + + private static final long DISCOVERY_SCRIPT_TIMEOUT_MS = 10000; + + @VisibleForTesting + static final ConfigOption<String> DISCOVERY_SCRIPT_PATH = + key("discovery-script.path") + .stringType() + .defaultValue(String.format("%s/external-resource-gpu/nvidia-gpu-discovery.sh", ConfigConstants.DEFAULT_FLINK_PLUGINS_DIRS)); + + @VisibleForTesting + static final ConfigOption<String> DISCOVERY_SCRIPT_ARG = + key("discovery-script.args") + .stringType() + .noDefaultValue(); + + private final File discoveryScriptFile; + private final String args; + + GPUDriver(Configuration config) throws Exception { + final String discoveryScriptPathStr = config.getString(DISCOVERY_SCRIPT_PATH); + if (StringUtils.isNullOrWhitespaceOnly(discoveryScriptPathStr)) { + throw new IllegalConfigurationException( + String.format("GPU discovery script ('%s') is not configured.", ExternalResourceOptions.genericKeyWithSuffix(DISCOVERY_SCRIPT_PATH.key()))); + } + + Path discoveryScriptPath = Paths.get(discoveryScriptPathStr); + if (!discoveryScriptPath.isAbsolute()) { + discoveryScriptPath = Paths.get(System.getenv().getOrDefault(ConfigConstants.ENV_FLINK_HOME_DIR, "."), discoveryScriptPathStr); + } + discoveryScriptFile = discoveryScriptPath.toFile(); + + if (!discoveryScriptFile.exists()) { + throw new FileNotFoundException(String.format("The gpu discovery script does not exist in path %s.", discoveryScriptFile.getAbsolutePath())); + } + if (!discoveryScriptFile.canExecute()) { + throw new FlinkException(String.format("The discovery script %s is not executable.", discoveryScriptFile.getAbsolutePath())); + } + + args = config.getString(DISCOVERY_SCRIPT_ARG); + } + + @Override + public Set<GPUInfo> retrieveResourceInfo(long gpuAmount) throws Exception { + Preconditions.checkArgument(gpuAmount > 0, "The gpuAmount should be positive while finding " + gpuAmount); + + final Set<GPUInfo> gpuResources = new HashSet<>(); + String output = executeDiscoveryScript(discoveryScriptFile, gpuAmount, args); + if (!output.isEmpty()) { + String[] indexes = output.split(","); + for (String index : indexes) { + if (!StringUtils.isNullOrWhitespaceOnly(index)) { + gpuResources.add(new GPUInfo(index.trim())); + } + } + } + LOG.info("Discover GPU resources: {}.", gpuResources); + return Collections.unmodifiableSet(gpuResources); + } + + private String executeDiscoveryScript(File discoveryScript, long gpuAmount, String args) throws Exception { + final String cmd = discoveryScript.getAbsolutePath() + " " + gpuAmount + " " + args; + final Process process = Runtime.getRuntime().exec(cmd); Review comment: We should add a finally block to the end of the try-with-resources block which create the `BufferedReaders` and make sure that `process` is being destroyed. We can do this by calling `Process.destroyForcibly()`. ########## File path: flink-external-resources/flink-external-resource-gpu/src/main/java/org/apache/flink/externalresource/gpu/GPUDriver.java ########## @@ -0,0 +1,143 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.externalresource.gpu; + +import org.apache.flink.annotation.VisibleForTesting; +import org.apache.flink.api.common.externalresource.ExternalResourceDriver; +import org.apache.flink.configuration.ConfigConstants; +import org.apache.flink.configuration.ConfigOption; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.configuration.ExternalResourceOptions; +import org.apache.flink.configuration.IllegalConfigurationException; +import org.apache.flink.util.FlinkException; +import org.apache.flink.util.Preconditions; +import org.apache.flink.util.StringUtils; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.InputStreamReader; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashSet; +import java.util.Set; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; + +import static org.apache.flink.configuration.ConfigOptions.key; + +/** + * Driver takes the responsibility to discover GPU resources and provide the GPU resource information. + * It get the GPU information by executing user-defined discovery script. + */ +class GPUDriver implements ExternalResourceDriver { + + private static final Logger LOG = LoggerFactory.getLogger(GPUDriver.class); + + private static final long DISCOVERY_SCRIPT_TIMEOUT_MS = 10000; + + @VisibleForTesting + static final ConfigOption<String> DISCOVERY_SCRIPT_PATH = + key("discovery-script.path") + .stringType() + .defaultValue(String.format("%s/external-resource-gpu/nvidia-gpu-discovery.sh", ConfigConstants.DEFAULT_FLINK_PLUGINS_DIRS)); + + @VisibleForTesting + static final ConfigOption<String> DISCOVERY_SCRIPT_ARG = + key("discovery-script.args") + .stringType() + .noDefaultValue(); + + private final File discoveryScriptFile; + private final String args; + + GPUDriver(Configuration config) throws Exception { + final String discoveryScriptPathStr = config.getString(DISCOVERY_SCRIPT_PATH); + if (StringUtils.isNullOrWhitespaceOnly(discoveryScriptPathStr)) { + throw new IllegalConfigurationException( + String.format("GPU discovery script ('%s') is not configured.", ExternalResourceOptions.genericKeyWithSuffix(DISCOVERY_SCRIPT_PATH.key()))); + } + + Path discoveryScriptPath = Paths.get(discoveryScriptPathStr); + if (!discoveryScriptPath.isAbsolute()) { + discoveryScriptPath = Paths.get(System.getenv().getOrDefault(ConfigConstants.ENV_FLINK_HOME_DIR, "."), discoveryScriptPathStr); + } + discoveryScriptFile = discoveryScriptPath.toFile(); + + if (!discoveryScriptFile.exists()) { + throw new FileNotFoundException(String.format("The gpu discovery script does not exist in path %s.", discoveryScriptFile.getAbsolutePath())); + } + if (!discoveryScriptFile.canExecute()) { + throw new FlinkException(String.format("The discovery script %s is not executable.", discoveryScriptFile.getAbsolutePath())); + } + + args = config.getString(DISCOVERY_SCRIPT_ARG); + } + + @Override + public Set<GPUInfo> retrieveResourceInfo(long gpuAmount) throws Exception { + Preconditions.checkArgument(gpuAmount > 0, "The gpuAmount should be positive while finding " + gpuAmount); Review comment: ```suggestion Preconditions.checkArgument(gpuAmount > 0, "The gpuAmount should be positive when retrieving the GPU resource information."); ``` ########## File path: flink-external-resources/flink-external-resource-gpu/src/main/java/org/apache/flink/externalresource/gpu/GPUDriver.java ########## @@ -0,0 +1,143 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.externalresource.gpu; + +import org.apache.flink.annotation.VisibleForTesting; +import org.apache.flink.api.common.externalresource.ExternalResourceDriver; +import org.apache.flink.configuration.ConfigConstants; +import org.apache.flink.configuration.ConfigOption; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.configuration.ExternalResourceOptions; +import org.apache.flink.configuration.IllegalConfigurationException; +import org.apache.flink.util.FlinkException; +import org.apache.flink.util.Preconditions; +import org.apache.flink.util.StringUtils; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.InputStreamReader; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashSet; +import java.util.Set; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; + +import static org.apache.flink.configuration.ConfigOptions.key; + +/** + * Driver takes the responsibility to discover GPU resources and provide the GPU resource information. + * It get the GPU information by executing user-defined discovery script. + */ +class GPUDriver implements ExternalResourceDriver { + + private static final Logger LOG = LoggerFactory.getLogger(GPUDriver.class); + + private static final long DISCOVERY_SCRIPT_TIMEOUT_MS = 10000; + + @VisibleForTesting + static final ConfigOption<String> DISCOVERY_SCRIPT_PATH = + key("discovery-script.path") + .stringType() + .defaultValue(String.format("%s/external-resource-gpu/nvidia-gpu-discovery.sh", ConfigConstants.DEFAULT_FLINK_PLUGINS_DIRS)); + + @VisibleForTesting + static final ConfigOption<String> DISCOVERY_SCRIPT_ARG = + key("discovery-script.args") + .stringType() + .noDefaultValue(); + + private final File discoveryScriptFile; + private final String args; + + GPUDriver(Configuration config) throws Exception { + final String discoveryScriptPathStr = config.getString(DISCOVERY_SCRIPT_PATH); + if (StringUtils.isNullOrWhitespaceOnly(discoveryScriptPathStr)) { + throw new IllegalConfigurationException( + String.format("GPU discovery script ('%s') is not configured.", ExternalResourceOptions.genericKeyWithSuffix(DISCOVERY_SCRIPT_PATH.key()))); + } + + Path discoveryScriptPath = Paths.get(discoveryScriptPathStr); + if (!discoveryScriptPath.isAbsolute()) { + discoveryScriptPath = Paths.get(System.getenv().getOrDefault(ConfigConstants.ENV_FLINK_HOME_DIR, "."), discoveryScriptPathStr); + } + discoveryScriptFile = discoveryScriptPath.toFile(); + + if (!discoveryScriptFile.exists()) { + throw new FileNotFoundException(String.format("The gpu discovery script does not exist in path %s.", discoveryScriptFile.getAbsolutePath())); + } + if (!discoveryScriptFile.canExecute()) { + throw new FlinkException(String.format("The discovery script %s is not executable.", discoveryScriptFile.getAbsolutePath())); + } + + args = config.getString(DISCOVERY_SCRIPT_ARG); + } + + @Override + public Set<GPUInfo> retrieveResourceInfo(long gpuAmount) throws Exception { + Preconditions.checkArgument(gpuAmount > 0, "The gpuAmount should be positive while finding " + gpuAmount); + + final Set<GPUInfo> gpuResources = new HashSet<>(); + String output = executeDiscoveryScript(discoveryScriptFile, gpuAmount, args); + if (!output.isEmpty()) { + String[] indexes = output.split(","); + for (String index : indexes) { + if (!StringUtils.isNullOrWhitespaceOnly(index)) { + gpuResources.add(new GPUInfo(index.trim())); + } + } + } + LOG.info("Discover GPU resources: {}.", gpuResources); + return Collections.unmodifiableSet(gpuResources); + } + + private String executeDiscoveryScript(File discoveryScript, long gpuAmount, String args) throws Exception { + final String cmd = discoveryScript.getAbsolutePath() + " " + gpuAmount + " " + args; + final Process process = Runtime.getRuntime().exec(cmd); + final BufferedReader stdoutReader = new BufferedReader(new InputStreamReader(process.getInputStream())); + final BufferedReader stderrReader = new BufferedReader(new InputStreamReader(process.getErrorStream())); + final boolean exitInTime = process.waitFor(DISCOVERY_SCRIPT_TIMEOUT_MS, TimeUnit.MILLISECONDS); + if (!exitInTime) { + throw new TimeoutException(String.format("The discovery script executed for over %d ms.", DISCOVERY_SCRIPT_TIMEOUT_MS)); + } + + final int exitVal = process.exitValue(); + if (exitVal != 0) { + final String stdout = stdoutReader.lines().collect(StringBuilder::new, StringBuilder::append, StringBuilder::append).toString(); + final String stderr = stderrReader.lines().collect(StringBuilder::new, StringBuilder::append, StringBuilder::append).toString(); + LOG.error("Discovery script exit with {}. With output {} {}", exitVal, stdout, stderr); Review comment: ```suggestion LOG.warn("Discovery script exit with {}.\nSTDOUT: {}\nSTDERR: {}", exitVal, stdout, stderr); ``` ########## File path: flink-external-resources/flink-external-resource-gpu/src/main/java/org/apache/flink/externalresource/gpu/GPUDriver.java ########## @@ -0,0 +1,143 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.externalresource.gpu; + +import org.apache.flink.annotation.VisibleForTesting; +import org.apache.flink.api.common.externalresource.ExternalResourceDriver; +import org.apache.flink.configuration.ConfigConstants; +import org.apache.flink.configuration.ConfigOption; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.configuration.ExternalResourceOptions; +import org.apache.flink.configuration.IllegalConfigurationException; +import org.apache.flink.util.FlinkException; +import org.apache.flink.util.Preconditions; +import org.apache.flink.util.StringUtils; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.InputStreamReader; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashSet; +import java.util.Set; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; + +import static org.apache.flink.configuration.ConfigOptions.key; + +/** + * Driver takes the responsibility to discover GPU resources and provide the GPU resource information. + * It get the GPU information by executing user-defined discovery script. + */ +class GPUDriver implements ExternalResourceDriver { + + private static final Logger LOG = LoggerFactory.getLogger(GPUDriver.class); + + private static final long DISCOVERY_SCRIPT_TIMEOUT_MS = 10000; + + @VisibleForTesting + static final ConfigOption<String> DISCOVERY_SCRIPT_PATH = + key("discovery-script.path") + .stringType() + .defaultValue(String.format("%s/external-resource-gpu/nvidia-gpu-discovery.sh", ConfigConstants.DEFAULT_FLINK_PLUGINS_DIRS)); + + @VisibleForTesting + static final ConfigOption<String> DISCOVERY_SCRIPT_ARG = + key("discovery-script.args") + .stringType() + .noDefaultValue(); + + private final File discoveryScriptFile; + private final String args; + + GPUDriver(Configuration config) throws Exception { + final String discoveryScriptPathStr = config.getString(DISCOVERY_SCRIPT_PATH); + if (StringUtils.isNullOrWhitespaceOnly(discoveryScriptPathStr)) { + throw new IllegalConfigurationException( + String.format("GPU discovery script ('%s') is not configured.", ExternalResourceOptions.genericKeyWithSuffix(DISCOVERY_SCRIPT_PATH.key()))); + } + + Path discoveryScriptPath = Paths.get(discoveryScriptPathStr); + if (!discoveryScriptPath.isAbsolute()) { + discoveryScriptPath = Paths.get(System.getenv().getOrDefault(ConfigConstants.ENV_FLINK_HOME_DIR, "."), discoveryScriptPathStr); + } + discoveryScriptFile = discoveryScriptPath.toFile(); + + if (!discoveryScriptFile.exists()) { + throw new FileNotFoundException(String.format("The gpu discovery script does not exist in path %s.", discoveryScriptFile.getAbsolutePath())); + } + if (!discoveryScriptFile.canExecute()) { + throw new FlinkException(String.format("The discovery script %s is not executable.", discoveryScriptFile.getAbsolutePath())); + } + + args = config.getString(DISCOVERY_SCRIPT_ARG); + } + + @Override + public Set<GPUInfo> retrieveResourceInfo(long gpuAmount) throws Exception { + Preconditions.checkArgument(gpuAmount > 0, "The gpuAmount should be positive while finding " + gpuAmount); + + final Set<GPUInfo> gpuResources = new HashSet<>(); + String output = executeDiscoveryScript(discoveryScriptFile, gpuAmount, args); + if (!output.isEmpty()) { + String[] indexes = output.split(","); + for (String index : indexes) { + if (!StringUtils.isNullOrWhitespaceOnly(index)) { + gpuResources.add(new GPUInfo(index.trim())); + } + } + } + LOG.info("Discover GPU resources: {}.", gpuResources); + return Collections.unmodifiableSet(gpuResources); + } + + private String executeDiscoveryScript(File discoveryScript, long gpuAmount, String args) throws Exception { + final String cmd = discoveryScript.getAbsolutePath() + " " + gpuAmount + " " + args; + final Process process = Runtime.getRuntime().exec(cmd); + final BufferedReader stdoutReader = new BufferedReader(new InputStreamReader(process.getInputStream())); + final BufferedReader stderrReader = new BufferedReader(new InputStreamReader(process.getErrorStream())); + final boolean exitInTime = process.waitFor(DISCOVERY_SCRIPT_TIMEOUT_MS, TimeUnit.MILLISECONDS); + if (!exitInTime) { + throw new TimeoutException(String.format("The discovery script executed for over %d ms.", DISCOVERY_SCRIPT_TIMEOUT_MS)); + } + + final int exitVal = process.exitValue(); + if (exitVal != 0) { + final String stdout = stdoutReader.lines().collect(StringBuilder::new, StringBuilder::append, StringBuilder::append).toString(); + final String stderr = stderrReader.lines().collect(StringBuilder::new, StringBuilder::append, StringBuilder::append).toString(); + LOG.error("Discovery script exit with {}. With output {} {}", exitVal, stdout, stderr); + throw new FlinkException("Discovery script exit with non-zero."); Review comment: ```suggestion throw new FlinkException(String.format("Discovery script exit with non-zero return code: %s.", exitVal)); ``` ########## File path: flink-external-resources/flink-external-resource-gpu/src/main/java/org/apache/flink/externalresource/gpu/GPUDriver.java ########## @@ -0,0 +1,143 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.externalresource.gpu; + +import org.apache.flink.annotation.VisibleForTesting; +import org.apache.flink.api.common.externalresource.ExternalResourceDriver; +import org.apache.flink.configuration.ConfigConstants; +import org.apache.flink.configuration.ConfigOption; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.configuration.ExternalResourceOptions; +import org.apache.flink.configuration.IllegalConfigurationException; +import org.apache.flink.util.FlinkException; +import org.apache.flink.util.Preconditions; +import org.apache.flink.util.StringUtils; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.InputStreamReader; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashSet; +import java.util.Set; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; + +import static org.apache.flink.configuration.ConfigOptions.key; + +/** + * Driver takes the responsibility to discover GPU resources and provide the GPU resource information. + * It get the GPU information by executing user-defined discovery script. + */ +class GPUDriver implements ExternalResourceDriver { + + private static final Logger LOG = LoggerFactory.getLogger(GPUDriver.class); + + private static final long DISCOVERY_SCRIPT_TIMEOUT_MS = 10000; + + @VisibleForTesting + static final ConfigOption<String> DISCOVERY_SCRIPT_PATH = + key("discovery-script.path") + .stringType() + .defaultValue(String.format("%s/external-resource-gpu/nvidia-gpu-discovery.sh", ConfigConstants.DEFAULT_FLINK_PLUGINS_DIRS)); + + @VisibleForTesting + static final ConfigOption<String> DISCOVERY_SCRIPT_ARG = + key("discovery-script.args") + .stringType() + .noDefaultValue(); + + private final File discoveryScriptFile; + private final String args; + + GPUDriver(Configuration config) throws Exception { + final String discoveryScriptPathStr = config.getString(DISCOVERY_SCRIPT_PATH); + if (StringUtils.isNullOrWhitespaceOnly(discoveryScriptPathStr)) { + throw new IllegalConfigurationException( + String.format("GPU discovery script ('%s') is not configured.", ExternalResourceOptions.genericKeyWithSuffix(DISCOVERY_SCRIPT_PATH.key()))); + } + + Path discoveryScriptPath = Paths.get(discoveryScriptPathStr); + if (!discoveryScriptPath.isAbsolute()) { + discoveryScriptPath = Paths.get(System.getenv().getOrDefault(ConfigConstants.ENV_FLINK_HOME_DIR, "."), discoveryScriptPathStr); + } + discoveryScriptFile = discoveryScriptPath.toFile(); + + if (!discoveryScriptFile.exists()) { + throw new FileNotFoundException(String.format("The gpu discovery script does not exist in path %s.", discoveryScriptFile.getAbsolutePath())); + } + if (!discoveryScriptFile.canExecute()) { + throw new FlinkException(String.format("The discovery script %s is not executable.", discoveryScriptFile.getAbsolutePath())); + } + + args = config.getString(DISCOVERY_SCRIPT_ARG); + } + + @Override + public Set<GPUInfo> retrieveResourceInfo(long gpuAmount) throws Exception { + Preconditions.checkArgument(gpuAmount > 0, "The gpuAmount should be positive while finding " + gpuAmount); + + final Set<GPUInfo> gpuResources = new HashSet<>(); + String output = executeDiscoveryScript(discoveryScriptFile, gpuAmount, args); + if (!output.isEmpty()) { + String[] indexes = output.split(","); + for (String index : indexes) { + if (!StringUtils.isNullOrWhitespaceOnly(index)) { + gpuResources.add(new GPUInfo(index.trim())); + } + } + } + LOG.info("Discover GPU resources: {}.", gpuResources); + return Collections.unmodifiableSet(gpuResources); + } + + private String executeDiscoveryScript(File discoveryScript, long gpuAmount, String args) throws Exception { + final String cmd = discoveryScript.getAbsolutePath() + " " + gpuAmount + " " + args; + final Process process = Runtime.getRuntime().exec(cmd); + final BufferedReader stdoutReader = new BufferedReader(new InputStreamReader(process.getInputStream())); + final BufferedReader stderrReader = new BufferedReader(new InputStreamReader(process.getErrorStream())); + final boolean exitInTime = process.waitFor(DISCOVERY_SCRIPT_TIMEOUT_MS, TimeUnit.MILLISECONDS); Review comment: ```suggestion final boolean hasProcessTerminated = process.waitFor(DISCOVERY_SCRIPT_TIMEOUT_MS, TimeUnit.MILLISECONDS); ``` ########## File path: flink-external-resources/flink-external-resource-gpu/src/main/resources/gpu-discovery-common.sh ########## @@ -0,0 +1,92 @@ +#!/usr/bin/env bash +################################################################################ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +non_coordination_allocate() { + indexes=($1) + amount=$2 + to_occupy_indexes=(${indexes[@]:0:$amount}) + if [ $amount -gt ${#to_occupy_indexes[@]} ]; then + echo "Could not get enough GPU resources." + exit 1 + fi + echo ${to_occupy_indexes[@]} | sed 's/ /,/g' +} + +coordination_allocate() { + indexes=($1) + amount=$2 + coordination_file=${3:-/var/tmp/flink-gpu-coordination} + ( + flock -x 200 + # GPU indexes to be occupied. + to_occupy_indexes=() + # GPU indexes which are already recorded in the coordination file. These indexes should not be occupied unless the associated + # processes are no longer alive. + recorded_indexes=() + for i in ${indexes[@]} + do + if [ ${#to_occupy_indexes[@]} -eq $amount ]; then + break + elif [ `grep -c "^$i " $coordination_file` -ne 0 ]; then + recorded_indexes[${#recorded_indexes[@]}]=$i + else + to_occupy_indexes[${#to_occupy_indexes[@]}]=$i + fi + done + + # If there is no enough indexes, we will try to occupy indexes whose associated processes are dead. Review comment: ```suggestion # If there are not enough indexes, we will try to occupy indexes whose associated processes are dead. ``` ########## File path: flink-external-resources/flink-external-resource-gpu/src/test/java/org/apache/flink/externalresource/gpu/GPUDiscoveryScriptTest.java ########## @@ -0,0 +1,74 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.externalresource.gpu; + +import org.apache.flink.util.OperatingSystem; +import org.apache.flink.util.TestLogger; + +import org.junit.Test; + +import java.io.BufferedReader; +import java.io.InputStreamReader; + +import static org.junit.Assume.assumeTrue; + +/** + * Tests for the gpu-discovery-common.sh. + */ +public class GPUDiscoveryScriptTest extends TestLogger { + + private static final String TEST_SCRIPT_PATH = "src/test/resources/test-coordination-mode.sh"; + + @Test + public void testNonCoordinationMode() throws Exception { + assumeTrue(OperatingSystem.isLinux()); + testExistWithNonZero("test_non_coordination_mode"); + } + + @Test + public void testCoordinateIndexes() throws Exception { + assumeTrue(OperatingSystem.isLinux()); + testExistWithNonZero("test_coordinate_indexes"); + } + + @Test + public void testPreemptFromDeadProcesses() throws Exception { + assumeTrue(OperatingSystem.isLinux()); + testExistWithNonZero("test_preempt_from_dead_processes"); + } + + @Test + public void testSetCoordinationFile() throws Exception { + assumeTrue(OperatingSystem.isLinux()); + testExistWithNonZero("test_coordination_file"); + } + + private void testExistWithNonZero(String cmd) throws Exception { + final ProcessBuilder processBuilder = new ProcessBuilder(TEST_SCRIPT_PATH, cmd); + processBuilder.redirectErrorStream(true); + final Process process = processBuilder.start(); + final int exitValue = process.waitFor(); + if (exitValue != 0) { + final BufferedReader stdoutReader = new BufferedReader(new InputStreamReader(process.getInputStream())); + final String stdout = stdoutReader.lines().collect(StringBuilder::new, StringBuilder::append, StringBuilder::append).toString(); + log.error("Script exit with non-zero {}, output: {}", exitValue, stdout); Review comment: I would suggest to add the `exitValue` and `stdout` to the thrown `Exception` instead of logging it here yourself. The thrown exception will be logged by Junit. ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [email protected]
