This is an automated email from the ASF dual-hosted git repository.

shanthoosh pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/samza.git


The following commit(s) were added to refs/heads/master by this push:
     new 82c53aaad SAMZA-2804: Concurrency issues identified in run-class.sh on samza-yarn (#1716)
82c53aaad is described below

commit 82c53aaad53c69f99ef5130658d9f93093a1e477
Author: Jon Bringhurst <j...@bringhurst.org>
AuthorDate: Fri May 2 15:22:30 2025 -0700

    SAMZA-2804: Concurrency issues identified in run-class.sh on samza-yarn (#1716)
    
    * Add annotations for each line identified as having a potential issue.
    
    * Resolve multiple concurrency issues
    
    ## Race condition in pathing jar manifest creation
    
    A race condition exists when setting up the classpath during container
    launch.
    
    During container launch using samza-yarn, run-class.sh creates a pathing
    jar file (which holds the classpath for the container launch). However,
    during the creation of this pathing jar, the temporary files and the
    pathing jar itself are not placed in a location unique to the container.
    As a result, multiple containers write to the same pathing jar location
    and the same temporary file location, which causes a race condition.
    
    This race condition may show up in several ways, such as when YARN removes
    jars from a finished container (other containers will then point to a
    classpath that no longer exists) or when multiple run-class.sh scripts
    attempt to write the manifest.txt or pathing jar at the same time.
    
    Note that enabling host affinity makes this problem worse. The pathing.jar
    is written to the usercache, so when the container which created the
    pathing.jar is finished and removed, any new container which launches on
    that host will point to jar files which no longer exist. When host
    affinity is enabled, the failing container will not move to a new host
    and will just keep failing.
    
    ## Container logging directory fallback is not unique for each container
    
    The fallback log directory is the same for all containers running on the
    same host. It should be unique per container.
    
    ## Container tmp dir is not unique per-container
    
    The JAVA_TEMP_DIR directory is the same for all containers. We should make
    sure that it's safe to use the same directory for all containers.
    
    * Simplify comments and print manifest file locations
---
 samza-shell/src/main/bash/run-class.sh           | 29 ++++++++++++++++++++----
 samza-shell/src/main/bash/run-framework-class.sh | 26 +++++++++++++++++----
 2 files changed, 45 insertions(+), 10 deletions(-)

diff --git a/samza-shell/src/main/bash/run-class.sh 
b/samza-shell/src/main/bash/run-class.sh
index 1669332b5..2334f1d97 100755
--- a/samza-shell/src/main/bash/run-class.sh
+++ b/samza-shell/src/main/bash/run-class.sh
@@ -30,7 +30,12 @@ cd $home_dir
 
 echo "Current time: $(date '+%Y-%m-%d %H:%M:%S')"
 
+# For example, home_dir looks like:
+# /<hadoop dir>/usercache/<linux 
account>/appcache/application_1745893616511_0059/container_e64_1745893616511_0059_01_002027
 echo home_dir=$home_dir
+
+# For example, base_dir looks like:
+# /<hadoop path>/usercache/<linux 
account>/appcache/application_1745893616511_0059/container_e64_1745893616511_0059_01_002027/__package
 echo "framework base (location of this script). base_dir=$base_dir"
 
 if [ ! -d "$base_dir/lib" ]; then
@@ -78,15 +83,23 @@ fi
 # permissions for the classpath-related files when they are in their own 
directory. An example of where
 # this is helpful is when using container images which might have predefined 
permissions for certain
 # directories.
-CLASSPATH_WORKSPACE_DIR=$base_dir/classpath_workspace
+
+# For example, CLASSPATH_WORKSPACE_DIR looks like:
+# /<hadoop dir>/usercache/<linux 
account>/appcache/application_1745893616511_0059/container_e64_1745893616511_0059_01_002027/classpath_workspace
+CLASSPATH_WORKSPACE_DIR=$home_dir/classpath_workspace
 mkdir -p $CLASSPATH_WORKSPACE_DIR
+
 # file containing the classpath string; used to avoid passing long classpaths 
directly to the jar command
 PATHING_MANIFEST_FILE=$CLASSPATH_WORKSPACE_DIR/manifest.txt
+echo "Pathing manifest txt located at $PATHING_MANIFEST_FILE"
+
 # jar file to include on the classpath for running the main class
 PATHING_JAR_FILE=$CLASSPATH_WORKSPACE_DIR/pathing.jar
+echo "Pathing manifest jar located at $PATHING_JAR_FILE"
 
 # Newlines and spaces are intended to ensure proper parsing of manifest in 
pathing jar
 printf "Class-Path: \n $CLASSPATH \n" > $PATHING_MANIFEST_FILE
+
 # Creates a new archive and adds custom manifest information to pathing.jar
 eval "$JAR -cvmf $PATHING_MANIFEST_FILE $PATHING_JAR_FILE"
 
@@ -97,12 +110,18 @@ else
 fi
 
 if [ -z "$SAMZA_LOG_DIR" ]; then
-  SAMZA_LOG_DIR="$base_dir"
+  # SAMZA_LOG_DIR will point to the symlink located at:
+  # /<hadoop dir>/usercache/<linux 
account>/appcache/application_1745893616511_0059/container_e64_1745893616511_0059_01_002027/logs
+  #
+  # When the symlink is resolved, this path will point to:
+  # /<hadoop 
dir>/userlogs/application_1745893616511_0059/container_e64_1745893616511_0059_01_002027
+  SAMZA_LOG_DIR="$home_dir/logs"
 fi
 
-# add usercache directory
-mkdir -p $base_dir/tmp
-JAVA_TEMP_DIR=$base_dir/tmp
+# JAVA_TEMP_DIR will point to a path similar to:
+# /<hadoop dir>/usercache/<linux 
account>/appcache/application_1745893616511_0059/container_e64_1745893616511_0059_01_002027/tmp
+mkdir -p $home_dir/tmp
+JAVA_TEMP_DIR=$home_dir/tmp
 
 # Check whether the JVM supports GC Log rotation, and enable it if so.
 function check_and_enable_gc_log_rotation {
diff --git a/samza-shell/src/main/bash/run-framework-class.sh 
b/samza-shell/src/main/bash/run-framework-class.sh
old mode 100644
new mode 100755
index 342047c58..cb652521e
--- a/samza-shell/src/main/bash/run-framework-class.sh
+++ b/samza-shell/src/main/bash/run-framework-class.sh
@@ -28,7 +28,12 @@ cd $base_dir
 base_dir=`pwd`
 cd $home_dir
 
+# Note: When using samza-yarn, home_dir looks like:
+# /<hadoop dir>/usercache/<linux 
account>/appcache/application_1745893616511_0059/container_e64_1745893616511_0059_01_002027
 echo home_dir=$home_dir
+
+# Note: When using samza-yarn, base_dir looks like:
+# /<hadoop path>/usercache/<linux 
account>/appcache/application_1745893616511_0059/container_e64_1745893616511_0059_01_002027/__package
 echo "framework base (location of this script). base_dir=$base_dir"
 
 if [ ! -d "$base_dir/lib" ]; then
@@ -107,10 +112,15 @@ fi
 # permissions for the classpath-related files when they are in their own 
directory. An example of where
 # this is helpful is when using container images which might have predefined 
permissions for certain
 # directories.
-CLASSPATH_WORKSPACE_DIR=$base_dir/classpath_workspace
+
+# Note: When on samza-yarn, CLASSPATH_WORKSPACE_DIR looks like:
+# /<hadoop dir>/usercache/<linux 
account>/appcache/application_1745893616511_0059/container_e64_1745893616511_0059_01_002027/classpath_workspace
+CLASSPATH_WORKSPACE_DIR=$home_dir/classpath_workspace
 mkdir -p $CLASSPATH_WORKSPACE_DIR
+
 # file containing the classpath string; used to avoid passing long classpaths 
directly to the jar command
 PATHING_MANIFEST_FILE=$CLASSPATH_WORKSPACE_DIR/manifest.txt
+
 # jar file to include on the classpath for running the main class
 PATHING_JAR_FILE=$CLASSPATH_WORKSPACE_DIR/pathing.jar
 
@@ -126,12 +136,18 @@ else
 fi
 
 if [ -z "$SAMZA_LOG_DIR" ]; then
-  SAMZA_LOG_DIR="$base_dir"
+  # When on samza-yarn, SAMZA_LOG_DIR will point to the symlink located at:
+  # /<hadoop dir>/usercache/<linux 
account>/appcache/application_1745893616511_0059/container_e64_1745893616511_0059_01_002027/logs
+  #
+  # When the symlink is resolved, this path will point to:
+  # /<hadoop 
dir>/userlogs/application_1745893616511_0059/container_e64_1745893616511_0059_01_002027
+  SAMZA_LOG_DIR="$home_dir"
 fi
 
-# add usercache directory
-mkdir -p $base_dir/tmp
-JAVA_TEMP_DIR=$base_dir/tmp
+# When on samza-yarn, JAVA_TEMP_DIR will point to a path similar to:
+# /<hadoop dir>/usercache/<linux 
account>/appcache/application_1745893616511_0059/container_e64_1745893616511_0059_01_002027/tmp
+mkdir -p $home_dir/tmp
+JAVA_TEMP_DIR=$home_dir/tmp
 
 # Check whether the JVM supports GC Log rotation, and enable it if so.
 function check_and_enable_gc_log_rotation {

Reply via email to