ayushtkn commented on code in PR #414: URL: https://github.com/apache/tez/pull/414#discussion_r2092566970
########## dev-support/bin/tez_run_example.sh: ########## @@ -0,0 +1,119 @@ + +# This script is used to set up a local Hadoop and Tez environment for running a simple word count example. +# Prerequisites +# 1. java is installed and JAVA_HOME is set +# 2. ssh localhost works without password + +# configure this if needed, by default it will use the latest stable versions in the current directory +export TEZ_VERSION=$(curl -s "https://downloads.apache.org/tez/" | grep -oP '\K[0-9]+\.[0-9]+\.[0-9]+(?=/)' | sort -V | tail -1) # e.g. 0.10.4 +export HADOOP_VERSION=$(curl -s "https://downloads.apache.org/hadoop/common/" | grep -oP 'hadoop-\K[0-9]+\.[0-9]+\.[0-9]+(?=/)' | sort -V | tail -1) # e.g. 3.4.1 +export HADOOP_STACK_HOME=$PWD + +echo "Demo script is running in $HADOOP_STACK_HOME with TEZ version $TEZ_VERSION and HADOOP version $HADOOP_VERSION" + +cd $HADOOP_STACK_HOME +wget -nc https://archive.apache.org/dist/hadoop/common/hadoop-$HADOOP_VERSION/hadoop-$HADOOP_VERSION.tar.gz +wget -nc https://archive.apache.org/dist/tez/$TEZ_VERSION/apache-tez-$TEZ_VERSION-bin.tar.gz + +if [ ! -d "hadoop-$HADOOP_VERSION" ]; then + tar -xzf hadoop-$HADOOP_VERSION.tar.gz +fi + +if [ ! 
-d "apache-tez-$TEZ_VERSION-bin" ]; then + tar -xzf apache-tez-$TEZ_VERSION-bin.tar.gz +fi + +ln -s hadoop-$HADOOP_VERSION hadoop +ln -s apache-tez-$TEZ_VERSION-bin tez + +export HADOOP_HOME=$HADOOP_STACK_HOME/hadoop +export TEZ_HOME=$HADOOP_STACK_HOME/tez +export HADOOP_CLASSPATH=$TEZ_HOME/*:$TEZ_HOME/lib/*:$TEZ_HOME/conf + +export PATH=$PATH:$HADOOP_HOME/bin + +# https://hadoop.apache.org/docs/stable/hadoop-project-dist/hadoop-common/SingleCluster.html#Pseudo-Distributed_Operation +cat <<EOF > $HADOOP_HOME/etc/hadoop/hdfs-site.xml +<?xml version="1.0" encoding="UTF-8"?> +<?xml-stylesheet type="text/xsl" href="configuration.xsl"?> + +<configuration> + <property> + <name>dfs.replication</name> + <value>1</value> + </property> +</configuration> +EOF + +cat <<EOF > $HADOOP_HOME/etc/hadoop/core-site.xml +<?xml version="1.0" encoding="UTF-8"?> +<?xml-stylesheet type="text/xsl" href="configuration.xsl"?> + +<configuration> + <property> + <name>fs.defaultFS</name> + <value>hdfs://localhost:9000</value> + </property> +</configuration> +EOF + +cat <<EOF > $HADOOP_HOME/etc/hadoop/yarn-site.xml +<?xml version="1.0" encoding="UTF-8"?> +<?xml-stylesheet type="text/xsl" href="configuration.xsl"?> + +<configuration> + <property> + <name>yarn.nodemanager.aux-services</name> + <value>mapreduce_shuffle</value> + </property> +</configuration> +EOF + +# optionally stop previous clusters if any +#$HADOOP_HOME/sbin/stop-dfs.sh +#$HADOOP_HOME/sbin/stop-yarn.sh + +hdfs namenode -format + +$HADOOP_HOME/sbin/start-dfs.sh +$HADOOP_HOME/sbin/start-yarn.sh + +hadoop fs -mkdir /apps/ +hadoop fs -mkdir /apps/tez-$TEZ_VERSION Review Comment: ``` hadoop fs -mkdir -p /apps/tez-$TEZ_VERSION ``` ########## dev-support/bin/tez_run_example.sh: ########## @@ -0,0 +1,119 @@ + +# This script is used to set up a local Hadoop and Tez environment for running a simple word count example. +# Prerequisites +# 1. java is installed and JAVA_HOME is set +# 2. 
ssh localhost works without password + +# configure this if needed, by default it will use the latest stable versions in the current directory +export TEZ_VERSION=$(curl -s "https://downloads.apache.org/tez/" | grep -oP '\K[0-9]+\.[0-9]+\.[0-9]+(?=/)' | sort -V | tail -1) # e.g. 0.10.4 +export HADOOP_VERSION=$(curl -s "https://downloads.apache.org/hadoop/common/" | grep -oP 'hadoop-\K[0-9]+\.[0-9]+\.[0-9]+(?=/)' | sort -V | tail -1) # e.g. 3.4.1 Review Comment: Shouldn't the Hadoop version be taken from the pom? The latest Hadoop release is not always going to work with Tez. ########## dev-support/bin/tez_run_example.sh: ########## @@ -0,0 +1,119 @@ + +# This script is used to set up a local Hadoop and Tez environment for running a simple word count example. +# Prerequisites +# 1. java is installed and JAVA_HOME is set +# 2. ssh localhost works without password + +# configure this if needed, by default it will use the latest stable versions in the current directory +export TEZ_VERSION=$(curl -s "https://downloads.apache.org/tez/" | grep -oP '\K[0-9]+\.[0-9]+\.[0-9]+(?=/)' | sort -V | tail -1) # e.g. 0.10.4 +export HADOOP_VERSION=$(curl -s "https://downloads.apache.org/hadoop/common/" | grep -oP 'hadoop-\K[0-9]+\.[0-9]+\.[0-9]+(?=/)' | sort -V | tail -1) # e.g. 3.4.1 +export HADOOP_STACK_HOME=$PWD + +echo "Demo script is running in $HADOOP_STACK_HOME with TEZ version $TEZ_VERSION and HADOOP version $HADOOP_VERSION" + +cd $HADOOP_STACK_HOME +wget -nc https://archive.apache.org/dist/hadoop/common/hadoop-$HADOOP_VERSION/hadoop-$HADOOP_VERSION.tar.gz +wget -nc https://archive.apache.org/dist/tez/$TEZ_VERSION/apache-tez-$TEZ_VERSION-bin.tar.gz + +if [ ! -d "hadoop-$HADOOP_VERSION" ]; then + tar -xzf hadoop-$HADOOP_VERSION.tar.gz +fi + +if [ ! 
-d "apache-tez-$TEZ_VERSION-bin" ]; then + tar -xzf apache-tez-$TEZ_VERSION-bin.tar.gz +fi + +ln -s hadoop-$HADOOP_VERSION hadoop +ln -s apache-tez-$TEZ_VERSION-bin tez + +export HADOOP_HOME=$HADOOP_STACK_HOME/hadoop +export TEZ_HOME=$HADOOP_STACK_HOME/tez +export HADOOP_CLASSPATH=$TEZ_HOME/*:$TEZ_HOME/lib/*:$TEZ_HOME/conf + +export PATH=$PATH:$HADOOP_HOME/bin + +# https://hadoop.apache.org/docs/stable/hadoop-project-dist/hadoop-common/SingleCluster.html#Pseudo-Distributed_Operation +cat <<EOF > $HADOOP_HOME/etc/hadoop/hdfs-site.xml +<?xml version="1.0" encoding="UTF-8"?> +<?xml-stylesheet type="text/xsl" href="configuration.xsl"?> + +<configuration> + <property> + <name>dfs.replication</name> + <value>1</value> + </property> +</configuration> +EOF + +cat <<EOF > $HADOOP_HOME/etc/hadoop/core-site.xml +<?xml version="1.0" encoding="UTF-8"?> +<?xml-stylesheet type="text/xsl" href="configuration.xsl"?> + +<configuration> + <property> + <name>fs.defaultFS</name> + <value>hdfs://localhost:9000</value> + </property> +</configuration> +EOF + +cat <<EOF > $HADOOP_HOME/etc/hadoop/yarn-site.xml +<?xml version="1.0" encoding="UTF-8"?> +<?xml-stylesheet type="text/xsl" href="configuration.xsl"?> + +<configuration> + <property> + <name>yarn.nodemanager.aux-services</name> + <value>mapreduce_shuffle</value> + </property> +</configuration> +EOF + +# optionally stop previous clusters if any +#$HADOOP_HOME/sbin/stop-dfs.sh +#$HADOOP_HOME/sbin/stop-yarn.sh + +hdfs namenode -format Review Comment: Did you run it twice? Usually, if there was a previous installation and you run namenode -format, it prompts you to confirm that you want to re-format (delete) the existing storage directory, and you need to answer Y: ``` 2025-05-16 13:43:11,940 INFO snapshot.SnapshotManager: SkipList is disabled 2025-05-16 13:43:11,942 INFO util.GSet: Computing capacity for map cachedBlocks 2025-05-16 13:43:11,942 INFO util.GSet: VM type = 64-bit 2025-05-16 13:43:11,942 INFO util.GSet: 0.25% max memory 7.1 GB = 18.2 MB 2025-05-16 13:43:11,942 INFO 
util.GSet: capacity = 2^21 = 2097152 entries 2025-05-16 13:43:11,949 INFO metrics.TopMetrics: NNTop conf: dfs.namenode.top.window.num.buckets = 10 2025-05-16 13:43:11,949 INFO metrics.TopMetrics: NNTop conf: dfs.namenode.top.num.users = 10 2025-05-16 13:43:11,949 INFO metrics.TopMetrics: NNTop conf: dfs.namenode.top.windows.minutes = 1,5,25 2025-05-16 13:43:12,062 INFO namenode.FSNamesystem: Retry cache on namenode is enabled 2025-05-16 13:43:12,062 INFO namenode.FSNamesystem: Retry cache will use 0.03 of total heap and retry cache entry expiry time is 600000 millis 2025-05-16 13:43:12,064 INFO util.GSet: Computing capacity for map NameNodeRetryCache 2025-05-16 13:43:12,064 INFO util.GSet: VM type = 64-bit 2025-05-16 13:43:12,064 INFO util.GSet: 0.029999999329447746% max memory 7.1 GB = 2.2 MB 2025-05-16 13:43:12,064 INFO util.GSet: capacity = 2^18 = 262144 entries Re-format filesystem in Storage Directory root= /tmp/hadoop-ayushsaxena/dfs/name; location= null ? (Y or N) ``` ########## dev-support/bin/tez_run_example.sh: ########## @@ -0,0 +1,119 @@ + +# This script is used to set up a local Hadoop and Tez environment for running a simple word count example. +# Prerequisites +# 1. java is installed and JAVA_HOME is set +# 2. ssh localhost works without password + +# configure this if needed, by default it will use the latest stable versions in the current directory +export TEZ_VERSION=$(curl -s "https://downloads.apache.org/tez/" | grep -oP '\K[0-9]+\.[0-9]+\.[0-9]+(?=/)' | sort -V | tail -1) # e.g. 0.10.4 +export HADOOP_VERSION=$(curl -s "https://downloads.apache.org/hadoop/common/" | grep -oP 'hadoop-\K[0-9]+\.[0-9]+\.[0-9]+(?=/)' | sort -V | tail -1) # e.g. 
3.4.1 +export HADOOP_STACK_HOME=$PWD + +echo "Demo script is running in $HADOOP_STACK_HOME with TEZ version $TEZ_VERSION and HADOOP version $HADOOP_VERSION" + +cd $HADOOP_STACK_HOME +wget -nc https://archive.apache.org/dist/hadoop/common/hadoop-$HADOOP_VERSION/hadoop-$HADOOP_VERSION.tar.gz +wget -nc https://archive.apache.org/dist/tez/$TEZ_VERSION/apache-tez-$TEZ_VERSION-bin.tar.gz + +if [ ! -d "hadoop-$HADOOP_VERSION" ]; then + tar -xzf hadoop-$HADOOP_VERSION.tar.gz +fi + +if [ ! -d "apache-tez-$TEZ_VERSION-bin" ]; then + tar -xzf apache-tez-$TEZ_VERSION-bin.tar.gz +fi + +ln -s hadoop-$HADOOP_VERSION hadoop +ln -s apache-tez-$TEZ_VERSION-bin tez + +export HADOOP_HOME=$HADOOP_STACK_HOME/hadoop +export TEZ_HOME=$HADOOP_STACK_HOME/tez +export HADOOP_CLASSPATH=$TEZ_HOME/*:$TEZ_HOME/lib/*:$TEZ_HOME/conf + +export PATH=$PATH:$HADOOP_HOME/bin + +# https://hadoop.apache.org/docs/stable/hadoop-project-dist/hadoop-common/SingleCluster.html#Pseudo-Distributed_Operation +cat <<EOF > $HADOOP_HOME/etc/hadoop/hdfs-site.xml +<?xml version="1.0" encoding="UTF-8"?> +<?xml-stylesheet type="text/xsl" href="configuration.xsl"?> + +<configuration> + <property> + <name>dfs.replication</name> + <value>1</value> + </property> +</configuration> +EOF + +cat <<EOF > $HADOOP_HOME/etc/hadoop/core-site.xml +<?xml version="1.0" encoding="UTF-8"?> +<?xml-stylesheet type="text/xsl" href="configuration.xsl"?> + +<configuration> + <property> + <name>fs.defaultFS</name> + <value>hdfs://localhost:9000</value> + </property> +</configuration> +EOF + +cat <<EOF > $HADOOP_HOME/etc/hadoop/yarn-site.xml +<?xml version="1.0" encoding="UTF-8"?> +<?xml-stylesheet type="text/xsl" href="configuration.xsl"?> + +<configuration> + <property> + <name>yarn.nodemanager.aux-services</name> + <value>mapreduce_shuffle</value> + </property> +</configuration> +EOF + +# optionally stop previous clusters if any +#$HADOOP_HOME/sbin/stop-dfs.sh +#$HADOOP_HOME/sbin/stop-yarn.sh + +hdfs namenode -format + 
+$HADOOP_HOME/sbin/start-dfs.sh +$HADOOP_HOME/sbin/start-yarn.sh + Review Comment: Can we start the historyserver as well? Otherwise, once the Tez job is done, you can't navigate to it via the ResourceManager UI. ``` $HADOOP_HOME/bin/mapred --daemon start historyserver ``` ########## dev-support/bin/tez_run_example.sh: ########## @@ -0,0 +1,119 @@ + +# This script is used to set up a local Hadoop and Tez environment for running a simple word count example. +# Prerequisites +# 1. java is installed and JAVA_HOME is set +# 2. ssh localhost works without password + +# configure this if needed, by default it will use the latest stable versions in the current directory +export TEZ_VERSION=$(curl -s "https://downloads.apache.org/tez/" | grep -oP '\K[0-9]+\.[0-9]+\.[0-9]+(?=/)' | sort -V | tail -1) # e.g. 0.10.4 +export HADOOP_VERSION=$(curl -s "https://downloads.apache.org/hadoop/common/" | grep -oP 'hadoop-\K[0-9]+\.[0-9]+\.[0-9]+(?=/)' | sort -V | tail -1) # e.g. 3.4.1 +export HADOOP_STACK_HOME=$PWD + +echo "Demo script is running in $HADOOP_STACK_HOME with TEZ version $TEZ_VERSION and HADOOP version $HADOOP_VERSION" + +cd $HADOOP_STACK_HOME +wget -nc https://archive.apache.org/dist/hadoop/common/hadoop-$HADOOP_VERSION/hadoop-$HADOOP_VERSION.tar.gz +wget -nc https://archive.apache.org/dist/tez/$TEZ_VERSION/apache-tez-$TEZ_VERSION-bin.tar.gz + +if [ ! -d "hadoop-$HADOOP_VERSION" ]; then + tar -xzf hadoop-$HADOOP_VERSION.tar.gz +fi + +if [ ! 
-d "apache-tez-$TEZ_VERSION-bin" ]; then + tar -xzf apache-tez-$TEZ_VERSION-bin.tar.gz +fi + +ln -s hadoop-$HADOOP_VERSION hadoop +ln -s apache-tez-$TEZ_VERSION-bin tez + +export HADOOP_HOME=$HADOOP_STACK_HOME/hadoop +export TEZ_HOME=$HADOOP_STACK_HOME/tez +export HADOOP_CLASSPATH=$TEZ_HOME/*:$TEZ_HOME/lib/*:$TEZ_HOME/conf + +export PATH=$PATH:$HADOOP_HOME/bin + +# https://hadoop.apache.org/docs/stable/hadoop-project-dist/hadoop-common/SingleCluster.html#Pseudo-Distributed_Operation +cat <<EOF > $HADOOP_HOME/etc/hadoop/hdfs-site.xml +<?xml version="1.0" encoding="UTF-8"?> +<?xml-stylesheet type="text/xsl" href="configuration.xsl"?> + +<configuration> + <property> + <name>dfs.replication</name> + <value>1</value> + </property> +</configuration> +EOF + +cat <<EOF > $HADOOP_HOME/etc/hadoop/core-site.xml +<?xml version="1.0" encoding="UTF-8"?> +<?xml-stylesheet type="text/xsl" href="configuration.xsl"?> + +<configuration> + <property> + <name>fs.defaultFS</name> + <value>hdfs://localhost:9000</value> + </property> +</configuration> +EOF + +cat <<EOF > $HADOOP_HOME/etc/hadoop/yarn-site.xml +<?xml version="1.0" encoding="UTF-8"?> +<?xml-stylesheet type="text/xsl" href="configuration.xsl"?> + +<configuration> + <property> + <name>yarn.nodemanager.aux-services</name> + <value>mapreduce_shuffle</value> + </property> +</configuration> +EOF + +# optionally stop previous clusters if any +#$HADOOP_HOME/sbin/stop-dfs.sh +#$HADOOP_HOME/sbin/stop-yarn.sh + +hdfs namenode -format + +$HADOOP_HOME/sbin/start-dfs.sh +$HADOOP_HOME/sbin/start-yarn.sh + +hadoop fs -mkdir /apps/ +hadoop fs -mkdir /apps/tez-$TEZ_VERSION +hadoop fs -copyFromLocal $TEZ_HOME/share/tez.tar.gz /apps/tez-$TEZ_VERSION + +# create a simple tez-site.xml +cat <<EOF > $TEZ_HOME/conf/tez-site.xml +<?xml version="1.0" encoding="UTF-8"?> +<?xml-stylesheet type="text/xsl" href="configuration.xsl"?> + +<configuration> + <property> + <name>tez.lib.uris</name> + <value>/apps/tez-$TEZ_VERSION/tez.tar.gz</value> + </property> 
+</configuration> +EOF + +# create a simple input file +cat <<EOF > ./words.txt +Apple +Banana +Car +Apple +Banana +Car +Dog +Elephant +Friend +Game +EOF + +hadoop fs -copyFromLocal words.txt /words.txt + +# finally run the example +hadoop jar $TEZ_HOME/tez-examples-$TEZ_VERSION.jar orderedwordcount /words.txt /words_out Review Comment: Can we use `yarn jar` instead? ########## dev-support/bin/tez_run_example.sh: ########## @@ -0,0 +1,119 @@ + +# This script is used to set up a local Hadoop and Tez environment for running a simple word count example. +# Prerequisites +# 1. java is installed and JAVA_HOME is set +# 2. ssh localhost works without password + +# configure this if needed, by default it will use the latest stable versions in the current directory +export TEZ_VERSION=$(curl -s "https://downloads.apache.org/tez/" | grep -oP '\K[0-9]+\.[0-9]+\.[0-9]+(?=/)' | sort -V | tail -1) # e.g. 0.10.4 +export HADOOP_VERSION=$(curl -s "https://downloads.apache.org/hadoop/common/" | grep -oP 'hadoop-\K[0-9]+\.[0-9]+\.[0-9]+(?=/)' | sort -V | tail -1) # e.g. 3.4.1 +export HADOOP_STACK_HOME=$PWD + +echo "Demo script is running in $HADOOP_STACK_HOME with TEZ version $TEZ_VERSION and HADOOP version $HADOOP_VERSION" + +cd $HADOOP_STACK_HOME +wget -nc https://archive.apache.org/dist/hadoop/common/hadoop-$HADOOP_VERSION/hadoop-$HADOOP_VERSION.tar.gz +wget -nc https://archive.apache.org/dist/tez/$TEZ_VERSION/apache-tez-$TEZ_VERSION-bin.tar.gz + Review Comment: Is some caching possible here? For example, if the archive is already present, we should not download it again. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@tez.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org