Hi all,

this is my first installation of Hadoop HA with QJM (quorum journalnodes) and I've been having a lot of trouble with it for at least a whole week.


The lab is set up as follows:

10.0.0.10 zoo1 solr1 had1
10.0.0.11 zoo2 solr2 had2
10.0.0.12 zoo3 solr3 had3
10.0.0.15                  had4


*.10, *.11, *.12 form the ZooKeeper cluster

*.10 and *.15 act as the Hadoop HA namenodes, while *.11 and *.12 are the datanodes

*.10, *.11, *.12 form the Solr cluster


The ZooKeeper and Solr clusters work fine together, but when I tried to set up Hadoop to store the Solr indexes on it,

everything went wrong.
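
For context, the end goal is to point Solr at the HDFS nameservice, roughly along these lines in solrconfig.xml (the hdfs paths below are only placeholders, not my actual setup):

<directoryFactory name="DirectoryFactory" class="solr.HdfsDirectoryFactory">
  <str name="solr.hdfs.home">hdfs://innen/solr</str>
  <str name="solr.hdfs.confdir">/opt/hadoop/etc/hadoop</str>
</directoryFactory>
<lockType>hdfs</lockType>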


Please find attached the recipe I wrote and the logs I got. ANY help will be truly appreciated, and if you ever come to Italy I'll gladly offer you dinner :)


###############################

RECIPE

###############################
### all VMs
ID=hadoop
useradd -m -d /home/$ID -s /bin/bash $ID -k /etc/skel/
chown $ID:$ID -R /home/$ID
usermod -aG installers hadoop

echo "$ID:D1x2j13%%2nk4" | chpasswd
mkdir -p /data/hadoop/hdfs/namenode /data/hadoop/hdfs/datanode
chown root /data/
chgrp installers /data/
chmod g+rw /data/
chown hadoop:hadoop /data/hadoop -R
echo 'DATA_DIR=/data/hadoop' >> ~/.bashrc
source ~/.bashrc
mkdir -p ~/.ssh
sudo -u hadoop echo '45=+FG76%&df@' > /home/hadoop/http-signature.secret
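
For reference, a quick sanity check along these lines (not part of the recipe itself) should confirm the user and the ownership came out as intended:

id hadoop
ls -ld /data /data/hadoop /data/hadoop/hdfs/namenode /data/hadoop/hdfs/datanode
ls -l /home/hadoop/http-signature.secret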

### namenodes

cd /opt/
HADOOP=hadoop-3.4.0.tar.gz
curl -O https://dlcdn.apache.org/hadoop/common/hadoop-3.4.0/$HADOOP
tar xvzf $HADOOP
ln -sf ./hadoop-3.4.0 ./hadoop
chown -R hadoop:hadoop /opt/hadoop /opt/hadoop-3.4.0 /data/hadoop $HADOOP
su - $ID
mkdir -p ~/.ssh
ssh-keygen -t rsa
cat ~/.ssh/id_rsa.pub > ~/.ssh/authorized_keys
DATA_DIR=/data/hadoop
DIRS='data,name,pid,tmp,yarn-local_dir,journal/node/local/data'

for IP in 10.0.0.{11..12} ; do ssh-copy-id -o StrictHostKeyChecking=accept-new -i ~/.ssh/id_rsa.pub $IP; scp -r ./.ssh/id_rs* $IP:/home/hadoop/.ssh/; chmod 0600 ~/.ssh/authorized_keys ; done
for IP in 10.0.0.{11..12} ; do scp -r /opt/$HADOOP $IP:/opt/ && ssh $IP "cd /opt/ && tar xvzf $HADOOP && ln -sf ./hadoop-3.4.0 ./hadoop" ; done
for IP in 10.0.0.{10..12} ; do ssh -o StrictHostKeyChecking=accept-new $IP "mkdir -p $DATA_DIR/{$DIRS}" ; done
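
Before going on, a check like the following (my addition, not part of the recipe) should confirm that passwordless ssh actually works from the namenode without prompting:

for IP in 10.0.0.{11..12} ; do ssh -o BatchMode=yes $IP hostname ; done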

echo -e "export JAVA_HOME=$(readlink -f /usr/bin/javac | sed "s:bin/javac::")" > /etc/profile.d/hadoop

cat > /opt/hadoop/etc/hadoop/workers << EOF
had3
had4
EOF

--> as both users, root and hadoop

echo -e 'export JAVA_HOME=$(readlink -f /usr/bin/javac | sed "s:bin/javac::")
export PDSH_RCMD_TYPE=ssh
export HADOOP_HOME=/opt/hadoop
export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
export HADOOP_INSTALL=$HADOOP_HOME
export HADOOP_MAPRED_HOME=$HADOOP_HOME
export HADOOP_COMMON_HOME=$HADOOP_HOME
export HADOOP_HDFS_HOME=$HADOOP_HOME
export HADOOP_YARN_HOME=$HADOOP_HOME
export HADOOP_COMMON_LIB_NATIVE_DIR=$HADOOP_HOME/lib/native
export HDFS_NAMENODE_USER="hadoop"
export HDFS_DATANODE_USER="hadoop"
export HDFS_SECONDARYNAMENODE_USER="hadoop"
export HDFS_JOURNALNODE_USER="hadoop"
export YARN_RESOURCEMANAGER_USER="hadoop"
export YARN_NODEMANAGER_USER="hadoop"
export LD_LIBRARY_PATH=$HADOOP_HOME/lib/native
export PATH=$PATH:$HADOOP_HOME/sbin:$HADOOP_HOME/bin' >> ~/.bashrc
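
After sourcing ~/.bashrc, a quick check like this (just a sanity step) should show the expected paths and the Hadoop version:

source ~/.bashrc
echo $HADOOP_HOME $HADOOP_CONF_DIR
hadoop version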

echo -e 'export HADOOP_OS_TYPE=${HADOOP_OS_TYPE:-$(uname -s)}
export HADOOP_PID_DIR=/data/hadoop/pid
export HDFS_DATANODE_OPTS="-Dhadoop.security.logger=ERROR,RFAS"
export HDFS_DATANODE_USER=hadoop
export HDFS_NAMENODE_OPTS="-Dhadoop.security.logger=INFO,RFAS"
export HDFS_NAMENODE_USER=hadoop
export HDFS_SECONDARYNAMENODE_OPTS="-Dhadoop.security.logger=INFO,RFAS"
export HDFS_SECONDARYNAMENODE_USER=hadoop
export HDFS_JOURNALNODE_USER=hadoop
export JAVA_HOME=/usr/lib/jvm/java-17-openjdk-amd64/
export LANG=en_US.UTF-8
export YARN_NODEMANAGER_OPTS="$YARN_NODEMANAGER_OPTS -Djava.security.manager=allow -Djava.net.preferIPv4Stack=true --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED"
export YARN_NODEMANAGER_USER=hadoop
export YARN_RESOURCEMANAGER_OPTS="$YARN_RESOURCEMANAGER_OPTS -Djava.security.manager=allow -Djava.net.preferIPv4Stack=true --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED"
export YARN_RESOURCEMANAGER_USER=hadoop' > /opt/hadoop/etc/hadoop/hadoop-env.sh
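
Since hadoop-env.sh is plain shell that gets sourced, a syntax check such as the following (again my addition, not in the original recipe) can catch broken lines early:

bash -n /opt/hadoop/etc/hadoop/hadoop-env.sh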

cat > /opt/hadoop/etc/hadoop/core-site.xml << EOF
<configuration>
  <property>
    <name>fs.defaultFS</name>
    <value>hdfs://innen</value>
  </property>
  <property>
    <name>hadoop.tmp.dir</name>
    <value>/data/hadoop/tmp/</value>
  </property>
  <property>
    <name>io.file.buffer.size</name>
    <value>131072</value>
  </property>
  <property>
    <name>hadoop.http.authentication.signature.secret.file</name>
    <value>/home/hadoop/http-signature.secret</value>
  </property>
  <property>
    <name>ha.zookeeper.quorum</name>
    <value>had1:2181,had2:2181,had3:2181</value>
  </property>
</configuration>
EOF
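
To confirm the client actually picks this file up, something like the following (hdfs getconf is part of the stock distribution) should echo the nameservice and the quorum back:

hdfs getconf -confKey fs.defaultFS
hdfs getconf -confKey ha.zookeeper.quorum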

cat > /opt/hadoop/etc/hadoop/hdfs-site.xml << EOF
<configuration>
    <property>
        <name>dfs.replication</name>
        <value>3</value>
    </property>
    <property>
        <name>dfs.namenode.name.dir</name>
        <value>file:///data/hadoop/hdfs/namenode</value>
    </property>
    <property>
        <name>dfs.datanode.data.dir</name>
        <value>file:///data/hadoop/hdfs/datanode</value>
    </property>
    <property>
        <name>ha.zookeeper.quorum</name>
        <value>had1:2181,had2:2181,had3:2181</value>
    </property>
<!-- start cluster section -->
    <!-- define the cluster name and the namenodes -->
        <property>
            <name>dfs.nameservices</name>
            <value>innen</value>
        </property>
        <property>
            <name>dfs.ha.namenodes.innen</name>
            <value>had1,had4</value>
        </property>
        <property>
            <name>dfs.namenode.rpc-address.innen.had1</name>
            <value>had1:8020</value>
        </property>
        <property>
            <name>dfs.namenode.rpc-address.innen.had4</name>
            <value>had4:8020</value>
        </property>
    <!-- define the namenode FQDNs -->
        <property>
            <name>dfs.namenode.http-address.innen.had1</name>
            <value>had1:9870</value>
        </property>
        <property>
            <name>dfs.namenode.http-address.innen.had4</name>
            <value>had4:9870</value>
        </property>
    <!-- URI of the journalnode group where the namenodes read/write the edit logs -->
        <property>
            <name>dfs.namenode.shared.edits.dir</name>
            <value>qjournal://had1:8485;had4:8485/innen</value>
        </property>
    <!-- determines the failover -->
        <property>
            <name>dfs.client.failover.proxy.provider.innen</name>
            <value>org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider</value>
        </property>
    <!-- define fencing: kill everything over ssh -->
        <property>
            <name>dfs.ha.fencing.methods</name>
            <value>sshfence</value>
        </property>
        <property>
            <name>dfs.ha.fencing.ssh.private-key-files</name>
            <value>/home/hadoop/.ssh/id_rsa</value>
        </property>
    <!-- absolute path where the journalnodes write -->
        <property>
            <name>dfs.journalnode.edits.dir</name>
            <value>/data/hadoop/journal/node/local/data</value>
        </property>
<!-- end cluster section -->
</configuration>
EOF
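
As a cross-check of the HA section, the following standard commands (added here only as a sanity step) should list both configured namenodes:

hdfs getconf -namenodes
hdfs getconf -confKey dfs.ha.namenodes.innen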

echo -e 'export YARN_RESOURCEMANAGER_OPTS="$YARN_RESOURCEMANAGER_OPTS -Djava.security.manager=allow -Djava.net.preferIPv4Stack=true --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED"
export YARN_NODEMANAGER_OPTS="$YARN_NODEMANAGER_OPTS -Djava.security.manager=allow -Djava.net.preferIPv4Stack=true --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED"' >> /opt/hadoop/etc/hadoop/yarn-env.sh

cat > /opt/hadoop/etc/hadoop/yarn-site.xml << EOF
<configuration>
    <property>
        <name>yarn.resourcemanager.hostname</name>
        <value>had1</value>
    </property>
    <property>
        <name>yarn.nodemanager.aux-services</name>
        <value>mapreduce_shuffle</value>
    </property>
    <property>
        <name>yarn.nodemanager.disk-health-checker.min-healthy-disks</name>
        <value>0</value>
    </property>
    <property>
        <name>yarn.nodemanager.local-dirs</name>
        <value>/data/hadoop/yarn-local_dir</value>
    </property>
</configuration>
EOF

### on the journalnodes

hdfs --daemon start journalnode
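
To verify the journalnodes actually came up, checks along these lines (assuming jps and ss are available on the hosts) should show a JournalNode process listening on 8485:

jps | grep JournalNode
ss -ltn | grep 8485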

### on the main master node

/opt/hadoop/bin/hdfs namenode -format  <-- just once

        --> $HADOOP_HOME/sbin/start-all.sh <--
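
After start-all.sh, something like the following (standard commands; had1/had4 match dfs.ha.namenodes.innen above) should show the daemons running and one active plus one standby namenode:

jps
hdfs haadmin -getServiceState had1
hdfs haadmin -getServiceState had4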

###############################################



One of the errors I get is:


java.net.ConnectException: Call From controlplane/127.0.2.1 to had4:8020 failed on connection exception: java.net.ConnectException: Connection refused; For more details see:  http://wiki.apache.org/hadoop/ConnectionRefused
    at java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
    at java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:77)
    at java.base/jdk.internal.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
    at java.base/java.lang.reflect.Constructor.newInstanceWithCaller(Constructor.java:499)
    at java.base/java.lang.reflect.Constructor.newInstance(Constructor.java:480)
    at org.apache.hadoop.net.NetUtils.wrapWithMessage(NetUtils.java:948)
    at org.apache.hadoop.net.NetUtils.wrapException(NetUtils.java:863)
    at org.apache.hadoop.ipc.Client.getRpcResponse(Client.java:1588)
    at org.apache.hadoop.ipc.Client.call(Client.java:1529)
    at org.apache.hadoop.ipc.Client.call(Client.java:1426)
    at org.apache.hadoop.ipc.ProtobufRpcEngine2$Invoker.invoke(ProtobufRpcEngine2.java:258)
    at org.apache.hadoop.ipc.ProtobufRpcEngine2$Invoker.invoke(ProtobufRpcEngine2.java:139)
    at jdk.proxy2/jdk.proxy2.$Proxy17.rollEditLog(Unknown Source)
    at org.apache.hadoop.hdfs.protocolPB.NamenodeProtocolTranslatorPB.lambda$rollEditLog$4(NamenodeProtocolTranslatorPB.java:139)
    at org.apache.hadoop.ipc.internal.ShadedProtobufHelper.ipc(ShadedProtobufHelper.java:160)
    at org.apache.hadoop.hdfs.protocolPB.NamenodeProtocolTranslatorPB.rollEditLog(NamenodeProtocolTranslatorPB.java:139)
    at org.apache.hadoop.hdfs.server.namenode.ha.EditLogTailer$2.doWork(EditLogTailer.java:419)
    at org.apache.hadoop.hdfs.server.namenode.ha.EditLogTailer$2.doWork(EditLogTailer.java:414)
    at org.apache.hadoop.hdfs.server.namenode.ha.EditLogTailer$MultipleNameNodeProxy.call(EditLogTailer.java:598)
    at java.base/java.util.concurrent.FutureTask.run(FutureTask.java:264)
    at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
    at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
    at java.base/java.lang.Thread.run(Thread.java:840)
Caused by: java.net.ConnectException: Connection refused
    at java.base/sun.nio.ch.Net.pollConnect(Native Method)
    at java.base/sun.nio.ch.Net.pollConnectNow(Net.java:672)
    at java.base/sun.nio.ch.SocketChannelImpl.finishConnect(SocketChannelImpl.java:946)
    at org.apache.hadoop.net.SocketIOWithTimeout.connect(SocketIOWithTimeout.java:205)
    at org.apache.hadoop.net.NetUtils.connect(NetUtils.java:601)
    at org.apache.hadoop.ipc.Client$Connection.setupConnection(Client.java:668)
    at org.apache.hadoop.ipc.Client$Connection.setupIOstreams(Client.java:789)
    at org.apache.hadoop.ipc.Client$Connection.access$3800(Client.java:364)
    at org.apache.hadoop.ipc.Client.getConnection(Client.java:1649)
    at org.apache.hadoop.ipc.Client.call(Client.java:1473)
    ... 14 more
2024-08-09 14:10:01,808 WARN org.apache.hadoop.hdfs.server.namenode.ha.EditLogTailer: Have reached the max loop count (3).
2024-08-09 14:10:01,808 WARN org.apache.hadoop.hdfs.server.namenode.ha.EditLogTailer: Unable to trigger a roll of the active NN
java.util.concurrent.ExecutionException: java.io.IOException: Cannot find any valid remote NN to service request!
    at java.base/java.util.concurrent.FutureTask.report(FutureTask.java:122)
    at java.base/java.util.concurrent.FutureTask.get(FutureTask.java:205)
    at org.apache.hadoop.hdfs.server.namenode.ha.EditLogTailer.triggerActiveLogRoll(EditLogTailer.java:433)
    at org.apache.hadoop.hdfs.server.namenode.ha.EditLogTailer$EditLogTailerThread.doWork(EditLogTailer.java:511)
    at org.apache.hadoop.hdfs.server.namenode.ha.EditLogTailer$EditLogTailerThread.access$700(EditLogTailer.java:477)
    at org.apache.hadoop.hdfs.server.namenode.ha.EditLogTailer$EditLogTailerThread$1.run(EditLogTailer.java:494)
    at org.apache.hadoop.security.SecurityUtil.doAsLoginUserOrFatal(SecurityUtil.java:520)
    at org.apache.hadoop.hdfs.server.namenode.ha.EditLogTailer$EditLogTailerThread.run(EditLogTailer.java:490)
Caused by: java.io.IOException: Cannot find any valid remote NN to service request!
    at org.apache.hadoop.hdfs.server.namenode.ha.EditLogTailer$MultipleNameNodeProxy.call(EditLogTailer.java:609)
    at java.base/java.util.concurrent.FutureTask.run(FutureTask.java:264)
    at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
    at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
    at java.base/java.lang.Thread.run(Thread.java:840)

Port 8020 is actually open only on had1, not on had4.
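
A check along these lines on had4 (assuming ss and jps are available there) shows whether a NameNode process is listening at all:

ss -ltnp | grep 8020
jps | grep NameNode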


and I get a flood of log messages like these:


2024-08-09 14:11:19,893 INFO org.apache.hadoop.ipc.Client: Retrying connect to server: had4/10.0.0.15:8020. Already tried 7 time(s); retry policy is RetryUpToMaximumCountWithFixedSleep(maxRetries=10, sleepTime=1000 MILLISECONDS)
2024-08-09 14:11:20,898 INFO org.apache.hadoop.ipc.Client: Retrying connect to server: had4/10.0.0.15:8020. Already tried 8 time(s); retry policy is RetryUpToMaximumCountWithFixedSleep(maxRetries=10, sleepTime=1000 MILLISECONDS)
2024-08-09 14:11:21,901 INFO org.apache.hadoop.ipc.Client: Retrying connect to server: had4/10.0.0.15:8020. Already tried 9 time(s); retry policy is RetryUpToMaximumCountWithFixedSleep(maxRetries=10, sleepTime=1000 MILLISECONDS)
2024-08-09 14:11:21,903 WARN org.apache.hadoop.hdfs.server.namenode.ha.EditLogTailer: Exception from remote name node RemoteNameNodeInfo [nnId=had4, ipcAddress=had4/10.0.0.15:8020, httpAddress=http://had4:9870], try next.
java.net.ConnectException: Call From controlplane/127.0.2.1 to had4:8020 failed on connection exception: java.net.ConnectException: Connection refused; For more details see:  http://wiki.apache.org/hadoop/ConnectionRefused


Moreover, if I try to list the filesystem with


hdfs dfs -ls /


I only get these errors:

2024-08-09 14:12:16,117 INFO retry.RetryInvocationHandler: org.apache.hadoop.ipc.RemoteException(org.apache.hadoop.ipc.StandbyException): Operation category READ is not supported in state standby. Visit https://s.apache.org/sbnn-error
    at org.apache.hadoop.hdfs.server.namenode.ha.StandbyState.checkOperation(StandbyState.java:108)
    at org.apache.hadoop.hdfs.server.namenode.NameNode$NameNodeHAContext.checkOperation(NameNode.java:2255)
    at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.checkOperation(FSNamesystem.java:1628)
    at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.getFileInfo(FSNamesystem.java:3464)
    at org.apache.hadoop.hdfs.server.namenode.NameNodeRpcServer.getFileInfo(NameNodeRpcServer.java:1229)
    at org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolServerSideTranslatorPB.getFileInfo(ClientNamenodeProtocolServerSideTranslatorPB.java:1052)
    at org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos$ClientNamenodeProtocol$2.callBlockingMethod(ClientNamenodeProtocolProtos.java)
    at org.apache.hadoop.ipc.ProtobufRpcEngine2$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine2.java:621)
    at org.apache.hadoop.ipc.ProtobufRpcEngine2$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine2.java:589)
    at org.apache.hadoop.ipc.ProtobufRpcEngine2$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine2.java:573)
    at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:1227)
    at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:1246)
    at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:1169)
    at java.base/java.security.AccessController.doPrivileged(AccessController.java:712)
    at java.base/javax.security.auth.Subject.doAs(Subject.java:439)
    at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1953)
    at org.apache.hadoop.ipc.Server$Handler.run(Server.java:3203)
, while invoking ClientNamenodeProtocolTranslatorPB.getFileInfo over had1/10.0.0.10:8020 after 2 failover attempts. Trying to failover after sleeping for 2639ms. Current retry count: 2.
2024-08-09 14:12:18,759 INFO retry.RetryInvocationHandler: java.net.ConnectException: Call From controlplane/127.0.2.1 to had4:8020 failed on connection exception: java.net.ConnectException: Connection refused; For more details see:  http://wiki.apache.org/hadoop/ConnectionRefused, while invoking ClientNamenodeProtocolTranslatorPB.getFileInfo over had4/10.0.0.15:8020 after 3 failover attempts. Trying to failover after sleeping for 3943ms. Current retry count: 3.
2024-08-09 14:12:22,706 INFO retry.RetryInvocationHandler: org.apache.hadoop.ipc.RemoteException(org.apache.hadoop.ipc.StandbyException): Operation category READ is not supported in state standby. Visit https://s.apache.org/sbnn-error
    at org.apache.hadoop.hdfs.server.namenode.ha.StandbyState.checkOperation(StandbyState.java:108)
    at org.apache.hadoop.hdfs.server.namenode.NameNode$NameNodeHAContext.checkOperation(NameNode.java:2255)
    at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.checkOperation(FSNamesystem.java:1628)
    at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.getFileInfo(FSNamesystem.java:3464)
    at org.apache.hadoop.hdfs.server.namenode.NameNodeRpcServer.getFileInfo(NameNodeRpcServer.java:1229)
    at org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolServerSideTranslatorPB.getFileInfo(ClientNamenodeProtocolServerSideTranslatorPB.java:1052)
    at org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos$ClientNamenodeProtocol$2.callBlockingMethod(ClientNamenodeProtocolProtos.java)
    at org.apache.hadoop.ipc.ProtobufRpcEngine2$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine2.java:621)
    at org.apache.hadoop.ipc.ProtobufRpcEngine2$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine2.java:589)
    at org.apache.hadoop.ipc.ProtobufRpcEngine2$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine2.java:573)
    at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:1227)
    at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:1246)
    at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:1169)
    at java.base/java.security.AccessController.doPrivileged(AccessController.java:712)
    at java.base/javax.security.auth.Subject.doAs(Subject.java:439)
    at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1953)
    at org.apache.hadoop.ipc.Server$Handler.run(Server.java:3203)
, while invoking ClientNamenodeProtocolTranslatorPB.getFileInfo over had1/10.0.0.10:8020 after 4 failover attempts. Trying to failover after sleeping for 10146ms. Current retry count: 4.

I'm really out of ideas.


Thanks in advance,

Sincerely

Roberto


