jiangshouzhuang created HDDS-6442:
-------------------------------------
Summary: SCM HA on Kubernetes failed
Key: HDDS-6442
URL: https://issues.apache.org/jira/browse/HDDS-6442
Project: Apache Ozone
Issue Type: Bug
Components: SCM
Affects Versions: 1.2.1
Environment: The environment information is as follows:
1. Kubernetes: K8s environment deployed with RKE2
RKE2 version v1.21.9+rke2r1
2. Ozone version: 1.2.1
Reporter: jiangshouzhuang
The SCM HA environment was deployed on Kubernetes. The test process is as
follows:
h2. # K8s yaml
config-configmap.yaml:
{code:java}
apiVersion: v1
kind: ConfigMap
metadata:
name: config
data:
CORE-SITE.XML_fs.defaultFS: ofs://cluster1/
OZONE-SITE.XML_ozone.om.ratis.enable: "true"
OZONE-SITE.XML_ozone.om.service.ids: cluster1
OZONE-SITE.XML_ozone.om.nodes.cluster1: om0,om1,om2
OZONE-SITE.XML_ozone.om.address.cluster1.om0:
om-0.om.default.svc.cluster.local
OZONE-SITE.XML_ozone.om.address.cluster1.om1:
om-1.om.default.svc.cluster.local
OZONE-SITE.XML_ozone.om.address.cluster1.om2:
om-2.om.default.svc.cluster.local
OZONE-SITE.XML_ozone.scm.ratis.enable: "true"
OZONE-SITE.XML_ozone.scm.service.ids: cluster1
OZONE-SITE.XML_ozone.scm.nodes.cluster1: scm0,scm1,scm2
OZONE-SITE.XML_ozone.scm.address.cluster1.scm0:
scm-0.scm.default.svc.cluster.local
OZONE-SITE.XML_ozone.scm.address.cluster1.scm1:
scm-1.scm.default.svc.cluster.local
OZONE-SITE.XML_ozone.scm.address.cluster1.scm2:
scm-2.scm.default.svc.cluster.local
OZONE-SITE.XML_ozone.scm.primordial.node.id:
scm-0.scm.default.svc.cluster.local
OZONE-SITE.XML_ozone.scm.block.client.address:
scm-0.scm.default.svc.cluster.local
OZONE-SITE.XML_ozone.scm.client.address: scm-0.scm.default.svc.cluster.local
OZONE-SITE.XML_hdds.scm.safemode.min.datanode: "3"
OZONE-SITE.XML_ozone.scm.names:
scm-0.scm.default.svc.cluster.local,scm-1.scm.default.svc.cluster.local,scm-2.scm.default.svc.cluster.local
OZONE-SITE.XML_ozone.metadata.dirs: /data/metadata
OZONE-SITE.XML_ozone.om.db.dirs: /data/om-db/data
OZONE-SITE.XML_ozone.scm.db.dirs: /data/scm-db/data
OZONE-SITE.XML_dfs.container.ratis.datanode.storage.dir:
/data/datanode/ratis/data
#OZONE-SITE.XML_hdds.datanode.dir: /data/datanode/storage
OZONE-SITE.XML_ozone.scm.datanode.id.dir: /data/datanode
OZONE-SITE.XML_ozone.scm.container.size: 1GB
OZONE-SITE.XML_ozone.scm.block.size: "128MB"
OZONE-SITE.XML_ozone.scm.datanode.ratis.volume.free-space.min: 10MB
OZONE-SITE.XML_ozone.recon.om.db.dir: /data/recon/om
OZONE-SITE.XML_ozone.recon.scm.db.dirs: /data/recon/scm
OZONE-SITE.XML_ozone.om.http-address: om-0.om.default.svc.cluster.local:9874
OZONE-SITE.XML_ozone.recon.http-address:
recon-0.recon.default.svc.cluster.local:19888
OZONE-SITE.XML_ozone.recon.address:
recon-0.recon.default.svc.cluster.local:9891
LOG4J.PROPERTIES_log4j.rootLogger: INFO, stdout
LOG4J.PROPERTIES_log4j.appender.stdout: org.apache.log4j.ConsoleAppender
LOG4J.PROPERTIES_log4j.appender.stdout.layout: org.apache.log4j.PatternLayout
LOG4J.PROPERTIES_log4j.appender.stdout.layout.ConversionPattern:
'%d{yyyy-MM-dd
HH:mm:ss} %-5p %c{1}:%L - %m%n' {code}
scm-statefulset.yaml
{code:java}
apiVersion: apps/v1
kind: StatefulSet
metadata:
name: scm
labels:
app.kubernetes.io/component: ozone
spec:
selector:
matchLabels:
app: ozone
component: scm
serviceName: scm
replicas: 3
template:
metadata:
labels:
app: ozone
component: scm
annotations:
prometheus.io/scrape: "true"
prometheus.io/port: "9876"
prometheus.io/path: /prom
spec:
securityContext:
fsGroup: 1000
initContainers:
- name: init
image: 'apache/ozone:1.2.1'
args:
- ozone
- scm
- --init
envFrom:
- configMapRef:
name: config
volumeMounts:
- name: data
mountPath: /data/scm-db/data
- name: bootstrap
image: 'apache/ozone:1.2.1'
args:
- ozone
- scm
- --bootstrap
envFrom:
- configMapRef:
name: config
volumeMounts:
- name: data
mountPath: /data/scm-db/data
containers:
- name: scm
image: 'apache/ozone:1.2.1'
args:
- ozone
- scm
livenessProbe:
tcpSocket:
port: 9861
initialDelaySeconds: 30
envFrom:
- configMapRef:
name: config
volumeMounts:
- name: data
mountPath: /data/scm-db/data
volumeClaimTemplates:
- metadata:
name: data
spec:
storageClassName: local-storage
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 2Gi {code}
scm-service.yaml:
{code:java}
apiVersion: v1
kind: Service
metadata:
name: scm
spec:
ports:
- port: 9876
name: ui
clusterIP: None
selector:
app: ozone
component: scm{code}
h2. # Deploy yaml
{code:java}
kubectl apply -f config-configmap.yaml
kubectl apply -f scm-statefulset.yaml
kubectl apply -f scm-service.yaml{code}
Check Pod running status:
{code:java}
[root@k8s-master01 scm]# kubectl get pods
NAME READY STATUS RESTARTS AGE
centos 1/1 Running 0 3d6h
scm-0 1/1 Running 0 3m30s
scm-1 0/1 Init:1/2 0 3m16s{code}
scm-0 log information:
kubectl logs -f scm-0 init
{code:java}
....
2022-03-13 13:04:41 INFO StateMachineUpdater:281 -
63be5cb3-5b28-46b1-9d95-59f26dd08ca7@group-BE50E0079689-StateMachineUpdater:
Took a snapshot at index 0
2022-03-13 13:04:41 INFO StateMachineUpdater:89 -
63be5cb3-5b28-46b1-9d95-59f26dd08ca7@group-BE50E0079689-StateMachineUpdater:
snapshotIndex: updateIncreasingly -1 -> 0
2022-03-13 13:04:42 INFO RaftServer$Division:419 -
63be5cb3-5b28-46b1-9d95-59f26dd08ca7@group-BE50E0079689: closes. applyIndex: 0
2022-03-13 13:04:42 INFO SegmentedRaftLogWorker:327 -
63be5cb3-5b28-46b1-9d95-59f26dd08ca7@group-BE50E0079689-SegmentedRaftLogWorker
was interrupted, exiting. There are 0 tasks remaining in the queue.
2022-03-13 13:04:42 INFO SegmentedRaftLogWorker:237 -
63be5cb3-5b28-46b1-9d95-59f26dd08ca7@group-BE50E0079689-SegmentedRaftLogWorker
close()
2022-03-13 13:04:42 INFO GrpcService:262 -
63be5cb3-5b28-46b1-9d95-59f26dd08ca7: shutdown server with port 9894 now
2022-03-13 13:04:42 INFO GrpcService:271 -
63be5cb3-5b28-46b1-9d95-59f26dd08ca7: shutdown server with port 9894
successfully
2022-03-13 13:04:42 INFO JvmPauseMonitor:109 -
JvmPauseMonitor-63be5cb3-5b28-46b1-9d95-59f26dd08ca7: Stopped
2022-03-13 13:04:42 INFO StorageContainerManager:1083 - SCM initialization
succeeded. Current cluster id for sd=/data/scm-db/data/scm;
cid=CID-93897332-2d9e-4c35-b96e-be50e0079689; layoutVersion=2;
scmId=63be5cb3-5b28-46b1-9d95-59f26dd08ca7
2022-03-13 13:04:42 INFO StorageContainerManagerStarter:124 - SHUTDOWN_MSG:
/************************************************************
SHUTDOWN_MSG: Shutting down StorageContainerManager at scm-0/10.42.4.196
************************************************************/{code}
As you can see, init executed successfully. Now look at the SCM startup log:
kubectl logs -f scm-0
{code:java}
...
2022-03-13 13:04:48 INFO OneReplicaPipelineSafeModeRule:180 - Total pipeline
count is 0, pipeline's with at least one datanode reported threshold count is 0
2022-03-13 13:04:48 WARN NativeCodeLoader:60 - Unable to load native-hadoop
library for your platform... using builtin-java classes where applicable
2022-03-13 13:04:49 INFO CallQueueManager:90 - Using callQueue: class
java.util.concurrent.LinkedBlockingQueue, queueCapacity: 10000, scheduler:
class org.apache.hadoop.ipc.DefaultRpcScheduler, ipcBackoff: false.
2022-03-13 13:04:49 INFO Server:1265 - Starting Socket Reader #1 for port 9861
2022-03-13 13:04:49 INFO CallQueueManager:90 - Using callQueue: class
java.util.concurrent.LinkedBlockingQueue, queueCapacity: 10000, scheduler:
class org.apache.hadoop.ipc.DefaultRpcScheduler, ipcBackoff: false.
2022-03-13 13:04:49 INFO Server:1265 - Starting Socket Reader #1 for port 9863
2022-03-13 13:04:49 INFO CallQueueManager:90 - Using callQueue: class
java.util.concurrent.LinkedBlockingQueue, queueCapacity: 10000, scheduler:
class org.apache.hadoop.ipc.DefaultRpcScheduler, ipcBackoff: false.
2022-03-13 13:04:49 INFO Server:1265 - Starting Socket Reader #1 for port 9860
2022-03-13 13:04:49 INFO ContainerPlacementPolicyFactory:60 - Create container
placement policy of type
org.apache.hadoop.hdds.scm.container.placement.algorithms.SCMContainerPlacementRandom
2022-03-13 13:04:49 INFO StorageContainerManager:388 -
Container Balancer status:
Key Value
Running false
Container Balancer Configuration values:
Key Value
Threshold 0.1
Max Datanodes to Involve per Iteration(ratio) 0.2
Max Size to Move per Iteration 32212254720B
2022-03-13 13:04:49 WARN EventQueue:194 - No event handler registered for
event TypedEvent{payloadType=SafeModeStatus, name='Safe mode status'}
2022-03-13 13:04:49 INFO SCMContext:205 - Update SafeModeStatus from
SafeModeStatus{safeModeStatus=true, preCheckPassed=false} to
SafeModeStatus{safeModeStatus=true, preCheckPassed=false}.
2022-03-13 13:04:49 INFO StorageContainerManager:1303 -
StorageContainerLocationProtocol RPC server is listening at /0.0.0.0:9860
2022-03-13 13:04:49 INFO SCMRatisServerImpl:177 - starting ratis server
0.0.0.0:9894
2022-03-13 13:04:49 INFO RaftServer$Division:304 -
63be5cb3-5b28-46b1-9d95-59f26dd08ca7@group-BE50E0079689: start with
initializing state, conf=-1: [], old=null
2022-03-13 13:04:49 INFO RaftServer$Division:290 -
63be5cb3-5b28-46b1-9d95-59f26dd08ca7@group-BE50E0079689: changes role from
null to FOLLOWER at term 0 for startInitializing
2022-03-13 13:04:49 INFO JmxRegister:44 - Successfully registered JMX Bean
with object name
Ratis:service=RaftServer,group=group-BE50E0079689,id=63be5cb3-5b28-46b1-9d95-59f26dd08ca7
2022-03-13 13:04:49 INFO RaftServer:374 -
63be5cb3-5b28-46b1-9d95-59f26dd08ca7: start RPC server
2022-03-13 13:04:49 INFO GrpcService:253 -
63be5cb3-5b28-46b1-9d95-59f26dd08ca7: GrpcService started, listening on 9894
2022-03-13 13:04:49 INFO JvmPauseMonitor:103 -
JvmPauseMonitor-63be5cb3-5b28-46b1-9d95-59f26dd08ca7: Started
2022-03-13 13:04:49 INFO SCMNodeInfo:197 - ConfigKey ozone.scm.client.address
is deprecated, For configuring different ports for each SCM use PortConfigKey
ozone.scm.client.port appended with serviceId and nodeId
2022-03-13 13:04:49 INFO SCMNodeInfo:197 - ConfigKey
ozone.scm.block.client.address is deprecated, For configuring different ports
for each SCM use PortConfigKey ozone.scm.block.client.port appended with
serviceId and nodeId
2022-03-13 13:04:49 INFO SCMNodeInfo:197 - ConfigKey
ozone.scm.datanode.address is deprecated, For configuring different ports for
each SCM use PortConfigKey ozone.scm.datanode.port appended with serviceId and
nodeId
2022-03-13 13:04:49 WARN SCMProxyInfo:47 - SCM address
scm-1.scm.default.svc.cluster.local:9863 for serviceID cluster1 remains
unresolved for node ID scm1 Check your ozone-site.xml file to ensure scm
addresses are configured properly.
2022-03-13 13:04:49 WARN SCMProxyInfo:47 - SCM address
scm-2.scm.default.svc.cluster.local:9863 for serviceID cluster1 remains
unresolved for node ID scm2 Check your ozone-site.xml file to ensure scm
addresses are configured properly.
2022-03-13 13:09:48 INFO ReplicationManager:371 - Replication Monitor Thread
took 0 milliseconds for processing 0 containers.
.....{code}
Next, the scm-1 logs:
kubectl logs -f scm-1 bootstrap
{code:java}
......
************************************************************/ 2022-03-13
13:04:51 INFO StorageContainerManagerStarter:90 - registered UNIX signal
handlers for [TERM, HUP, INT]
2022-03-13 13:04:51 INFO SCMHANodeDetails:157 - ServiceID for
StorageContainerManager is null
2022-03-13 13:04:51 INFO SCMHANodeDetails:162 - ozone.scm.default.service.id
is not defined, falling back to ozone.scm.service.ids to find serviceID for
StorageContainerManager if it is HA enabled cluster
2022-03-13 13:04:51 ERROR SCMHANodeDetails:231 - Address for SCM scm2 :
scm-2.scm.default.svc.cluster.local couldn't be resolved. Proceeding with
unresolved host to create Ratis ring.
2022-03-13 13:04:51 INFO SCMHANodeDetails:250 - Found matching SCM address
with SCMServiceId: cluster1, SCMNodeId: scm1, RPC Address:
scm-1.scm.default.svc.cluster.local:9894 and Ratis port: 9894
2022-03-13 13:04:51 INFO SCMHANodeDetails:108 - Setting configuration key
ozone.scm.address with value of key ozone.scm.address.cluster1.scm1:
scm-1.scm.default.svc.cluster.local
2022-03-13 13:04:51 WARN NativeCodeLoader:60 - Unable to load native-hadoop
library for your platform... using builtin-java classes where applicable
2022-03-13 13:04:51 WARN SCMProxyInfo:47 - SCM address
scm-2.scm.default.svc.cluster.local:9863 for serviceID cluster1 remains
unresolved for node ID scm2 Check your ozone-site.xml file to ensure scm
addresses are configured properly.
2022-03-13 13:19:54 INFO RetryInvocationHandler:411 -
com.google.protobuf.ServiceException: java.net.ConnectException: Call From
scm-1/10.42.3.178 to scm-1.scm.default.svc.cluster.local:9863 failed on
connection exception: java.net.ConnectException: Connection refused; For more
details see: http://wiki.apache.org/hadoop/ConnectionRefused, while invoking
$Proxy14.send over
nodeId=scm1,nodeAddress=scm-1.scm.default.svc.cluster.local/10.42.3.178:9863
after 1 failover attempts. Trying to failover after sleeping for 2000ms.
2022-03-13 13:19:56 INFO RetryInvocationHandler:411 -
com.google.protobuf.ServiceException: java.net.UnknownHostException: Invalid
host name: local host is: (unknown); destination host is:
"scm-2.scm.default.svc.cluster.local":9863; java.net.UnknownHostException; For
more details see: http://wiki.apache.org/hadoop/UnknownHost, while invoking
$Proxy14.send over
nodeId=scm2,nodeAddress=scm-2.scm.default.svc.cluster.local:9863 after 2
failover attempts. Trying to failover after sleeping for 2000ms.
{code}
The scm-1 pod gets stuck here and cannot start normally, because it keeps
trying to access scm-2.scm.default.svc.cluster.local. That service is also
unreachable, because under the StatefulSet's ordered pod-creation rules the
scm-2 pod has not been scheduled yet.
--
This message was sent by Atlassian Jira
(v8.20.1#820001)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]