[ 
https://issues.apache.org/jira/browse/HDDS-6345?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17494258#comment-17494258
 ] 

Shawn commented on HDDS-6345:
-----------------------------

This is the OM StatefulSet spec, with some security-sensitive information removed:


{code:yaml}
# OM StatefulSet spec (some security info removed)
# StatefulSet running 3 Ozone Manager (OM) replicas named ozone-om-prod in the
# ozone-prod namespace. Each replica gets its own 200Gi PVC (om-data-pvc)
# mounted at /data.
apiVersion: apps/v1
kind: StatefulSet
metadata:
  labels:
    app.kubernetes.io/instance: RELEASE-NAME
    app.kubernetes.io/name: ozone-om
  name: ozone-om-prod
  namespace: ozone-prod
spec:
  replicas: 3
  revisionHistoryLimit: 3
  # NOTE(review): the OM addresses configured below use the service name
  # ozone-om-prod-om, not ozone-om -- confirm both names resolve as intended.
  serviceName: ozone-om
  updateStrategy:
    type: RollingUpdate
  selector:
    matchLabels:
      app.kubernetes.io/name: ozone-om
      app.kubernetes.io/instance: RELEASE-NAME
  volumeClaimTemplates:
  - metadata:
      name: om-data-pvc
    spec:
      accessModes:
      - ReadWriteOnce
      storageClassName: scaleio
      volumeMode: Filesystem
      resources:
        requests:
          storage: 200Gi
  template:
    metadata:
      labels:
        app.kubernetes.io/instance: RELEASE-NAME
        app.kubernetes.io/name: ozone-om
      name: ozone-om-prod
      namespace: ozone-prod
    spec:
      affinity:
        # Soft (preferred) anti-affinity: try to place OM pods on different hosts.
        podAntiAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
          - podAffinityTerm:
              labelSelector:
                matchLabels:
                  app.kubernetes.io/name: ozone-om
              namespaces:
              - ozone-prod
              topologyKey: kubernetes.io/hostname
            weight: 100
      containers:
      - args:
        - /opt/launch.sh
        - om
        env:
        - name: KERBEROS_SERVER
          value: krb5-server-prod-kdc-hs
        - name: LOG4J.PROPERTIES_log4j.appender.rolling
          value: org.apache.log4j.RollingFileAppender
        - name: LOG4J.PROPERTIES_log4j.appender.rolling.File
          value: /var/log/hadoop/ozone.log
        - name: LOG4J.PROPERTIES_log4j.appender.rolling.MaxBackupIndex
          value: "30"
        - name: LOG4J.PROPERTIES_log4j.appender.rolling.MaxFileSize
          value: 100MB
        - name: LOG4J.PROPERTIES_log4j.appender.rolling.layout
          value: org.apache.log4j.PatternLayout
        - name: LOG4J.PROPERTIES_log4j.appender.rolling.layout.ConversionPattern
          value: '%d{yyyy-MM-dd HH:mm:ss} %-5p %t %c{1}:%L - %m%n'
        - name: LOG4J.PROPERTIES_log4j.appender.stdout
          value: org.apache.log4j.ConsoleAppender
        - name: LOG4J.PROPERTIES_log4j.appender.stdout.layout
          value: org.apache.log4j.PatternLayout
        - name: LOG4J.PROPERTIES_log4j.appender.stdout.layout.ConversionPattern
          value: '%d{yyyy-MM-dd HH:mm:ss} %-5p %t %c{1}:%L - %m%n'
        # Root logger is at DEBUG and writes to both the rolling file and stdout.
        - name: LOG4J.PROPERTIES_log4j.rootLogger
          value: DEBUG, rolling, stdout
        - name: OZONE-SITE.XML_dfs.datanode.kerberos.principal
          value: dn/[email protected]
        - name: OZONE-SITE.XML_dfs.datanode.keytab.file
          value: /etc/security/keytabs/dn-dn.keytab
        - name: OZONE-SITE.XML_dfs.datanode.use.datanode.hostname
          value: "true"
        - name: OZONE-SITE.XML_hadoop.security.authentication
          value: kerberos
        - name: OZONE-SITE.XML_hdds.datanode.dir
          value: /data/storage
        - name: OZONE-SITE.XML_hdds.datanode.http.auth.kerberos.keytab
          value: /etc/security/keytabs/http-dn.keytab
        - name: OZONE-SITE.XML_hdds.datanode.http.auth.kerberos.principal
          value: HTTP/[email protected]
        - name: OZONE-SITE.XML_hdds.datanode.http.auth.type
          value: kerberos
        - name: OZONE-SITE.XML_hdds.scm.http.auth.kerberos.keytab
          value: /etc/security/keytabs/http-scm.keytab
        - name: OZONE-SITE.XML_hdds.scm.http.auth.kerberos.principal
          value: HTTP/[email protected]
        - name: OZONE-SITE.XML_hdds.scm.http.auth.type
          value: kerberos
        - name: OZONE-SITE.XML_hdds.scm.kerberos.keytab.file
          value: /etc/security/keytabs/scm-scm.keytab
        - name: OZONE-SITE.XML_hdds.scm.kerberos.principal
          value: scm/[email protected]
        - name: OZONE-SITE.XML_hdds.scm.safemode.min.datanode
          value: "3"
        - name: OZONE-SITE.XML_ozone.UnsafeByteOperations.enabled
          value: "false"
        - name: OZONE-SITE.XML_ozone.acl.authorizer.class
          value: org.apache.hadoop.ozone.security.acl.OzoneNativeAuthorizer
        - name: OZONE-SITE.XML_ozone.acl.enabled
          value: "true"
        # Comma-separated list; must stay on the same line as "value:".
        - name: OZONE-SITE.XML_ozone.administrators
          value: testuser/[email protected],testuser/[email protected],recon/[email protected],om/[email protected]
        - name: OZONE-SITE.XML_ozone.csi.owner
          value: hadoop
        - name: OZONE-SITE.XML_ozone.csi.s3g.address
          value: http://ozone-s3g-prod-http:9878
        - name: OZONE-SITE.XML_ozone.csi.socket
          value: /var/lib/csi/csi.sock
        - name: OZONE-SITE.XML_ozone.flexible.fqdn.resolution.enabled
          value: "true"
        - name: OZONE-SITE.XML_ozone.http.filter.initializers
          value: org.apache.hadoop.security.AuthenticationFilterInitializer
        - name: OZONE-SITE.XML_ozone.jvm.network.address.cache.enabled
          value: "false"
        - name: OZONE-SITE.XML_ozone.metadata.dirs
          value: /data/metadata
        - name: OZONE-SITE.XML_ozone.om.address.ozone.om0
          value: ozone-om-prod-0.ozone-om-prod-om
        - name: OZONE-SITE.XML_ozone.om.address.ozone.om1
          value: ozone-om-prod-1.ozone-om-prod-om
        - name: OZONE-SITE.XML_ozone.om.address.ozone.om2
          value: ozone-om-prod-2.ozone-om-prod-om
        - name: OZONE-SITE.XML_ozone.om.http.auth.kerberos.keytab
          value: /etc/security/keytabs/http-om.keytab
        - name: OZONE-SITE.XML_ozone.om.http.auth.kerberos.principal
          value: HTTP/[email protected]
        - name: OZONE-SITE.XML_ozone.om.http.auth.type
          value: kerberos
        - name: OZONE-SITE.XML_ozone.om.kerberos.keytab.file
          value: /etc/security/keytabs/om-om.keytab
        - name: OZONE-SITE.XML_ozone.om.kerberos.principal
          value: om/[email protected]
        - name: OZONE-SITE.XML_ozone.om.nodes.ozone
          value: om0,om1,om2
        - name: OZONE-SITE.XML_ozone.om.ratis.enable
          value: "true"
        - name: OZONE-SITE.XML_ozone.om.service.ids
          value: ozone
        - name: OZONE-SITE.XML_ozone.recon.http.auth.kerberos.keytab
          value: /etc/security/keytabs/http-recon.keytab
        - name: OZONE-SITE.XML_ozone.recon.http.auth.kerberos.principal
          value: HTTP/[email protected]
        - name: OZONE-SITE.XML_ozone.recon.http.auth.type
          value: kerberos
        - name: OZONE-SITE.XML_ozone.s3g.http.auth.kerberos.keytab
          value: /etc/security/keytabs/http-s3g.keytab
        - name: OZONE-SITE.XML_ozone.s3g.http.auth.kerberos.principal
          value: HTTP/[email protected]
        - name: OZONE-SITE.XML_ozone.s3g.http.auth.type
          value: kerberos
        - name: OZONE-SITE.XML_ozone.s3g.kerberos.keytab.file
          value: /etc/security/keytabs/s3g-s3g.keytab
        - name: OZONE-SITE.XML_ozone.scm.address.scmservice.scm0
          value: ozone-scm-prod-0.ozone-scm-prod-rpc
        - name: OZONE-SITE.XML_ozone.scm.address.scmservice.scm1
          value: ozone-scm-prod-1.ozone-scm-prod-rpc
        - name: OZONE-SITE.XML_ozone.scm.address.scmservice.scm2
          value: ozone-scm-prod-2.ozone-scm-prod-rpc
        - name: OZONE-SITE.XML_ozone.scm.datanode.id.dir
          value: /data
        - name: OZONE-SITE.XML_ozone.scm.nodes.scmservice
          value: scm0,scm1,scm2
        - name: OZONE-SITE.XML_ozone.scm.primordial.node.id
          value: scm0
        - name: OZONE-SITE.XML_ozone.scm.ratis.enable
          value: "true"
        - name: OZONE-SITE.XML_ozone.security.enabled
          value: "true"
        - name: OZONE-SITE.XML_ozone.security.http.kerberos.enabled
          value: "true"
        # JVM sizing: 100G max heap, 20G direct memory, 20G metaspace; the
        # container memory limit further down is 300G. The folded scalar (>-)
        # joins the lines below with single spaces into one option string.
        - name: OZONE_OPTS
          value: >-
            -Xmx100G -Xms20G -XX:MaxDirectMemorySize=20G
            -XX:MaxMetaspaceSize=20G
            -XX:NativeMemoryTracking=summary
            -Xlog:gc=debug:file=/data/gc.log:pid,time,uptime,level,tags
            -agentpath:/data/libyjpagent.so=probe_disable=*,listen=localhost
        envFrom:
        - configMapRef:
            name: environment-configmap
        image: xyz.com/shawn/ozone:latest-dev
        imagePullPolicy: Always
        # The probe command is just `ls`: it only shows that a shell can be
        # exec'd in the container, not that the OM process is serving.
        livenessProbe:
          exec:
            command:
            - /bin/bash
            - -c
            - ls
          failureThreshold: 6
          initialDelaySeconds: 120
          periodSeconds: 60
          successThreshold: 1
          timeoutSeconds: 2
        name: ozone-om
        ports:
        - containerPort: 8080
          name: app
        - containerPort: 9874
          name: http
        - containerPort: 9862
          name: om
        # Same `ls` check as the liveness probe, with a shorter initial delay
        # and a lower failure threshold.
        readinessProbe:
          exec:
            command:
            - /bin/bash
            - -c
            - ls
          failureThreshold: 3
          initialDelaySeconds: 60
          periodSeconds: 60
          successThreshold: 1
          timeoutSeconds: 2
        resources:
          limits:
            cpu: "6"
            ephemeral-storage: 50G
            memory: 300G
          requests:
            cpu: "3"
            ephemeral-storage: 50G
            memory: 300G
        volumeMounts:
        - mountPath: /data
          name: om-data-pvc
      priorityClassName: p1
      securityContext:
        fsGroup: 1600
        runAsUser: 1600
      serviceAccountName: ozone-om
{code}
 

 

> OM always runs OOM in Kubernetes 
> ---------------------------------
>
>                 Key: HDDS-6345
>                 URL: https://issues.apache.org/jira/browse/HDDS-6345
>             Project: Apache Ozone
>          Issue Type: Bug
>            Reporter: Shawn
>            Priority: Major
>
> I deployed Ozone 1.21 to Kubernetes with security enabled and with OM HA and
> SCM HA. However, one of the OMs always gets restarted by Kubernetes because of
> OOM. Even though I assigned 300GB of memory, the OM still keeps restarting due to OOM.
>  
> After analysis, we found that the OOM was caused by RocksDB. When the OM gets
> restarted, it first tries to open RocksDB. During this time, RocksDB
> tries to do compaction, which eventually runs out of memory. So there are three questions:
>  
> 1. Why did the OM get into this state?
> 2. Why does RocksDB need so much memory to do the compaction?
> 3. How to resolve this?
> Some info that may be useful: we deployed OM HA directly rather than migrating
> from a single OM to OM HA. The OM that has issues is a follower, not a leader.
> The underlying PVC we are using is SSD-backed. Our traffic is mostly large
> objects, hundreds of GBs in size.



--
This message was sent by Atlassian Jira
(v8.20.1#820001)

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to