zwangsheng commented on PR #1804:
URL: 
https://github.com/apache/incubator-celeborn/pull/1804#issuecomment-1676929781

   > LGTM, was this patch tested? and should we update the docs accordingly?
   
   I tested it locally with a dry run
   (`helm install celeborn-beta charts/celeborn --dry-run`);
   the output is as follows:
   ```yaml
   NAME: celeborn-beta
   LAST DEPLOYED: Mon Aug 14 15:16:23 2023
   NAMESPACE: default
   STATUS: pending-install
   REVISION: 1
   TEST SUITE: None
   HOOKS:
   MANIFEST:
   ---
   # Source: celeborn/templates/configmap.yaml
   #
   # Licensed to the Apache Software Foundation (ASF) under one or more
   # contributor license agreements.  See the NOTICE file distributed with
   # this work for additional information regarding copyright ownership.
   # The ASF licenses this file to You under the Apache License, Version 2.0
   # (the "License"); you may not use this file except in compliance with
   # the License.  You may obtain a copy of the License at
   #
   #    http://www.apache.org/licenses/LICENSE-2.0
   #
   # Unless required by applicable law or agreed to in writing, software
   # distributed under the License is distributed on an "AS IS" BASIS,
   # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   # See the License for the specific language governing permissions and
   # limitations under the License.
   #
   
   apiVersion: v1
   kind: ConfigMap
   metadata:
     name: celeborn-beta-conf
     labels:
       helm.sh/chart: celeborn-0.1.0
       app.kubernetes.io/name: celeborn
       app.kubernetes.io/instance: celeborn-beta
       app.kubernetes.io/version: "0.2.1"
       app.kubernetes.io/managed-by: Helm
       helm.sh/chart: celeborn-0.1.0
       app.kubernetes.io/instance: celeborn-beta
       app.kubernetes.io/version: "0.2.1"
       app.kubernetes.io/managed-by: Helm
   data:
     celeborn-defaults.conf: |-
       
celeborn.master.endpoints=celeborn-beta-master-0.celeborn-beta-master-svc.default.svc.cluster.local,celeborn-beta-master-1.celeborn-beta-master-svc.default.svc.cluster.local,celeborn-beta-master-2.celeborn-beta-master-svc.default.svc.cluster.local,
       
celeborn.master.ha.node.0.host=celeborn-beta-master-0.celeborn-beta-master-svc.default.svc.cluster.local
       
celeborn.master.ha.node.1.host=celeborn-beta-master-1.celeborn-beta-master-svc.default.svc.cluster.local
       
celeborn.master.ha.node.2.host=celeborn-beta-master-2.celeborn-beta-master-svc.default.svc.cluster.local
       celeborn.master.ha.ratis.raft.server.storage.dir=/mnt/celeborn_ratis
       
celeborn.worker.storage.dirs=/mnt/disk1:disktype=HDD,/mnt/disk2:disktype=HDD,/mnt/disk3:disktype=HDD,/mnt/disk4:disktype=HDD
       celeborn.application.heartbeat.timeout=120s
       celeborn.master.ha.enabled=true
       celeborn.master.metrics.prometheus.port=9098
       celeborn.metrics.enabled=true
       celeborn.push.stageEnd.timeout=120s
       celeborn.rpc.dispatcher.numThreads=4
       celeborn.rpc.io.clientThreads=64
       celeborn.rpc.io.numConnectionsPerPeer=2
       celeborn.rpc.io.serverThreads=64
       celeborn.shuffle.chunk.size=8m
       celeborn.worker.fetch.io.threads=32
       celeborn.worker.flusher.buffer.size=256K
       celeborn.worker.heartbeat.timeout=120s
       celeborn.worker.metrics.prometheus.port=9096
       celeborn.worker.monitor.disk.enabled=false
       celeborn.worker.push.io.threads=32
   
     celeborn-env.sh: |
       CELEBORN_MASTER_JAVA_OPTS="-XX:-PrintGC -XX:+PrintGCDetails 
-XX:+PrintGCTimeStamps -XX:+PrintGCDateStamps -Xloggc:gc-master.out 
-Dio.netty.leakDetectionLevel=advanced"
       CELEBORN_MASTER_MEMORY="2g"
       CELEBORN_NO_DAEMONIZE="1"
       CELEBORN_WORKER_JAVA_OPTS="-XX:-PrintGC -XX:+PrintGCDetails 
-XX:+PrintGCTimeStamps -XX:+PrintGCDateStamps -Xloggc:gc-worker.out 
-Dio.netty.leakDetectionLevel=advanced"
       CELEBORN_WORKER_MEMORY="2g"
       CELEBORN_WORKER_OFFHEAP_MEMORY="12g"
       TZ="Asia/Shanghai" 
   
     log4j2.xml: |-
       <?xml version="1.0" encoding="UTF-8"?>
       <!--
       ~ Licensed to the Apache Software Foundation (ASF) under one or more
       ~ contributor license agreements.  See the NOTICE file distributed with
       ~ this work for additional information regarding copyright ownership.
       ~ The ASF licenses this file to You under the Apache License, Version 2.0
       ~ (the "License"); you may not use this file except in compliance with
       ~ the License.  You may obtain a copy of the License at
       ~
       ~     http://www.apache.org/licenses/LICENSE-2.0
       ~
       ~ Unless required by applicable law or agreed to in writing, software
       ~ distributed under the License is distributed on an "AS IS" BASIS,
       ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 
implied.
       ~ See the License for the specific language governing permissions and
       ~ limitations under the License.
       -->
       <!--
       ~ Extra logging related to initialization of Log4j.
       ~ Set to debug or trace if log4j initialization is failing.
       -->
       <Configuration status="INFO">
           <Appenders>
               <Console name="stdout" target="SYSTEM_OUT">
                   <!--
                     ~ In the pattern layout configuration below, we specify an 
explicit `%ex` conversion
                     ~ pattern for logging Throwables. If this was omitted, 
then (by default) Log4J would
                     ~ implicitly add an `%xEx` conversion pattern which logs 
stacktraces with additional
                     ~ class packaging information. That extra information can 
sometimes add a substantial
                     ~ performance overhead, so we disable it in our default 
logging config.
                     -->
                   <PatternLayout pattern="%d{yy/MM/dd HH:mm:ss,SSS} %p [%t] 
%c{1}: %m%n%ex"/>
               </Console>
               <RollingRandomAccessFile name="file" 
fileName="${env:CELEBORN_LOG_DIR}/celeborn.log"
                                        
filePattern="${env:CELEBORN_LOG_DIR}/celeborn.log.%d-%i">
                   <PatternLayout pattern="%d{yy/MM/dd HH:mm:ss,SSS} %p [%t] 
%c{1}: %m%n%ex"/>
                   <Policies>
                       <SizeBasedTriggeringPolicy size="200 MB"/>
                   </Policies>
                   <DefaultRolloverStrategy max="7">
                       <Delete basePath="${env:CELEBORN_LOG_DIR}" maxDepth="1">
                           <IfFileName glob="celeborn.log*">
                               <IfAny>
                                   <IfAccumulatedFileSize exceeds="1 GB" />
                                   <IfAccumulatedFileCount exceeds="10" />
                               </IfAny>
                           </IfFileName>
                       </Delete>
                   </DefaultRolloverStrategy>
               </RollingRandomAccessFile>
           </Appenders>
   
           <Loggers>
               <Root level="INFO">
                   <AppenderRef ref="stdout"/>
                   <AppenderRef ref="file"/>
               </Root>
               <Logger name="org.apache.hadoop.hdfs" level="WARN" 
additivity="false">
                   <Appender-ref ref="stdout" level="WARN" />
                   <Appender-ref ref="file" level="WARN"/>
               </Logger>
           </Loggers>
       </Configuration>
   
     metrics.properties: >-
       
*.sink.prometheusServlet.class=org.apache.celeborn.common.metrics.sink.PrometheusServlet
   ---
   # Source: celeborn/templates/master-service.yaml
   #
   # Licensed to the Apache Software Foundation (ASF) under one or more
   # contributor license agreements.  See the NOTICE file distributed with
   # this work for additional information regarding copyright ownership.
   # The ASF licenses this file to You under the Apache License, Version 2.0
   # (the "License"); you may not use this file except in compliance with
   # the License.  You may obtain a copy of the License at
   #
   #    http://www.apache.org/licenses/LICENSE-2.0
   #
   # Unless required by applicable law or agreed to in writing, software
   # distributed under the License is distributed on an "AS IS" BASIS,
   # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   # See the License for the specific language governing permissions and
   # limitations under the License.
   #
   
   apiVersion: v1
   kind: Service
   metadata:
     name: celeborn-beta-master-svc
     labels:
       helm.sh/chart: celeborn-0.1.0
       app.kubernetes.io/name: celeborn
       app.kubernetes.io/instance: celeborn-beta
       app.kubernetes.io/version: "0.2.1"
       app.kubernetes.io/managed-by: Helm
       helm.sh/chart: celeborn-0.1.0
       app.kubernetes.io/instance: celeborn-beta
       app.kubernetes.io/version: "0.2.1"
       app.kubernetes.io/managed-by: Helm
   spec:
     type: ClusterIP
     ports:
       - port: 9097
         targetPort: 9097
         protocol: TCP
         name: celeborn-master
     clusterIP: None
     selector:
       app.kubernetes.io/name: celeborn
       app.kubernetes.io/version: "0.2.1"
       app.kubernetes.io/role: master
       app.kubernetes.io/instance: celeborn-beta
   ---
   # Source: celeborn/templates/worker-service.yaml
   #
   # Licensed to the Apache Software Foundation (ASF) under one or more
   # contributor license agreements.  See the NOTICE file distributed with
   # this work for additional information regarding copyright ownership.
   # The ASF licenses this file to You under the Apache License, Version 2.0
   # (the "License"); you may not use this file except in compliance with
   # the License.  You may obtain a copy of the License at
   #
   #    http://www.apache.org/licenses/LICENSE-2.0
   #
   # Unless required by applicable law or agreed to in writing, software
   # distributed under the License is distributed on an "AS IS" BASIS,
   # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   # See the License for the specific language governing permissions and
   # limitations under the License.
   #
   
   apiVersion: v1
   kind: Service
   metadata:
     name: celeborn-beta-worker-svc
     labels:
       helm.sh/chart: celeborn-0.1.0
       app.kubernetes.io/name: celeborn
       app.kubernetes.io/instance: celeborn-beta
       app.kubernetes.io/version: "0.2.1"
       app.kubernetes.io/managed-by: Helm
       helm.sh/chart: celeborn-0.1.0
       app.kubernetes.io/instance: celeborn-beta
       app.kubernetes.io/version: "0.2.1"
       app.kubernetes.io/managed-by: Helm
   spec:
     type: ClusterIP
     clusterIP: None
     selector:
       app.kubernetes.io/name: celeborn
       app.kubernetes.io/version: "0.2.1"
       app.kubernetes.io/role: worker
       app.kubernetes.io/instance: celeborn-beta
   ---
   # Source: celeborn/templates/master-statefulset.yaml
   #
   # Licensed to the Apache Software Foundation (ASF) under one or more
   # contributor license agreements.  See the NOTICE file distributed with
   # this work for additional information regarding copyright ownership.
   # The ASF licenses this file to You under the Apache License, Version 2.0
   # (the "License"); you may not use this file except in compliance with
   # the License.  You may obtain a copy of the License at
   #
   #    http://www.apache.org/licenses/LICENSE-2.0
   #
   # Unless required by applicable law or agreed to in writing, software
   # distributed under the License is distributed on an "AS IS" BASIS,
   # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   # See the License for the specific language governing permissions and
   # limitations under the License.
   #
   
   apiVersion: apps/v1
   kind: StatefulSet
   metadata:
     name: celeborn-beta-master
     labels:
       helm.sh/chart: celeborn-0.1.0
       app.kubernetes.io/name: celeborn
       app.kubernetes.io/instance: celeborn-beta
       app.kubernetes.io/version: "0.2.1"
       app.kubernetes.io/managed-by: Helm
       app.kubernetes.io/role: master
       helm.sh/chart: celeborn-0.1.0
       app.kubernetes.io/instance: celeborn-beta
       app.kubernetes.io/version: "0.2.1"
       app.kubernetes.io/managed-by: Helm
   spec:
     selector:
       matchLabels:
         app.kubernetes.io/name: celeborn
         app.kubernetes.io/instance: celeborn-beta
         app.kubernetes.io/version: "0.2.1"
         app.kubernetes.io/role: master
         app.kubernetes.io/instance: celeborn-beta
     serviceName: celeborn-beta-master-svc
     replicas: 3
     template:
       metadata:
         labels:
           app.kubernetes.io/name: celeborn
           app.kubernetes.io/instance: celeborn-beta
           app.kubernetes.io/version: "0.2.1"
           app.kubernetes.io/role: master
           app.kubernetes.io/tag: 0.1.1-6badd20
           app.kubernetes.io/instance: celeborn-beta
       spec:
         affinity:
           podAntiAffinity:
             requiredDuringSchedulingIgnoredDuringExecution:
             - labelSelector:
                 matchExpressions:
                 - key: app.kubernetes.io/name
                   operator: In
                   values:
                   - celeborn
                 - key: app.kubernetes.io/role
                   operator: In
                   values:
                   - master
               topologyKey: kubernetes.io/hostname
         securityContext:
           fsGroup: 10006
           runAsGroup: 10006
           runAsUser: 10006
         hostNetwork: false
         dnsPolicy: ClusterFirst
         initContainers:
         - name: chown-celeborn-master-volume
           image: alpine:3.18
           imagePullPolicy: Always
           securityContext:
             runAsUser: 0
           command:
           - chown
           - 10006:10006
           - /mnt/celeborn_ratis
           volumeMounts:
             - name: celeborn-master-vol-0
               mountPath: /mnt/celeborn_ratis
         containers:
         - name: celeborn
           image: "aliyunemr/remote-shuffle-service:0.1.1-6badd20"
           imagePullPolicy: Always
           command:
             - "/usr/bin/tini"
             - "--"
             - "/bin/sh"
             - '-c'
             - "until nslookup 
celeborn-beta-master-0.celeborn-beta-master-svc.default.svc.cluster.local && 
nslookup 
celeborn-beta-master-1.celeborn-beta-master-svc.default.svc.cluster.local && 
nslookup 
celeborn-beta-master-2.celeborn-beta-master-svc.default.svc.cluster.local && 
true; do echo waiting for master; sleep 2; done && exec 
/opt/celeborn/sbin/start-master.sh"
           resources:
               null
           ports:
             - containerPort: 9097
             - containerPort: 9098
               name: metrics
               protocol: TCP
           volumeMounts:
             - mountPath: /opt/celeborn/conf
               name: celeborn-beta-volume
               readOnly: true
             - name: celeborn-master-vol-0
               mountPath: /mnt/celeborn_ratis
           env:
             - name: CELEBORN_MASTER_JAVA_OPTS
               value: "-XX:-PrintGC -XX:+PrintGCDetails -XX:+PrintGCTimeStamps 
-XX:+PrintGCDateStamps -Xloggc:gc-master.out 
-Dio.netty.leakDetectionLevel=advanced"
             - name: CELEBORN_MASTER_MEMORY
               value: "2g"
             - name: CELEBORN_NO_DAEMONIZE
               value: "1"
             - name: CELEBORN_WORKER_JAVA_OPTS
               value: "-XX:-PrintGC -XX:+PrintGCDetails -XX:+PrintGCTimeStamps 
-XX:+PrintGCDateStamps -Xloggc:gc-worker.out 
-Dio.netty.leakDetectionLevel=advanced"
             - name: CELEBORN_WORKER_MEMORY
               value: "2g"
             - name: CELEBORN_WORKER_OFFHEAP_MEMORY
               value: "12g"
             - name: TZ
               value: "Asia/Shanghai"
         terminationGracePeriodSeconds: 30 
         volumes:
           - configMap:
               name: celeborn-beta-conf
             name: celeborn-beta-volume
           - name: celeborn-master-vol-0
             hostPath:
               path: /mnt/celeborn_ratis/master
               type: DirectoryOrCreate
   ---
   # Source: celeborn/templates/worker-statefulset.yaml
   #
   # Licensed to the Apache Software Foundation (ASF) under one or more
   # contributor license agreements.  See the NOTICE file distributed with
   # this work for additional information regarding copyright ownership.
   # The ASF licenses this file to You under the Apache License, Version 2.0
   # (the "License"); you may not use this file except in compliance with
   # the License.  You may obtain a copy of the License at
   #
   #    http://www.apache.org/licenses/LICENSE-2.0
   #
   # Unless required by applicable law or agreed to in writing, software
   # distributed under the License is distributed on an "AS IS" BASIS,
   # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   # See the License for the specific language governing permissions and
   # limitations under the License.
   #
   
   apiVersion: apps/v1
   kind: StatefulSet
   metadata:
     name: celeborn-beta-worker
     labels:
       helm.sh/chart: celeborn-0.1.0
       app.kubernetes.io/name: celeborn
       app.kubernetes.io/instance: celeborn-beta
       app.kubernetes.io/version: "0.2.1"
       app.kubernetes.io/managed-by: Helm
       app.kubernetes.io/role: worker
       helm.sh/chart: celeborn-0.1.0
       app.kubernetes.io/instance: celeborn-beta
       app.kubernetes.io/version: "0.2.1"
       app.kubernetes.io/managed-by: Helm
   spec:
     selector:
       matchLabels:
         app.kubernetes.io/name: celeborn
         app.kubernetes.io/instance: celeborn-beta
         app.kubernetes.io/version: "0.2.1"
         app.kubernetes.io/role: worker
         app.kubernetes.io/instance: celeborn-beta
     serviceName: celeborn-beta-worker
     replicas: 5
     template:
       metadata:
         labels:
           app.kubernetes.io/name: celeborn
           app.kubernetes.io/instance: celeborn-beta
           app.kubernetes.io/version: "0.2.1"
           app.kubernetes.io/role: worker
           app.kubernetes.io/tag: 0.1.1-6badd20
           app.kubernetes.io/instance: celeborn-beta
       spec:
         affinity:
           podAntiAffinity:
             requiredDuringSchedulingIgnoredDuringExecution:
             - labelSelector:
                 matchExpressions:
                 - key: app.kubernetes.io/name
                   operator: In
                   values:
                   - celeborn
                 - key: app.kubernetes.io/role
                   operator: In
                   values:
                   - worker
               topologyKey: kubernetes.io/hostname
         securityContext:
           fsGroup: 10006
           runAsGroup: 10006
           runAsUser: 10006
         hostNetwork: false
         dnsPolicy: ClusterFirst
         initContainers:
         - name: chown-celeborn-worker-volume
           image: alpine:3.18
           imagePullPolicy: Always
           securityContext:
             runAsUser: 0
           command:
           - chown
           - 10006:10006
           - /mnt/disk1
           - /mnt/disk2
           - /mnt/disk3
           - /mnt/disk4
           volumeMounts:
           - name: celeborn-worker-vol-0
             mountPath: /mnt/disk1
           - name: celeborn-worker-vol-1
             mountPath: /mnt/disk2
           - name: celeborn-worker-vol-2
             mountPath: /mnt/disk3
           - name: celeborn-worker-vol-3
             mountPath: /mnt/disk4
         containers:
         - name: celeborn
           image: "aliyunemr/remote-shuffle-service:0.1.1-6badd20"
           imagePullPolicy: Always
           command:
             - "/usr/bin/tini"
             - "--"
             - "/bin/sh"
             - '-c'
             - "until nslookup 
celeborn-beta-master-0.celeborn-beta-master-svc.default.svc.cluster.local && 
nslookup 
celeborn-beta-master-1.celeborn-beta-master-svc.default.svc.cluster.local && 
nslookup 
celeborn-beta-master-2.celeborn-beta-master-svc.default.svc.cluster.local && 
true; do echo waiting for master; sleep 2; done && exec 
/opt/celeborn/sbin/start-worker.sh"
           resources:
               null
           ports:
             - containerPort: 9096
               name: metrics
               protocol: TCP
           volumeMounts:
             - name: celeborn-worker-vol-0
               mountPath: /mnt/disk1
             - name: celeborn-worker-vol-1
               mountPath: /mnt/disk2
             - name: celeborn-worker-vol-2
               mountPath: /mnt/disk3
             - name: celeborn-worker-vol-3
               mountPath: /mnt/disk4
           env:
             - name: CELEBORN_MASTER_JAVA_OPTS
               value: "-XX:-PrintGC -XX:+PrintGCDetails -XX:+PrintGCTimeStamps 
-XX:+PrintGCDateStamps -Xloggc:gc-master.out 
-Dio.netty.leakDetectionLevel=advanced"
             - name: CELEBORN_MASTER_MEMORY
               value: "2g"
             - name: CELEBORN_NO_DAEMONIZE
               value: "1"
             - name: CELEBORN_WORKER_JAVA_OPTS
               value: "-XX:-PrintGC -XX:+PrintGCDetails -XX:+PrintGCTimeStamps 
-XX:+PrintGCDateStamps -Xloggc:gc-worker.out 
-Dio.netty.leakDetectionLevel=advanced"
             - name: CELEBORN_WORKER_MEMORY
               value: "2g"
             - name: CELEBORN_WORKER_OFFHEAP_MEMORY
               value: "12g"
             - name: TZ
               value: "Asia/Shanghai"
         terminationGracePeriodSeconds: 30
         volumes:
           - configMap:
               name: celeborn-beta-conf
             name: celeborn-beta-volume
           - name: celeborn-worker-vol-0
             hostPath:
               path: /mnt/disk1/worker
               type: DirectoryOrCreate
           - name: celeborn-worker-vol-1
             hostPath:
               path: /mnt/disk2/worker
               type: DirectoryOrCreate
           - name: celeborn-worker-vol-2
             hostPath:
               path: /mnt/disk3/worker
               type: DirectoryOrCreate
           - name: celeborn-worker-vol-3
             hostPath:
               path: /mnt/disk4/worker
               type: DirectoryOrCreate
   ---
   # Source: celeborn/templates/prometheus-podmonitor.yaml
   #
   # Licensed to the Apache Software Foundation (ASF) under one or more
   # contributor license agreements.  See the NOTICE file distributed with
   # this work for additional information regarding copyright ownership.
   # The ASF licenses this file to You under the Apache License, Version 2.0
   # (the "License"); you may not use this file except in compliance with
   # the License.  You may obtain a copy of the License at
   #
   #    http://www.apache.org/licenses/LICENSE-2.0
   #
   # Unless required by applicable law or agreed to in writing, software
   # distributed under the License is distributed on an "AS IS" BASIS,
   # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   # See the License for the specific language governing permissions and
   # limitations under the License.
   #
   
   NOTES:
   #
   # Licensed to the Apache Software Foundation (ASF) under one or more
   # contributor license agreements.  See the NOTICE file distributed with
   # this work for additional information regarding copyright ownership.
   # The ASF licenses this file to You under the Apache License, Version 2.0
   # (the "License"); you may not use this file except in compliance with
   # the License.  You may obtain a copy of the License at
   #
   #    http://www.apache.org/licenses/LICENSE-2.0
   #
   # Unless required by applicable law or agreed to in writing, software
   # distributed under the License is distributed on an "AS IS" BASIS,
   # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   # See the License for the specific language governing permissions and
   # limitations under the License.
   #
   
   Celeborn
   ```
   
   As for the user guide, I'll open a follow-up PR to update the
   `Deploy Celeborn On K8s` doc.
   
   


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to