This is an automated email from the ASF dual-hosted git repository.

chengpan pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-celeborn-website.git

commit 1f12bfb0312c57c2b1f3d20e051d95fc79a3101a
Author: Cheng Pan <[email protected]>
AuthorDate: Sun Nov 27 22:12:20 2022 +0800

    Init website
---
 .asf.yaml                              |  33 ++++++
 .github/workflows/site.yaml            |  33 ++++++
 .gitignore                             |  18 +++
 LICENSE                                | 201 +++++++++++++++++++++++++++++++++
 README.md                              |   2 +
 docs/configuration/client.md           |  65 +++++++++++
 docs/configuration/columnar-shuffle.md |  28 +++++
 docs/configuration/index.md            | 153 +++++++++++++++++++++++++
 docs/configuration/master.md           |  49 ++++++++
 docs/configuration/metrics.md          |  31 +++++
 docs/configuration/network.md          |  44 ++++++++
 docs/configuration/quota.md            |  26 +++++
 docs/configuration/worker.md           |  83 ++++++++++++++
 docs/contrib/docs_and_website.md       |  66 +++++++++++
 docs/index.md                          |  27 +++++
 docs/storage-usage-quota.md            |  79 +++++++++++++
 mkdocs.yml                             |  56 +++++++++
 requirements.txt                       |  40 +++++++
 18 files changed, 1034 insertions(+)

diff --git a/.asf.yaml b/.asf.yaml
new file mode 100644
index 0000000..8a5c358
--- /dev/null
+++ b/.asf.yaml
@@ -0,0 +1,33 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+github:
+  description: "Apache Celeborn Site"
+  homepage: https://celeborn.apache.org/
+  labels:
+    - celeborn
+  enabled_merge_buttons:
+    squash: true
+    merge: false
+    rebase: false
+notifications:
+  commits: [email protected]
+  issues: [email protected]
+  pullrequests: [email protected]
+staging:
+  profile: ~
+  whoami: asf-site
+publish:
+  whoami: asf-site
\ No newline at end of file
diff --git a/.github/workflows/site.yaml b/.github/workflows/site.yaml
new file mode 100644
index 0000000..eaac561
--- /dev/null
+++ b/.github/workflows/site.yaml
@@ -0,0 +1,33 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+name: "Deploy Website"
+on:
+  push:
+    branches:
+      - "main"
+
+jobs:
+  run-tests:
+    runs-on: ubuntu-22.04
+    steps:
+      - uses: actions/checkout@v2
+      - uses: actions/setup-python@v2
+        with:
+          python-version: 3.x
+      - run: pip install -r requirements.txt
+      - run: mkdocs gh-deploy --force --remote-branch asf-site
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..93cf26c
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,18 @@
+*#*#
+*.#*
+*.iml
+*.ipr
+*.iws
+*.pyc
+*.pyo
+*.swp
+*~
+.DS_Store
+.cache
+.classpath
+.idea/
+.idea_modules/
+.project
+.pydevproject
+.python-version
+.settings
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..261eeb9
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,201 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..77f3d78
--- /dev/null
+++ b/README.md
@@ -0,0 +1,2 @@
+Apache Celeborn (Incubating) Website
+===
diff --git a/docs/configuration/client.md b/docs/configuration/client.md
new file mode 100644
index 0000000..9a5127f
--- /dev/null
+++ b/docs/configuration/client.md
@@ -0,0 +1,65 @@
+---
+license: |
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+  
+      https://www.apache.org/licenses/LICENSE-2.0
+  
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+---
+
+<!--begin-include-->
+| Key | Default | Description | Since |
+| --- | ------- | ----------- | ----- |
+| celeborn.application.heartbeatInterval | 10s | Interval for client to send 
heartbeat message to master. | 0.2.0 | 
+| celeborn.client.maxRetries | 15 | Max retry times for client to connect 
master endpoint | 0.2.0 | 
+| celeborn.client.rpc.askTimeout | &lt;value of celeborn.network.timeout&gt; | 
Timeout for client RPC ask operations. | 0.2.0 | 
+| celeborn.fetch.maxReqsInFlight | 3 | Amount of in-flight chunk fetch 
request. | 0.2.0 | 
+| celeborn.fetch.timeout | 120s | Timeout for a task to fetch chunk. | 0.2.0 | 
+| celeborn.master.endpoints | &lt;localhost&gt;:9097 | Endpoints of master 
nodes for celeborn client to connect, allowed pattern is: 
`<host1>:<port1>[,<host2>:<port2>]*`, e.g. `clb1:9097,clb2:9098,clb3:9099`. If 
the port is omitted, 9097 will be used. | 0.2.0 | 
+| celeborn.push.buffer.initial.size | 8k |  | 0.2.0 | 
+| celeborn.push.buffer.max.size | 64k | Max size of reducer partition buffer 
memory for shuffle hash writer. The pushed data will be buffered in memory 
before sending to Celeborn worker. For performance consideration keep this 
buffer size higher than 32K. Example: If reducer amount is 2000, buffer size is 
64K, then each task will consume up to `64KiB * 2000 = 125MiB` heap memory. | 
0.2.0 | 
+| celeborn.push.limit.inFlight.sleepInterval | 50ms | Sleep interval when 
checking whether netty in-flight requests are done. | 0.2.0 | 
+| celeborn.push.limit.inFlight.timeout | 240s | Timeout for netty in-flight 
requests to be done. | 0.2.0 | 
+| celeborn.push.maxReqsInFlight | 32 | Amount of Netty in-flight requests. The 
maximum memory is `celeborn.push.maxReqsInFlight` * 
`celeborn.push.buffer.max.size` * compression ratio (1 in worst case), default: 
64KiB * 32 = 2MiB | 0.2.0 | 
+| celeborn.push.queue.capacity | 512 | Push buffer queue size for a task. The 
maximum memory is `celeborn.push.buffer.max.size` * 
`celeborn.push.queue.capacity`, default: 64KiB * 512 = 32MiB | 0.2.0 | 
+| celeborn.push.replicate.enabled | true | When true, Celeborn worker will 
replicate shuffle data to another Celeborn worker asynchronously to ensure the 
pushed shuffle data won't be lost after the node failure. | 0.2.0 | 
+| celeborn.push.retry.threads | 8 | Thread number to process shuffle re-send 
push data requests. | 0.2.0 | 
+| celeborn.push.sortMemory.threshold | 64m | When SortBasedPusher uses memory 
over the threshold, it will trigger pushing data. | 0.2.0 | 
+| celeborn.push.splitPartition.threads | 8 | Thread number to process shuffle 
split request in shuffle client. | 0.2.0 | 
+| celeborn.push.stageEnd.timeout | 240s | Timeout for StageEnd. | 0.2.0 | 
+| celeborn.rpc.cache.concurrencyLevel | 32 | The number of write locks to 
update rpc cache. | 0.2.0 | 
+| celeborn.rpc.cache.expireTime | 15s | The time before a cache item is 
removed. | 0.2.0 | 
+| celeborn.rpc.cache.size | 256 | The max cache items count for rpc cache. | 
0.2.0 | 
+| celeborn.rpc.maxParallelism | 1024 | Max parallelism of client on sending 
RPC requests. | 0.2.0 | 
+| celeborn.shuffle.batchHandleChangePartition.enabled | false | When true, 
LifecycleManager will handle change partition request in batch. Otherwise, 
LifecycleManager will process the requests one by one | 0.2.0 | 
+| celeborn.shuffle.batchHandleChangePartition.interval | 100ms | Interval for 
LifecycleManager to schedule handling change partition requests in batch. | 
0.2.0 | 
+| celeborn.shuffle.batchHandleChangePartition.threads | 8 | Threads number for 
LifecycleManager to handle change partition request in batch. | 0.2.0 | 
+| celeborn.shuffle.chuck.size | 8m | Max chunk size of reducer's merged 
shuffle data. For example, if a reducer's shuffle data is 128M, the data 
will need 16 fetch chunk requests to fetch. | 0.2.0 | 
+| celeborn.shuffle.compression.codec | LZ4 | The codec used to compress 
shuffle data. By default, Celeborn provides two codecs: `lz4` and `zstd`. | 
0.2.0 | 
+| celeborn.shuffle.compression.zstd.level | 1 | Compression level for Zstd 
compression codec, its value should be an integer between -5 and 22. Increasing 
the compression level will result in better compression at the expense of more 
CPU and memory. | 0.2.0 | 
+| celeborn.shuffle.expired.checkInterval | 60s | Interval for client to check 
expired shuffles. | 0.2.0 | 
+| celeborn.shuffle.forceFallback.enabled | false | Whether to force shuffle 
to fall back to Spark's default. | 0.2.0 | 
+| celeborn.shuffle.forceFallback.numPartitionsThreshold | 500000 | Celeborn 
will only accept shuffle of partition number lower than this configuration 
value. | 0.2.0 | 
+| celeborn.shuffle.manager.port | 0 | Port used by the LifecycleManager on the 
Driver. | 0.2.0 | 
+| celeborn.shuffle.partition.type | REDUCE | Type of shuffle's partition. | 
0.2.0 | 
+| celeborn.shuffle.partitionSplit.mode | SOFT | soft: the shuffle file size 
might be larger than split threshold. hard: the shuffle file size will be 
limited to split threshold. | 0.2.0 | 
+| celeborn.shuffle.partitionSplit.threshold | 1G | Shuffle file size 
threshold, if file size exceeds this, trigger split. | 0.2.0 | 
+| celeborn.shuffle.rangeReadFilter.enabled | false | If a Spark application 
has skewed partitions, this value can be set to true to improve performance. | 
0.2.0 | 
+| celeborn.shuffle.register.maxRetries | 3 | Max retry times for client to 
register shuffle. | 0.2.0 | 
+| celeborn.shuffle.register.retryWait | 3s | Wait time before next retry if 
register shuffle failed. | 0.2.0 | 
+| celeborn.shuffle.writer | HASH | Celeborn supports the following kinds of 
shuffle writers. 1. hash: hash-based shuffle writer works fine when shuffle 
partition count is normal; 2. sort: sort-based shuffle writer works fine when 
memory pressure is high or shuffle partition count is huge. | 0.2.0 | 
+| celeborn.slots.reserve.maxRetries | 3 | Max retry times for client to 
reserve slots. | 0.2.0 | 
+| celeborn.slots.reserve.retryWait | 3s | Wait time before next retry if 
reserve slots failed. | 0.2.0 | 
+| celeborn.storage.hdfs.dir | &lt;undefined&gt; | HDFS dir configuration for 
Celeborn to access HDFS. | 0.2.0 | 
+| celeborn.worker.excluded.checkInterval | 30s | Interval for client to 
refresh excluded worker list. | 0.2.0 | 
+| celeborn.worker.excluded.expireTimeout | 600s | Timeout time for 
LifecycleManager to clear reserved excluded worker. | 0.2.0 | 
+<!--end-include-->
diff --git a/docs/configuration/columnar-shuffle.md 
b/docs/configuration/columnar-shuffle.md
new file mode 100644
index 0000000..f27d0f5
--- /dev/null
+++ b/docs/configuration/columnar-shuffle.md
@@ -0,0 +1,28 @@
+---
+license: |
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+  
+      https://www.apache.org/licenses/LICENSE-2.0
+  
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+---
+
+<!--begin-include-->
+| Key | Default | Description | Since |
+| --- | ------- | ----------- | ----- |
+| celeborn.columnar.offHeap.enabled | false | Whether to use off heap columnar 
vector. | 0.2.0 | 
+| celeborn.columnar.shuffle.batch.size | 10000 | Vector batch size for 
columnar shuffle. | 0.2.0 | 
+| celeborn.columnar.shuffle.codegen.enabled | false | Whether to use codegen 
for columnar-based shuffle. | 0.2.0 | 
+| celeborn.columnar.shuffle.enabled | false | Whether to enable columnar-based 
shuffle. | 0.2.0 | 
+| celeborn.columnar.shuffle.encoding.dictionary.enabled | false | Whether to 
use dictionary encoding for columnar-based shuffle data. | 0.2.0 | 
+| celeborn.columnar.shuffle.encoding.dictionary.maxFactor | 0.3 | Max factor 
for dictionary size. The max dictionary size is `min(32.0 KB, 
celeborn.columnar.shuffle.batch.size * 
celeborn.columnar.shuffle.encoding.dictionary.maxFactor)`. | 0.2.0 | 
+<!--end-include-->
diff --git a/docs/configuration/index.md b/docs/configuration/index.md
new file mode 100644
index 0000000..9a5042d
--- /dev/null
+++ b/docs/configuration/index.md
@@ -0,0 +1,153 @@
+---
+hide:
+  - navigation
+
+license: |
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+  
+      https://www.apache.org/licenses/LICENSE-2.0
+  
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+---
+
+Configuration Guide
+===
+This documentation contains Celeborn configuration details and a tuning guide.
+
+## Important Configurations
+
+### Environment Variables
+
+- `CELEBORN_WORKER_MEMORY=4g`
+- `CELEBORN_WORKER_OFFHEAP_MEMORY=24g`
+
+Celeborn workers tend to improve performance by using off-heap buffers.
+Off-heap memory requirement can be estimated as below:
+
+```
+numDirs = `celeborn.worker.storage.dirs`             # the number of directories 
that will be used by Celeborn storage
+bufferSize = `celeborn.worker.flusher.buffer.size`   # the amount of memory 
that will be used by a single flush buffer 
+off-heap-memory = bufferSize * estimatedTasks * 2 + network memory
+```
+
+For example, if a Celeborn worker has 10 storage directories or disks and the 
buffer size is set to 256 KiB,
+the necessary off-heap memory is 10 GiB.
+
+Network memory will be consumed when netty reads from a TCP channel, so some 
extra memory
+is needed. Empirically, Celeborn worker off-heap memory should be set to 
`(numDirs  * bufferSize * 1.2)`.
+
+## All Configurations
+
+### Client
+
+{!
+include-markdown "./client.md"
+start="<!--begin-include-->"
+end="<!--end-include-->"
+!}
+
+### Columnar Shuffle
+
+{!
+include-markdown "./columnar-shuffle.md"
+start="<!--begin-include-->"
+end="<!--end-include-->"
+!}
+
+### Master
+
+{!
+include-markdown "./master.md"
+start="<!--begin-include-->"
+end="<!--end-include-->"
+!}
+
+### Worker
+
+{!
+include-markdown "./worker.md"
+start="<!--begin-include-->"
+end="<!--end-include-->"
+!}
+
+### Quota
+
+{!
+include-markdown "./quota.md"
+start="<!--begin-include-->"
+end="<!--end-include-->"
+!}
+
+### Network
+
+{!
+include-markdown "./network.md"
+start="<!--begin-include-->"
+end="<!--end-include-->"
+!}
+
+### Metrics
+
+{!
+include-markdown "./metrics.md"
+start="<!--begin-include-->"
+end="<!--end-include-->"
+!}
+
+#### metrics.properties
+
+```properties
+*.sink.csv.class=org.apache.celeborn.common.metrics.sink.CsvSink
+*.sink.prometheusServlet.class=org.apache.celeborn.common.metrics.sink.PrometheusServlet
+```
+
+### Environment Variables
+
+Recommend configuring in `conf/celeborn-env.sh`.
+
+| Key                              | Default                                   
      | Description |
+|----------------------------------|-------------------------------------------------|-------------|
+| `CELEBORN_HOME`                  | ``$(cd "`dirname "$0"`"/..; pwd)``        
      |             |
+| `CELEBORN_CONF_DIR`              | 
`${CELEBORN_CONF_DIR:-"${CELEBORN_HOME}/conf"}` |             |
+| `CELEBORN_MASTER_MEMORY`         | 1 GB                                      
      |             |
+| `CELEBORN_WORKER_MEMORY`         | 1 GB                                      
      |             |
+| `CELEBORN_WORKER_OFFHEAP_MEMORY` | 1 GB                                      
      |             |
+| `CELEBORN_MASTER_JAVA_OPTS`      |                                           
      |             |
+| `CELEBORN_WORKER_JAVA_OPTS`      |                                           
      |             |
+| `CELEBORN_PID_DIR`               | `${CELEBORN_HOME}/pids`                   
      |             |
+| `CELEBORN_LOG_DIR`               | `${CELEBORN_HOME}/logs`                   
      |             |
+| `CELEBORN_SSH_OPTS`              | `-o StrictHostKeyChecking=no`             
      |             |
+| `CELEBORN_SLEEP`                 |                                           
      |             |
+
+## Tuning
+
+Assume we have a cluster described as below:
+5 Celeborn Workers with 20 GB off-heap memory and 10 disks.
+As we need to reserve 20% off-heap memory for netty,
+we can assume 16 GB off-heap memory can be used for flush buffers.
+
+If `spark.celeborn.push.buffer.size` is 64 KB, we can have in-flight requests 
up to 1310720.
+If you have 8192 mapper tasks, you could set 
`spark.celeborn.push.maxReqsInFlight=160` to gain performance improvements.
+
+If `celeborn.worker.flush.buffer.size` is 256 KB, we can have up to 327680 
slots in total.
+
+## Worker Recover Status After Restart
+
+`ShuffleClient` records the shuffle partition location's host, service port, 
and filename.
+To support workers resuming reads of existing shuffle data after a worker 
restart,
+workers should store the metadata about shuffle partition files being read in 
LevelDB during shutdown,
+and restore that metadata after restarting. Workers should also keep stable 
service ports so that
+`ShuffleClient` can retry reading data. Users should set 
`celeborn.worker.graceful.shutdown.enabled` to `true` and
+set the service ports below to stable values to support worker status recovery.
+```
+rss.worker.rpc.port
+rss.fetchserver.port
+rss.pushserver.port
+rss.replicateserver.port
+```
diff --git a/docs/configuration/master.md b/docs/configuration/master.md
new file mode 100644
index 0000000..59adaa0
--- /dev/null
+++ b/docs/configuration/master.md
@@ -0,0 +1,49 @@
+---
+license: |
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+  
+      https://www.apache.org/licenses/LICENSE-2.0
+  
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+---
+
+<!--begin-include-->
+| Key | Default | Description | Since |
+| --- | ------- | ----------- | ----- |
+| celeborn.application.heartbeat.timeout | 120s | Application heartbeat 
timeout. | 0.2.0 | 
+| celeborn.ha.enabled | false | When true, master nodes run as Raft cluster 
mode. | 0.2.0 | 
+| celeborn.ha.master.node.&lt;id&gt;.host | &lt;required&gt; | Host to bind of 
master node &lt;id&gt; in HA mode. | 0.2.0 | 
+| celeborn.ha.master.node.&lt;id&gt;.port | 9097 | Port to bind of master node 
&lt;id&gt; in HA mode. | 0.2.0 | 
+| celeborn.ha.master.node.&lt;id&gt;.ratis.port | 9872 | Ratis port to bind of 
master node &lt;id&gt; in HA mode. | 0.2.0 | 
+| celeborn.ha.master.ratis.raft.rpc.type | netty | RPC type for Ratis, 
available options: netty, grpc. | 0.2.0 | 
+| celeborn.ha.master.ratis.raft.server.storage.dir | /tmp/ratis |  | 0.2.0 | 
+| celeborn.master.host | &lt;localhost&gt; | Hostname for master to bind. | 
0.2.0 | 
+| celeborn.master.metrics.prometheus.host | 0.0.0.0 | Master's Prometheus 
host. | 0.2.0 | 
+| celeborn.master.metrics.prometheus.port | 9098 | Master's Prometheus port. | 
0.2.0 | 
+| celeborn.master.port | 9097 | Port for master to bind. | 0.2.0 | 
+| celeborn.metrics.app.topDiskUsage.count | 50 | Size for top items about top 
disk usage applications list. | 0.2.0 | 
+| celeborn.metrics.app.topDiskUsage.interval | 10min | Time length for a 
window about top disk usage application list. | 0.2.0 | 
+| celeborn.metrics.app.topDiskUsage.windowSize | 24 | Window size about top 
disk usage application list. | 0.2.0 | 
+| celeborn.metrics.capacity | 4096 | The maximum number of metrics which a 
source can use to generate output strings. | 0.2.0 | 
+| celeborn.metrics.collectPerfCritical.enabled | false | It controls whether 
to collect metrics which may affect performance. When enabled, Celeborn collects 
them. | 0.2.0 | 
+| celeborn.metrics.enabled | true | When true, enable metrics system. | 0.2.0 
| 
+| celeborn.metrics.sample.rate | 1.0 | It controls if Celeborn collects timer 
metrics for some operations. Its value should be in [0.0, 1.0]. | 0.2.0 | 
+| celeborn.metrics.timer.slidingWindow.size | 4096 | The sliding window size 
of timer metric. | 0.2.0 | 
+| celeborn.shuffle.estimatedPartitionSize.update.initialDelay | 5min | Initial 
delay time before start updating partition size for estimation. | 0.2.0 | 
+| celeborn.shuffle.estimatedPartitionSize.update.interval | 10min | Interval 
of updating partition size for estimation. | 0.2.0 | 
+| celeborn.shuffle.initialEstimatedPartitionSize | 64mb | Initial partition 
size for estimation, it will change according to runtime stats. | 0.2.0 | 
+| celeborn.slots.assign.extraSlots | 2 | Extra slots number when master assign 
slots. | 0.2.0 | 
+| celeborn.slots.assign.loadAware.diskGroupGradient | 0.1 | This value means 
how many more workload will be placed into a faster disk group than a slower 
group. | 0.2.0 | 
+| celeborn.slots.assign.loadAware.numDiskGroups | 5 | This configuration is a 
guidance for load-aware slot allocation algorithm. This value controls how 
many disk groups will be created. | 0.2.0 | 
+| celeborn.slots.assign.policy | ROUNDROBIN | Policy for master to assign 
slots, Celeborn supports two types of policy: roundrobin and loadaware. | 0.2.0 
| 
+| celeborn.worker.heartbeat.timeout | 120s | Worker heartbeat timeout. | 0.2.0 
| 
+<!--end-include-->
diff --git a/docs/configuration/metrics.md b/docs/configuration/metrics.md
new file mode 100644
index 0000000..2644ea0
--- /dev/null
+++ b/docs/configuration/metrics.md
@@ -0,0 +1,31 @@
+---
+license: |
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+  
+      https://www.apache.org/licenses/LICENSE-2.0
+  
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+---
+
+<!--begin-include-->
+| Key | Default | Description | Since |
+| --- | ------- | ----------- | ----- |
+| celeborn.master.metrics.prometheus.host | 0.0.0.0 | Master's Prometheus 
host. | 0.2.0 | 
+| celeborn.master.metrics.prometheus.port | 9098 | Master's Prometheus port. | 
0.2.0 | 
+| celeborn.metrics.capacity | 4096 | The maximum number of metrics which a 
source can use to generate output strings. | 0.2.0 | 
+| celeborn.metrics.collectPerfCritical.enabled | false | It controls whether 
to collect metrics which may affect performance. When enabled, Celeborn collects 
them. | 0.2.0 | 
+| celeborn.metrics.enabled | true | When true, enable metrics system. | 0.2.0 
| 
+| celeborn.metrics.sample.rate | 1.0 | It controls if Celeborn collects timer 
metrics for some operations. Its value should be in [0.0, 1.0]. | 0.2.0 | 
+| celeborn.metrics.timer.slidingWindow.size | 4096 | The sliding window size 
of timer metric. | 0.2.0 | 
+| celeborn.worker.metrics.prometheus.host | 0.0.0.0 | Worker's Prometheus 
host. | 0.2.0 | 
+| celeborn.worker.metrics.prometheus.port | 9096 | Worker's Prometheus port. | 
0.2.0 | 
+<!--end-include-->
diff --git a/docs/configuration/network.md b/docs/configuration/network.md
new file mode 100644
index 0000000..8724de5
--- /dev/null
+++ b/docs/configuration/network.md
@@ -0,0 +1,44 @@
+---
+license: |
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+  
+      https://www.apache.org/licenses/LICENSE-2.0
+  
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+---
+
+<!--begin-include-->
+| Key | Default | Description | Since |
+| --- | ------- | ----------- | ----- |
+| celeborn.&lt;module&gt;.decoder.mode | default | Netty TransportFrameDecoder 
implementation, available options: default, supplier. |  | 
+| celeborn.&lt;module&gt;.io.backLog | 0 | Requested maximum length of the 
queue of incoming connections. Default 0 for no backlog. |  | 
+| celeborn.&lt;module&gt;.io.clientThreads | 0 | Number of threads used in the 
client thread pool. Default to 0, which is 2x#cores. |  | 
+| celeborn.&lt;module&gt;.io.connectTimeout | &lt;value of 
celeborn.network.connect.timeout&gt; | Socket connect timeout. |  | 
+| celeborn.&lt;module&gt;.io.connectionTimeout | &lt;value of 
celeborn.network.timeout&gt; | Connection active timeout. |  | 
+| celeborn.&lt;module&gt;.io.enableVerboseMetrics | false | Whether to track 
Netty memory detailed metrics. If true, the detailed metrics of Netty 
PoolByteBufAllocator will be gotten, otherwise only general memory usage will 
be tracked. |  | 
+| celeborn.&lt;module&gt;.io.lazyFD | true | Whether to initialize 
FileDescriptor lazily or not. If true, file descriptors are created only when 
data is going to be transferred. This can reduce the number of open files. |  | 
+| celeborn.&lt;module&gt;.io.maxRetries | 3 | Max number of times we will try 
IO exceptions (such as connection timeouts) per request. If set to 0, we will 
not do any retries. |  | 
+| celeborn.&lt;module&gt;.io.mode | NIO | Netty EventLoopGroup backend, 
available options: NIO, EPOLL. |  | 
+| celeborn.&lt;module&gt;.io.numConnectionsPerPeer | 2 | Number of concurrent 
connections between two nodes. |  | 
+| celeborn.&lt;module&gt;.io.preferDirectBufs | true | If true, we will prefer 
allocating off-heap byte buffers within Netty. |  | 
+| celeborn.&lt;module&gt;.io.receiveBuffer | 0b | Receive buffer size 
(SO_RCVBUF). Note: the optimal size for receive buffer and send buffer should 
be latency * network_bandwidth. Assuming latency = 1ms, network_bandwidth = 
10Gbps buffer size should be ~ 1.25MB. | 0.2.0 | 
+| celeborn.&lt;module&gt;.io.retryWait | 5s | Time that we will wait in order 
to perform a retry after an IOException. Only relevant if maxIORetries > 0. | 
0.2.0 | 
+| celeborn.&lt;module&gt;.io.sendBuffer | 0b | Send buffer size (SO_SNDBUF). | 
0.2.0 | 
+| celeborn.&lt;module&gt;.io.serverThreads | 0 | Number of threads used in the 
server thread pool. Default to 0, which is 2x#cores. |  | 
+| celeborn.network.connect.timeout | 10s | Default socket connect timeout. | 
0.2.0 | 
+| celeborn.network.timeout | 240s | Default timeout for network operations. | 
0.2.0 | 
+| celeborn.port.maxRetries | 1 | When port is occupied, we will retry for max 
retry times. | 0.2.0 | 
+| celeborn.rpc.askTimeout | &lt;value of celeborn.network.timeout&gt; | 
Timeout for RPC ask operations. | 0.2.0 | 
+| celeborn.rpc.connect.threads | 64 |  | 0.2.0 | 
+| celeborn.rpc.lookupTimeout | 30s | Timeout for RPC lookup operations. | 
0.2.0 | 
+| celeborn.shuffle.maxChunksBeingTransferred | 9223372036854775807 | The max 
number of chunks allowed to be transferred at the same time on shuffle service. 
Note that new incoming connections will be closed when the max number is hit. 
The client will retry according to the shuffle retry configs (see 
`celeborn.shuffle.io.maxRetries` and `celeborn.shuffle.io.retryWait`), if those 
limits are reached the task will fail with fetch failure. | 0.2.0 | 
+<!--end-include-->
diff --git a/docs/configuration/quota.md b/docs/configuration/quota.md
new file mode 100644
index 0000000..7d124ce
--- /dev/null
+++ b/docs/configuration/quota.md
@@ -0,0 +1,26 @@
+---
+license: |
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+  
+      https://www.apache.org/licenses/LICENSE-2.0
+  
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+---
+
+<!--begin-include-->
+| Key | Default | Description | Since |
+| --- | ------- | ----------- | ----- |
+| celeborn.quota.configuration.path | &lt;undefined&gt; | Quota configuration 
file path. | 0.2.0 | 
+| celeborn.quota.enabled | true | When true, before registering shuffle, 
LifecycleManager should check if the current user has enough quota space. If the 
cluster doesn't have enough quota space for the current user, fall back to Spark's 
default shuffle. | 0.2.0 | 
+| celeborn.quota.identity.provider | 
org.apache.celeborn.common.identity.DefaultIdentityProvider | IdentityProvider 
class name. Default class is 
`org.apache.celeborn.common.identity.DefaultIdentityProvider`, return 
`org.apache.celeborn.common.identity.UserIdentifier` with default tenant id and 
username from `org.apache.hadoop.security.UserGroupInformation`.  | 0.2.0 | 
+| celeborn.quota.manager | 
org.apache.celeborn.common.quota.DefaultQuotaManager | QuotaManager class name. 
Default class is `org.apache.celeborn.common.quota.DefaultQuotaManager`. | 
0.2.0 | 
+<!--end-include-->
diff --git a/docs/configuration/worker.md b/docs/configuration/worker.md
new file mode 100644
index 0000000..03881f9
--- /dev/null
+++ b/docs/configuration/worker.md
@@ -0,0 +1,83 @@
+---
+license: |
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+  
+      https://www.apache.org/licenses/LICENSE-2.0
+  
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+---
+
+<!--begin-include-->
+| Key | Default | Description | Since |
+| --- | ------- | ----------- | ----- |
+| celeborn.client.maxRetries | 15 | Max retry times for client to connect 
master endpoint | 0.2.0 | 
+| celeborn.master.endpoints | &lt;localhost&gt;:9097 | Endpoints of master 
nodes for celeborn client to connect, allowed pattern is: 
`<host1>:<port1>[,<host2>:<port2>]*`, e.g. `clb1:9097,clb2:9098,clb3:9099`. If 
the port is omitted, 9097 will be used. | 0.2.0 | 
+| celeborn.metrics.capacity | 4096 | The maximum number of metrics which a 
source can use to generate output strings. | 0.2.0 | 
+| celeborn.metrics.collectPerfCritical.enabled | false | It controls whether 
to collect metrics which may affect performance. When enabled, Celeborn collects 
them. | 0.2.0 | 
+| celeborn.metrics.enabled | true | When true, enable metrics system. | 0.2.0 
| 
+| celeborn.metrics.sample.rate | 1.0 | It controls if Celeborn collects timer 
metrics for some operations. Its value should be in [0.0, 1.0]. | 0.2.0 | 
+| celeborn.metrics.timer.slidingWindow.size | 4096 | The sliding window size 
of timer metric. | 0.2.0 | 
+| celeborn.shuffle.chuck.size | 8m | Max chunk size of reducer's merged 
shuffle data. For example, if a reducer's shuffle data is 128M, the data 
will need 16 fetch chunk requests to fetch. | 0.2.0 | 
+| celeborn.shuffle.minPartitionSizeToEstimate | 8mb | Ignore partition size 
smaller than this configuration of partition size for estimation. | 0.2.0 | 
+| celeborn.storage.hdfs.dir | &lt;undefined&gt; | HDFS dir configuration for 
Celeborn to access HDFS. | 0.2.0 | 
+| celeborn.worker.closeIdleConnections | false | Whether worker will close 
idle connections. | 0.2.0 | 
+| celeborn.worker.commit.threads | 32 | Thread number of worker to commit 
shuffle data files asynchronously. | 0.2.0 | 
+| celeborn.worker.directMemoryRatioToPauseReceive | 0.85 | If direct memory 
usage reaches this limit, the worker will stop to receive data from Celeborn 
shuffle clients. | 0.2.0 | 
+| celeborn.worker.directMemoryRatioToPauseReplicate | 0.95 | If direct memory 
usage reaches this limit, the worker will stop to receive replication data from 
other workers. | 0.2.0 | 
+| celeborn.worker.directMemoryRatioToResume | 0.5 | If direct memory usage is 
less than this limit, worker will resume. | 0.2.0 | 
+| celeborn.worker.disk.check.timeout | 30s | Timeout time for worker check 
device status. | 0.2.0 | 
+| celeborn.worker.disk.checkFileClean.maxRetries | 3 | The number of retries 
for a worker to check if the working directory is cleaned up before registering 
with the master. | 0.2.0 | 
+| celeborn.worker.disk.checkFileClean.timeout | 1000ms | The wait time per 
retry for a worker to check if the working directory is cleaned up before 
registering with the master. | 0.2.0 | 
+| celeborn.worker.disk.reserve.size | 5G | Celeborn worker reserved space for 
each disk. | 0.2.0 | 
+| celeborn.worker.fetch.io.threads | &lt;undefined&gt; | Netty IO thread 
number of worker to handle client fetch data. The default threads number is 
`size(celeborn.worker.storage.dirs)*2`. | 0.2.0 | 
+| celeborn.worker.fetch.port | 0 | Server port for Worker to receive fetch 
data request from ShuffleClient. | 0.2.0 | 
+| celeborn.worker.flusher.avgFlushTime.slidingWindow.size | 20 | The size of 
sliding windows used to calculate statistics about flushed time and count. | 
0.2.0 | 
+| celeborn.worker.flusher.buffer.size | 256k | Size of buffer used by a single 
flusher. | 0.2.0 | 
+| celeborn.worker.flusher.hdd.threads | 1 | Flusher's thread count per disk 
used for write data to HDD disks. | 0.2.0 | 
+| celeborn.worker.flusher.hdfs.threads | 4 | Flusher's thread count used for 
write data to HDFS. | 0.2.0 | 
+| celeborn.worker.flusher.shutdownTimeout | 3s | Timeout for a flusher to 
shutdown. | 0.2.0 | 
+| celeborn.worker.flusher.ssd.threads | 8 | Flusher's thread count per disk 
used for write data to SSD disks. | 0.2.0 | 
+| celeborn.worker.graceful.shutdown.checkSlotsFinished.interval | 1s | The 
wait interval of checking whether all released slots to be committed or 
destroyed during worker graceful shutdown | 0.2.0 | 
+| celeborn.worker.graceful.shutdown.checkSlotsFinished.timeout | 480s | The 
wait time of waiting for the released slots to be committed or destroyed during 
worker graceful shutdown. | 0.2.0 | 
+| celeborn.worker.graceful.shutdown.enabled | false | When true, during worker 
shutdown, the worker will wait for all released slots to be committed or 
destroyed. | 0.2.0 | 
+| celeborn.worker.graceful.shutdown.partitionSorter.shutdownTimeout | 120s | 
The wait time of waiting for sorting partition files during worker graceful 
shutdown. | 0.2.0 | 
+| celeborn.worker.graceful.shutdown.recoverPath | &lt;tmp&gt;/recover | The 
path to store levelDB. | 0.2.0 | 
+| celeborn.worker.graceful.shutdown.timeout | 600s | The worker's graceful 
shutdown timeout time. | 0.2.0 | 
+| celeborn.worker.heartbeat.timeout | 120s | Worker heartbeat timeout. | 0.2.0 
| 
+| celeborn.worker.memory.checkInterval | 10ms | Interval of worker direct 
memory checking. | 0.2.0 | 
+| celeborn.worker.memory.reportInterval | 10s | Interval of worker direct 
memory tracker reporting to log. | 0.2.0 | 
+| celeborn.worker.metrics.prometheus.host | 0.0.0.0 | Worker's Prometheus 
host. | 0.2.0 | 
+| celeborn.worker.metrics.prometheus.port | 9096 | Worker's Prometheus port. | 
0.2.0 | 
+| celeborn.worker.monitor.disk.checkInterval | 60s | Intervals between device 
monitor to check disk. | 0.2.0 | 
+| celeborn.worker.monitor.disk.checklist | readwrite,diskusage | Monitor type 
for disk, available items are: iohang, readwrite and diskusage. | 0.2.0 | 
+| celeborn.worker.monitor.disk.enabled | true | When true, worker will monitor 
device and report to master. | 0.2.0 | 
+| celeborn.worker.monitor.disk.sys.block.dir | /sys/block | The directory 
where linux file block information is stored. | 0.2.0 | 
+| celeborn.worker.noneEmptyDirExpireDuration | 1d | If a non-empty application 
shuffle data dir has not been operated on during the duration time, this 
application will be marked as expired. | 0.2.0 | 
+| celeborn.worker.partitionSorter.directMemoryRatioThreshold | 0.1 | Max ratio 
of partition sorter's memory for sorting, when reserved memory is higher than 
max partition sorter memory, partition sorter will stop sorting. | 0.2.0 | 
+| celeborn.worker.partitionSorter.reservedMemoryPerPartition | 1mb | Initial 
reserve memory when sorting a shuffle file off-heap. | 0.2.0 | 
+| celeborn.worker.partitionSorter.sort.timeout | 220s | Timeout for a shuffle 
file to sort. | 0.2.0 | 
+| celeborn.worker.push.io.threads | &lt;undefined&gt; | Netty IO thread number 
of worker to handle client push data. The default threads number is 
`size(celeborn.worker.storage.dirs)*2`. | 0.2.0 | 
+| celeborn.worker.push.port | 0 | Server port for Worker to receive push data 
request from ShuffleClient. | 0.2.0 | 
+| celeborn.worker.register.timeout | 180s | Worker register timeout. | 0.2.0 | 
+| celeborn.worker.replicate.fastFail.duration | 60s | If a replicate request 
is not replied to during the duration, worker will mark the replicate data request as 
failed. | 0.2.0 | 
+| celeborn.worker.replicate.io.threads | &lt;undefined&gt; | Netty IO thread 
number of worker to replicate shuffle data. The default threads number is 
`size(celeborn.worker.storage.dirs)*2`. | 0.2.0 | 
+| celeborn.worker.replicate.port | 0 | Server port for Worker to receive 
replicate data request from other Workers. | 0.2.0 | 
+| celeborn.worker.replicate.threads | 64 | Thread number of worker to 
replicate shuffle data. | 0.2.0 | 
+| celeborn.worker.rpc.port | 0 | Server port for Worker to receive RPC 
request. | 0.2.0 | 
+| celeborn.worker.shuffle.commit.timeout | 120s | Timeout for a Celeborn 
worker to commit files of a shuffle. | 0.2.0 | 
+| celeborn.worker.storage.baseDir.number | 16 | How many directories will be 
used if `celeborn.worker.storage.dirs` is not set. The directory name is a 
combination of `celeborn.worker.storage.baseDir.prefix` and from one(inclusive) 
to `celeborn.worker.storage.baseDir.number`(inclusive) step by one. | 0.2.0 | 
+| celeborn.worker.storage.baseDir.prefix | /mnt/disk | Base directory for 
Celeborn worker to write if `celeborn.worker.storage.dirs` is not set. | 0.2.0 
| 
+| celeborn.worker.storage.dirs | &lt;undefined&gt; | Directory list to store 
shuffle data. It's recommended to configure one directory on each disk. Storage 
size limit can be set for each directory. For the sake of performance, there 
should be no more than 2 flush threads on the same disk partition if you are 
using HDD, and should be 8 or more flush threads on the same disk partition if 
you are using SSD. For example: 
`dir1[:capacity=][:disktype=][:flushthread=],dir2[:capacity=][:disktyp [...]
+| celeborn.worker.workingDir | hadoop/rss-worker/shuffle_data | Worker's 
working dir path name. | 0.2.0 | 
+| celeborn.worker.writer.close.timeout | 120s | Timeout for a file writer to 
close | 0.2.0 | 
+| celeborn.worker.writer.create.maxAttempts | 3 | Retry count for a file 
writer to create if its creation was failed. | 0.2.0 | 
+<!--end-include-->
diff --git a/docs/contrib/docs_and_website.md b/docs/contrib/docs_and_website.md
new file mode 100644
index 0000000..4d61941
--- /dev/null
+++ b/docs/contrib/docs_and_website.md
@@ -0,0 +1,66 @@
+---
+license: |
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+      http://www.apache.org/licenses/LICENSE-2.0
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+---
+
+Docs and Website
+===
+
+## Setup Python
+
+Follow the [Python official 
document](https://wiki.python.org/moin/BeginnersGuide) to install.
+
+## Setup `pyenv` on macOS (optional)
+
+Optionally, we recommend managing Python environments with 
[pyenv](https://github.com/pyenv/pyenv).
+
+Install from Homebrew
+
+```bash
+brew install pyenv pyenv-virtualenv
+```
+
+Setup in `~/.zshrc`
+
+```bash
+eval "$(pyenv init -)"
+eval "$(pyenv virtualenv-init -)"
+```
+
+Install `virtualenv`
+
+```bash
+pyenv install 3.9.13
+pyenv virtualenv 3.9.13 rss
+```
+
+Localize `virtualenv`
+
+```bash
+pyenv local rss
+```
+
+## Install dependencies
+
+```bash
+pip install -r requirements.txt
+```
+
+## Preview website
+
+```
+mkdocs serve
+```
+
+Open [http://127.0.0.1:8000/](http://127.0.0.1:8000/) in browser.
diff --git a/docs/index.md b/docs/index.md
new file mode 100644
index 0000000..578acbf
--- /dev/null
+++ b/docs/index.md
@@ -0,0 +1,27 @@
+---
+hide:
+  - navigation
+
+license: |
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+      http://www.apache.org/licenses/LICENSE-2.0
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+---
+
+Overview
+===
+
+Celeborn is dedicated to improving the efficiency and elasticity of
+different map-reduce engines. RSS provides an elastic and highly efficient 
management service
+for shuffle data.
+
+The current stable version is {{ stable_version }}
diff --git a/docs/storage-usage-quota.md b/docs/storage-usage-quota.md
new file mode 100644
index 0000000..3fb1b68
--- /dev/null
+++ b/docs/storage-usage-quota.md
@@ -0,0 +1,79 @@
+---
+hide:
+- navigation
+
+license: |
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+      http://www.apache.org/licenses/LICENSE-2.0
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+---
+
+Storage Usage Quota Guide
+===
+This documentation describes how RSS limits user's storage resource usage by 
setting quota.
+
+## Quota limitation
+
+RSS has a configurable user storage quota system. This allows RSS admin to 
manage each user's
+max resource usage to keep RSS cluster more stable. This feature can avoid RSS 
cluster resources
+being occupied by the minority with huge applications.
+
+Workers in the RSS cluster collect each user's resource consumption and report 
the information
+to the master in register and heartbeat messages. When `LifecycleManager` registers a 
shuffle with the RSS master,
+the current user's resource usage is checked; if the used resources exceed the quota 
setting,
+the shuffle will fall back to ESS.
+
+## Storage resource
+
+Currently, RSS supports two storage levels:
+  1. Local disk.
+  2. HDFS.
+
+And there are two levels of resources: 
+  1. written bytes.
+  2. written file numbers.
+
+So, we now support four quota settings:
+
+  1. diskBytesWritten
+  2. diskFileCount
+  3. hdfsBytesWritten
+  4. hdfsFileCount
+
+If not set, the default quota value is `-1`, which means there is no limit for this user.
+
+## Configuration
+
+The quota system is configured via a configuration yaml file that RSS expects 
to be present at
+`$RSS_HOME/conf/quota.yaml`.  A custom file location can be specified via the
+`celeborn.quota.configuration.path` configuration property. The quota yaml 
configuration
+file should be organized as a list, each part setting one user's quota. In the 
`quota` section, set each quota's threshold value.
+Notice: quota value should be numeric value that can be cast to `Long` type.
+For example:
+
+```text
+-  tenantId: AAA
+   name: Tom
+   quota:
+     diskBytesWritten: 10000
+     diskFileCount: 200
+     hdfsBytesWritten: -1
+     hdfsFileCount: -1
+
+-  tenantId: BBB
+   name: Jerry
+   quota:
+     diskBytesWritten: -1
+     diskFileCount: -1
+     hdfsBytesWritten: 10000
+     hdfsFileCount: 200
+```
diff --git a/mkdocs.yml b/mkdocs.yml
new file mode 100644
index 0000000..cfa3d72
--- /dev/null
+++ b/mkdocs.yml
@@ -0,0 +1,56 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+site_name: Apache Celeborn (Incubating)
+repo_name: apache/incubator-celeborn
+repo_url: https://github.com/apache/incubator-celeborn
+
+plugins:
+  - search
+  - macros
+  - include-markdown:
+      opening_tag: "{!"
+      closing_tag: "!}"
+
+theme:
+  name: material
+  language: en
+  features:
+    - navigation.indexes
+    - navigation.tabs
+    - navigation.tabs.sticky
+    - navigation.top
+    - navigation.tracking
+
+markdown_extensions:
+  - admonition
+  - attr_list
+  - def_list
+  - md_in_html
+
+extra:
+  version: 0.2.0-SNAPSHOT
+  stable_version: 0.1.2
+  social:
+    - icon: fontawesome/brands/github
+
+nav:
+  - Home: index.md
+  - Configuration:
+      - configuration/index.md
+  - Contributor:
+      - Docs and Website: contrib/docs_and_website.md
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..f7c1ac4
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,40 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+click==8.1.3
+ghp-import==2.1.0
+importlib-metadata==4.12.0
+Jinja2==3.1.2
+Markdown==3.3.7
+MarkupSafe==2.1.1
+mergedeep==1.3.4
+mkdocs==1.3.1
+mkdocs-include-markdown-plugin==3.8.1
+mkdocs-macros-plugin==0.7.0
+mkdocs-material==8.3.9
+mkdocs-material-extensions==1.0.3
+packaging==21.3
+Pygments==2.12.0
+pymdown-extensions==9.5
+pyparsing==3.0.9
+python-dateutil==2.8.2
+PyYAML==6.0
+pyyaml_env_tag==0.1
+six==1.16.0
+termcolor==1.1.0
+watchdog==2.1.9
+zipp==3.8.1

Reply via email to