This is an automated email from the ASF dual-hosted git repository.
chengpan pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-celeborn.git
The following commit(s) were added to refs/heads/main by this push:
new aa3bb0ac3 [CELEBORN-679] Optimize `Utils#bytesToString`
aa3bb0ac3 is described below
commit aa3bb0ac3b11d3fffeadcd6104e9c3792860c7da
Author: Fu Chen <[email protected]>
AuthorDate: Wed Jun 14 17:42:16 2023 +0800
[CELEBORN-679] Optimize `Utils#bytesToString`
### What changes were proposed in this pull request?
refer to https://github.com/apache/spark/pull/40301
1. Optimize `Utils.bytesToString`. Arithmetic ops on BigInt and BigDecimal
are order(s) of magnitude slower than the ops on primitive types. Division is
an especially slow operation and it is used en masse here.
2. According to the information sourced from
[Wikipedia](https://en.wikipedia.org/wiki/Kilobyte), it is established that
1000 is the appropriate factor for representing kilobytes (KB), while 1024 is
the correct factor for kibibytes (KiB). In alignment with this understanding,
changing the size unit from "KB" to "KiB".
### Why are the changes needed?
the Utils#bytesToString method is frequently employed in memory-related log
messages.
### Does this PR introduce _any_ user-facing change?
No, only perf improvement.
### How was this patch tested?
existing UT and manually tested.
Closes #1590 from cfmcgrady/bytesToString.
Authored-by: Fu Chen <[email protected]>
Signed-off-by: Cheng Pan <[email protected]>
---
.../org/apache/celeborn/common/util/Utils.scala | 49 ++++++++++------------
.../celeborn/common/meta/WorkerInfoSuite.scala | 2 +-
.../apache/celeborn/common/quota/QuotaSuite.scala | 2 +-
.../apache/celeborn/common/util/UtilsSuite.scala | 6 +--
docs/configuration/columnar-shuffle.md | 2 +-
5 files changed, 27 insertions(+), 34 deletions(-)
diff --git a/common/src/main/scala/org/apache/celeborn/common/util/Utils.scala
b/common/src/main/scala/org/apache/celeborn/common/util/Utils.scala
index 4010e353c..158d4287c 100644
--- a/common/src/main/scala/org/apache/celeborn/common/util/Utils.scala
+++ b/common/src/main/scala/org/apache/celeborn/common/util/Utils.scala
@@ -93,38 +93,31 @@ object Utils extends Logging {
(JavaUtils.byteStringAsBytes(str) / 1024 / 1024).toInt
}
- def bytesToString(size: Long): String = bytesToString(BigInt(size))
+ private[this] val siByteSizes =
+ Array(1L << 60, 1L << 50, 1L << 40, 1L << 30, 1L << 20, 1L << 10, 1)
+ private[this] val siByteSuffixes =
+ Array("EiB", "PiB", "TiB", "GiB", "MiB", "KiB", "B")
+
+ /**
+ * Convert a quantity in bytes to a human-readable string such as "4.0 MiB".
+ */
+ def bytesToString(size: Long): String = {
+ var i = 0
+ while (i < siByteSizes.length - 1 && size < 2 * siByteSizes(i)) i += 1
+ "%.1f %s".formatLocal(Locale.US, size.toDouble / siByteSizes(i),
siByteSuffixes(i))
+ }
def bytesToString(size: BigInt): String = {
- val EB = 1L << 60
- val PB = 1L << 50
- val TB = 1L << 40
- val GB = 1L << 30
- val MB = 1L << 20
- val KB = 1L << 10
-
- if (size >= BigInt(1L << 11) * EB) {
+ val EiB = 1L << 60
+ if (size.isValidLong) {
+ // Common case, most sizes fit in 64 bits and all ops on BigInt are
order(s) of magnitude
+ // slower than Long/Double.
+ bytesToString(size.toLong)
+ } else if (size < BigInt(2L << 10) * EiB) {
+ "%.1f EiB".formatLocal(Locale.US, BigDecimal(size) / EiB)
+ } else {
// The number is too large, show it in scientific notation.
BigDecimal(size, new MathContext(3, RoundingMode.HALF_UP)).toString() +
" B"
- } else {
- val (value, unit) = {
- if (size >= 2 * EB) {
- (BigDecimal(size) / EB, "EB")
- } else if (size >= 2 * PB) {
- (BigDecimal(size) / PB, "PB")
- } else if (size >= 2 * TB) {
- (BigDecimal(size) / TB, "TB")
- } else if (size >= 2 * GB) {
- (BigDecimal(size) / GB, "GB")
- } else if (size >= 2 * MB) {
- (BigDecimal(size) / MB, "MB")
- } else if (size >= 2 * KB) {
- (BigDecimal(size) / KB, "KB")
- } else {
- (BigDecimal(size), "B")
- }
- }
- "%.1f %s".formatLocal(Locale.US, value, unit)
}
}
diff --git
a/common/src/test/scala/org/apache/celeborn/common/meta/WorkerInfoSuite.scala
b/common/src/test/scala/org/apache/celeborn/common/meta/WorkerInfoSuite.scala
index 23f1a36c2..2a8b41b0c 100644
---
a/common/src/test/scala/org/apache/celeborn/common/meta/WorkerInfoSuite.scala
+++
b/common/src/test/scala/org/apache/celeborn/common/meta/WorkerInfoSuite.scala
@@ -303,7 +303,7 @@ class WorkerInfoSuite extends CelebornFunSuite {
| DiskInfo1: DiskInfo(maxSlots: 0, committed shuffles 0
shuffleAllocations: Map(), mountPoint: disk1, usableSpace: 2147483647,
avgFlushTime: 1, avgFetchTime: 1, activeSlots: 10) status: HEALTHY dirs
$placeholder
| DiskInfo2: DiskInfo(maxSlots: 0, committed shuffles 0
shuffleAllocations: Map(), mountPoint: disk2, usableSpace: 2147483647,
avgFlushTime: 2, avgFetchTime: 2, activeSlots: 20) status: HEALTHY dirs
$placeholder
|UserResourceConsumption: $placeholder
- | UserIdentifier: `tenant1`.`name1`, ResourceConsumption:
ResourceConsumption(diskBytesWritten: 20.0 MB, diskFileCount: 1,
hdfsBytesWritten: 50.0 MB, hdfsFileCount: 1)
+ | UserIdentifier: `tenant1`.`name1`, ResourceConsumption:
ResourceConsumption(diskBytesWritten: 20.0 MiB, diskFileCount: 1,
hdfsBytesWritten: 50.0 MiB, hdfsFileCount: 1)
|WorkerRef: NettyRpcEndpointRef(rss://mockRpc@localhost:12345)
|""".stripMargin;
diff --git
a/common/src/test/scala/org/apache/celeborn/common/quota/QuotaSuite.scala
b/common/src/test/scala/org/apache/celeborn/common/quota/QuotaSuite.scala
index 11666ce2e..39d58d281 100644
--- a/common/src/test/scala/org/apache/celeborn/common/quota/QuotaSuite.scala
+++ b/common/src/test/scala/org/apache/celeborn/common/quota/QuotaSuite.scala
@@ -49,7 +49,7 @@ class QuotaSuite extends BaseQuotaManagerSuite {
false,
"User `mock`.`mock` used diskBytesWritten (1000.0 B) exceeds quota
(100.0 B). " +
"User `mock`.`mock` used diskFileCount(2000) exceeds quota(200). " +
- "User `mock`.`mock` used hdfsBytesWritten(2.9 KB) exceeds quota(300.0
B). " +
+ "User `mock`.`mock` used hdfsBytesWritten(2.9 KiB) exceeds quota(300.0
B). " +
"User `mock`.`mock` used hdfsFileCount(4000) exceeds quota(400). ")
assert(res1 == exp1)
diff --git
a/common/src/test/scala/org/apache/celeborn/common/util/UtilsSuite.scala
b/common/src/test/scala/org/apache/celeborn/common/util/UtilsSuite.scala
index 28f4c1a2c..566392b2a 100644
--- a/common/src/test/scala/org/apache/celeborn/common/util/UtilsSuite.scala
+++ b/common/src/test/scala/org/apache/celeborn/common/util/UtilsSuite.scala
@@ -55,9 +55,9 @@ class UtilsSuite extends CelebornFunSuite {
}
test("bytesToString") {
- assert("16.0 KB" == Utils.bytesToString(16384))
- assert("16.0 MB" == Utils.bytesToString(16777216))
- assert("16.0 GB" == Utils.bytesToString(17179869184L))
+ assert("16.0 KiB" == Utils.bytesToString(16384))
+ assert("16.0 MiB" == Utils.bytesToString(16777216))
+ assert("16.0 GiB" == Utils.bytesToString(17179869184L))
}
test("extractHostPortFromRssUrl") {
diff --git a/docs/configuration/columnar-shuffle.md
b/docs/configuration/columnar-shuffle.md
index abdd4b3e8..c3581605d 100644
--- a/docs/configuration/columnar-shuffle.md
+++ b/docs/configuration/columnar-shuffle.md
@@ -23,6 +23,6 @@ license: |
| celeborn.columnarShuffle.codegen.enabled | false | Whether to use codegen
for columnar-based shuffle. | 0.3.0 |
| celeborn.columnarShuffle.enabled | false | Whether to enable columnar-based
shuffle. | 0.2.0 |
| celeborn.columnarShuffle.encoding.dictionary.enabled | false | Whether to
use dictionary encoding for columnar-based shuffle data. | 0.3.0 |
-| celeborn.columnarShuffle.encoding.dictionary.maxFactor | 0.3 | Max factor
for dictionary size. The max dictionary size is `min(32.0 KB,
celeborn.columnarShuffle.batch.size *
celeborn.columnar.shuffle.encoding.dictionary.maxFactor)`. | 0.3.0 |
+| celeborn.columnarShuffle.encoding.dictionary.maxFactor | 0.3 | Max factor
for dictionary size. The max dictionary size is `min(32.0 KiB,
celeborn.columnarShuffle.batch.size *
celeborn.columnar.shuffle.encoding.dictionary.maxFactor)`. | 0.3.0 |
| celeborn.columnarShuffle.offHeap.enabled | false | Whether to use off heap
columnar vector. | 0.3.0 |
<!--end-include-->