This is an automated email from the ASF dual-hosted git repository.

chengpan pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-celeborn.git


The following commit(s) were added to refs/heads/main by this push:
     new aa3bb0ac3 [CELEBORN-679] Optimize `Utils#bytesToString`
aa3bb0ac3 is described below

commit aa3bb0ac3b11d3fffeadcd6104e9c3792860c7da
Author: Fu Chen <[email protected]>
AuthorDate: Wed Jun 14 17:42:16 2023 +0800

    [CELEBORN-679] Optimize `Utils#bytesToString`
    
    ### What changes were proposed in this pull request?
    
    refer to https://github.com/apache/spark/pull/40301
    
    1. Optimize `Utils.bytesToString`. Arithmetic ops on BigInt and BigDecimal 
are order(s) of magnitude slower than the ops on primitive types. Division is 
an especially slow operation and it is used en masse here.
    
    2. According to the information sourced from 
[Wikipedia](https://en.wikipedia.org/wiki/Kilobyte), it is established that 
1000 is the appropriate factor for representing kilobytes (KB), while 1024 is 
the correct factor for kibibytes (KiB). In alignment with this understanding, 
changing the size unit from "KB" to "KiB".
    
    ### Why are the changes needed?
    
    the Utils#bytesToString method is frequently employed in memory-related log 
messages.
    
    ### Does this PR introduce _any_ user-facing change?
    
    No, only perf improvement.
    
    ### How was this patch tested?
    
    existing UT and manually tested.
    
    Closes #1590 from cfmcgrady/bytesToString.
    
    Authored-by: Fu Chen <[email protected]>
    Signed-off-by: Cheng Pan <[email protected]>
---
 .../org/apache/celeborn/common/util/Utils.scala    | 49 ++++++++++------------
 .../celeborn/common/meta/WorkerInfoSuite.scala     |  2 +-
 .../apache/celeborn/common/quota/QuotaSuite.scala  |  2 +-
 .../apache/celeborn/common/util/UtilsSuite.scala   |  6 +--
 docs/configuration/columnar-shuffle.md             |  2 +-
 5 files changed, 27 insertions(+), 34 deletions(-)

diff --git a/common/src/main/scala/org/apache/celeborn/common/util/Utils.scala 
b/common/src/main/scala/org/apache/celeborn/common/util/Utils.scala
index 4010e353c..158d4287c 100644
--- a/common/src/main/scala/org/apache/celeborn/common/util/Utils.scala
+++ b/common/src/main/scala/org/apache/celeborn/common/util/Utils.scala
@@ -93,38 +93,31 @@ object Utils extends Logging {
     (JavaUtils.byteStringAsBytes(str) / 1024 / 1024).toInt
   }
 
-  def bytesToString(size: Long): String = bytesToString(BigInt(size))
+  private[this] val siByteSizes =
+    Array(1L << 60, 1L << 50, 1L << 40, 1L << 30, 1L << 20, 1L << 10, 1)
+  private[this] val siByteSuffixes =
+    Array("EiB", "PiB", "TiB", "GiB", "MiB", "KiB", "B")
+
+  /**
+   * Convert a quantity in bytes to a human-readable string such as "4.0 MiB".
+   */
+  def bytesToString(size: Long): String = {
+    var i = 0
+    while (i < siByteSizes.length - 1 && size < 2 * siByteSizes(i)) i += 1
+    "%.1f %s".formatLocal(Locale.US, size.toDouble / siByteSizes(i), 
siByteSuffixes(i))
+  }
 
   def bytesToString(size: BigInt): String = {
-    val EB = 1L << 60
-    val PB = 1L << 50
-    val TB = 1L << 40
-    val GB = 1L << 30
-    val MB = 1L << 20
-    val KB = 1L << 10
-
-    if (size >= BigInt(1L << 11) * EB) {
+    val EiB = 1L << 60
+    if (size.isValidLong) {
+      // Common case, most sizes fit in 64 bits and all ops on BigInt are 
order(s) of magnitude
+      // slower than Long/Double.
+      bytesToString(size.toLong)
+    } else if (size < BigInt(2L << 10) * EiB) {
+      "%.1f EiB".formatLocal(Locale.US, BigDecimal(size) / EiB)
+    } else {
       // The number is too large, show it in scientific notation.
       BigDecimal(size, new MathContext(3, RoundingMode.HALF_UP)).toString() + 
" B"
-    } else {
-      val (value, unit) = {
-        if (size >= 2 * EB) {
-          (BigDecimal(size) / EB, "EB")
-        } else if (size >= 2 * PB) {
-          (BigDecimal(size) / PB, "PB")
-        } else if (size >= 2 * TB) {
-          (BigDecimal(size) / TB, "TB")
-        } else if (size >= 2 * GB) {
-          (BigDecimal(size) / GB, "GB")
-        } else if (size >= 2 * MB) {
-          (BigDecimal(size) / MB, "MB")
-        } else if (size >= 2 * KB) {
-          (BigDecimal(size) / KB, "KB")
-        } else {
-          (BigDecimal(size), "B")
-        }
-      }
-      "%.1f %s".formatLocal(Locale.US, value, unit)
     }
   }
 
diff --git 
a/common/src/test/scala/org/apache/celeborn/common/meta/WorkerInfoSuite.scala 
b/common/src/test/scala/org/apache/celeborn/common/meta/WorkerInfoSuite.scala
index 23f1a36c2..2a8b41b0c 100644
--- 
a/common/src/test/scala/org/apache/celeborn/common/meta/WorkerInfoSuite.scala
+++ 
b/common/src/test/scala/org/apache/celeborn/common/meta/WorkerInfoSuite.scala
@@ -303,7 +303,7 @@ class WorkerInfoSuite extends CelebornFunSuite {
          |  DiskInfo1: DiskInfo(maxSlots: 0, committed shuffles 0 
shuffleAllocations: Map(), mountPoint: disk1, usableSpace: 2147483647, 
avgFlushTime: 1, avgFetchTime: 1, activeSlots: 10) status: HEALTHY dirs 
$placeholder
          |  DiskInfo2: DiskInfo(maxSlots: 0, committed shuffles 0 
shuffleAllocations: Map(), mountPoint: disk2, usableSpace: 2147483647, 
avgFlushTime: 2, avgFetchTime: 2, activeSlots: 20) status: HEALTHY dirs 
$placeholder
          |UserResourceConsumption: $placeholder
-         |  UserIdentifier: `tenant1`.`name1`, ResourceConsumption: 
ResourceConsumption(diskBytesWritten: 20.0 MB, diskFileCount: 1, 
hdfsBytesWritten: 50.0 MB, hdfsFileCount: 1)
+         |  UserIdentifier: `tenant1`.`name1`, ResourceConsumption: 
ResourceConsumption(diskBytesWritten: 20.0 MiB, diskFileCount: 1, 
hdfsBytesWritten: 50.0 MiB, hdfsFileCount: 1)
          |WorkerRef: NettyRpcEndpointRef(rss://mockRpc@localhost:12345)
          |""".stripMargin;
 
diff --git 
a/common/src/test/scala/org/apache/celeborn/common/quota/QuotaSuite.scala 
b/common/src/test/scala/org/apache/celeborn/common/quota/QuotaSuite.scala
index 11666ce2e..39d58d281 100644
--- a/common/src/test/scala/org/apache/celeborn/common/quota/QuotaSuite.scala
+++ b/common/src/test/scala/org/apache/celeborn/common/quota/QuotaSuite.scala
@@ -49,7 +49,7 @@ class QuotaSuite extends BaseQuotaManagerSuite {
       false,
       "User `mock`.`mock` used diskBytesWritten (1000.0 B) exceeds quota 
(100.0 B). " +
         "User `mock`.`mock` used diskFileCount(2000) exceeds quota(200). " +
-        "User `mock`.`mock` used hdfsBytesWritten(2.9 KB) exceeds quota(300.0 
B). " +
+        "User `mock`.`mock` used hdfsBytesWritten(2.9 KiB) exceeds quota(300.0 
B). " +
         "User `mock`.`mock` used hdfsFileCount(4000) exceeds quota(400). ")
 
     assert(res1 == exp1)
diff --git 
a/common/src/test/scala/org/apache/celeborn/common/util/UtilsSuite.scala 
b/common/src/test/scala/org/apache/celeborn/common/util/UtilsSuite.scala
index 28f4c1a2c..566392b2a 100644
--- a/common/src/test/scala/org/apache/celeborn/common/util/UtilsSuite.scala
+++ b/common/src/test/scala/org/apache/celeborn/common/util/UtilsSuite.scala
@@ -55,9 +55,9 @@ class UtilsSuite extends CelebornFunSuite {
   }
 
   test("bytesToString") {
-    assert("16.0 KB" == Utils.bytesToString(16384))
-    assert("16.0 MB" == Utils.bytesToString(16777216))
-    assert("16.0 GB" == Utils.bytesToString(17179869184L))
+    assert("16.0 KiB" == Utils.bytesToString(16384))
+    assert("16.0 MiB" == Utils.bytesToString(16777216))
+    assert("16.0 GiB" == Utils.bytesToString(17179869184L))
   }
 
   test("extractHostPortFromRssUrl") {
diff --git a/docs/configuration/columnar-shuffle.md 
b/docs/configuration/columnar-shuffle.md
index abdd4b3e8..c3581605d 100644
--- a/docs/configuration/columnar-shuffle.md
+++ b/docs/configuration/columnar-shuffle.md
@@ -23,6 +23,6 @@ license: |
 | celeborn.columnarShuffle.codegen.enabled | false | Whether to use codegen 
for columnar-based shuffle. | 0.3.0 | 
 | celeborn.columnarShuffle.enabled | false | Whether to enable columnar-based 
shuffle. | 0.2.0 | 
 | celeborn.columnarShuffle.encoding.dictionary.enabled | false | Whether to 
use dictionary encoding for columnar-based shuffle data. | 0.3.0 | 
-| celeborn.columnarShuffle.encoding.dictionary.maxFactor | 0.3 | Max factor 
for dictionary size. The max dictionary size is `min(32.0 KB, 
celeborn.columnarShuffle.batch.size * 
celeborn.columnar.shuffle.encoding.dictionary.maxFactor)`. | 0.3.0 | 
+| celeborn.columnarShuffle.encoding.dictionary.maxFactor | 0.3 | Max factor 
for dictionary size. The max dictionary size is `min(32.0 KiB, 
celeborn.columnarShuffle.batch.size * 
celeborn.columnar.shuffle.encoding.dictionary.maxFactor)`. | 0.3.0 | 
 | celeborn.columnarShuffle.offHeap.enabled | false | Whether to use off heap 
columnar vector. | 0.3.0 | 
 <!--end-include-->

Reply via email to