ryan-johnson-databricks commented on code in PR #40982:
URL: https://github.com/apache/spark/pull/40982#discussion_r1191590888


##########
core/src/test/scala/org/apache/spark/util/NonFateSharingCacheSuite.scala:
##########
@@ -0,0 +1,147 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.util;
+
+import java.util.concurrent.ExecutionException
+import java.util.concurrent.Semaphore
+import java.util.concurrent.atomic.AtomicReference
+
+import com.google.common.cache.CacheBuilder
+import com.google.common.cache.CacheLoader
+
+import org.apache.spark.SparkFunSuite
+
+object NonFateSharingCacheSuite {
+  private val TEST_KEY = "key"
+  private val FAIL_MESSAGE = "loading failed"
+  private val THREAD2_HOLDER = new AtomicReference[Thread](null)
+
+  class TestCacheLoader extends CacheLoader[String, String] {
+    var intentionalFail: ThreadLocal[Boolean] = ThreadLocal.withInitial(() => 
false)
+    var startLoading = new Semaphore(0)
+
+    def waitUntilThread2Waiting(): Unit = {
+      while (true) {
+        Thread.sleep(100)
+        val t2 = THREAD2_HOLDER.get()
+        if (t2 != null && t2.getState.equals(Thread.State.WAITING)) {
+          return
+        }
+      }
+    }
+
+    override def load(key: String): String = {
+      startLoading.release()
+      if (Thread.currentThread().getName.contains("test-executor1")) {
+        waitUntilThread2Waiting()
+      }
+      if (intentionalFail.get) throw new RuntimeException(FAIL_MESSAGE)
+      key
+    }
+  }
+}
+
+/**
+ * Test non-fate-sharing behavior
+ */
+class NonFateSharingCacheSuite extends SparkFunSuite {
+
+  type WorkerFunc = () => Unit
+
+  import NonFateSharingCacheSuite._
+
+  test("test LoadingCache") {
+    val loader = new TestCacheLoader
+    val loadingCache: NonFateSharingLoadingCache[String, String] =
+      NonFateSharingCache(CacheBuilder.newBuilder.build(loader))
+    val thread1Task: WorkerFunc = () => {
+      loader.intentionalFail.set(true)
+      loadingCache.get(TEST_KEY)
+    }
+    val thread2Task: WorkerFunc = () => {
+      loadingCache.get(TEST_KEY)
+    }
+    testImpl(loadingCache, loader, thread1Task, thread2Task)
+  }
+
+  test("test LoadingCache mix usage of default loader and provided loader") {
+    // Intentionally mix usage of default loader and provided value loader.
+    val loader = new TestCacheLoader
+    val loadingCache: NonFateSharingLoadingCache[String, String] =
+      NonFateSharingCache(CacheBuilder.newBuilder.build(loader))
+    val thread1Task: WorkerFunc = () => {
+      loader.intentionalFail.set(true)
+      loadingCache.get(
+        TEST_KEY,
+        () => loader.load(TEST_KEY)
+      )
+    }
+    val thread2Task: WorkerFunc = () => {
+      loadingCache.get(TEST_KEY)
+    }
+    testImpl(loadingCache, loader, thread1Task, thread2Task)
+  }
+
+  test("test Cache") {
+    val loader = new TestCacheLoader
+    val cache = NonFateSharingCache(CacheBuilder.newBuilder.build[String, 
String])
+    val thread1Task: WorkerFunc = () => {
+      loader.intentionalFail.set(true)
+      cache.get(
+        TEST_KEY,
+        () => loader.load(TEST_KEY)
+      )
+    }
+    val thread2Task: WorkerFunc = () => {
+      cache.get(
+        TEST_KEY,
+        () => loader.load(TEST_KEY)
+      )
+    }
+    testImpl(cache, loader, thread1Task, thread2Task)
+  }
+
+  def testImpl(
+    cache: NonFateSharingCache[String, String],
+    loader: TestCacheLoader,
+    thread1Task: WorkerFunc,
+    thread2Task: WorkerFunc): Unit = {
+    val executor1 = ThreadUtils.newDaemonSingleThreadExecutor("test-executor1")
+    val executor2 = ThreadUtils.newDaemonSingleThreadExecutor("test-executor2")
+    val f1 = executor1.submit(new Runnable {
+      override def run(): Unit = {
+        thread1Task()

Review Comment:
   nit: can be a one-liner:
   ```scala
   override def run(): Unit = thread1Task()
   ```
   (but first see runnable interface comment below)



##########
core/src/test/scala/org/apache/spark/util/NonFateSharingCacheSuite.scala:
##########
@@ -0,0 +1,147 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.util;
+
+import java.util.concurrent.ExecutionException
+import java.util.concurrent.Semaphore
+import java.util.concurrent.atomic.AtomicReference
+
+import com.google.common.cache.CacheBuilder
+import com.google.common.cache.CacheLoader
+
+import org.apache.spark.SparkFunSuite
+
+object NonFateSharingCacheSuite {
+  private val TEST_KEY = "key"
+  private val FAIL_MESSAGE = "loading failed"
+  private val THREAD2_HOLDER = new AtomicReference[Thread](null)
+
+  class TestCacheLoader extends CacheLoader[String, String] {
+    var intentionalFail: ThreadLocal[Boolean] = ThreadLocal.withInitial(() => 
false)
+    var startLoading = new Semaphore(0)
+
+    def waitUntilThread2Waiting(): Unit = {
+      while (true) {
+        Thread.sleep(100)
+        val t2 = THREAD2_HOLDER.get()
+        if (t2 != null && t2.getState.equals(Thread.State.WAITING)) {
+          return
+        }
+      }
+    }
+
+    override def load(key: String): String = {
+      startLoading.release()
+      if (Thread.currentThread().getName.contains("test-executor1")) {
+        waitUntilThread2Waiting()
+      }
+      if (intentionalFail.get) throw new RuntimeException(FAIL_MESSAGE)
+      key
+    }
+  }
+}
+
+/**
+ * Test non-fate-sharing behavior
+ */
+class NonFateSharingCacheSuite extends SparkFunSuite {
+
+  type WorkerFunc = () => Unit
+
+  import NonFateSharingCacheSuite._
+
+  test("test LoadingCache") {
+    val loader = new TestCacheLoader
+    val loadingCache: NonFateSharingLoadingCache[String, String] =
+      NonFateSharingCache(CacheBuilder.newBuilder.build(loader))
+    val thread1Task: WorkerFunc = () => {
+      loader.intentionalFail.set(true)
+      loadingCache.get(TEST_KEY)
+    }
+    val thread2Task: WorkerFunc = () => {
+      loadingCache.get(TEST_KEY)
+    }
+    testImpl(loadingCache, loader, thread1Task, thread2Task)
+  }
+
+  test("test LoadingCache mix usage of default loader and provided loader") {
+    // Intentionally mix usage of default loader and provided value loader.
+    val loader = new TestCacheLoader
+    val loadingCache: NonFateSharingLoadingCache[String, String] =
+      NonFateSharingCache(CacheBuilder.newBuilder.build(loader))
+    val thread1Task: WorkerFunc = () => {
+      loader.intentionalFail.set(true)
+      loadingCache.get(
+        TEST_KEY,
+        () => loader.load(TEST_KEY)
+      )
+    }
+    val thread2Task: WorkerFunc = () => {
+      loadingCache.get(TEST_KEY)
+    }
+    testImpl(loadingCache, loader, thread1Task, thread2Task)
+  }
+
+  test("test Cache") {
+    val loader = new TestCacheLoader
+    val cache = NonFateSharingCache(CacheBuilder.newBuilder.build[String, 
String])
+    val thread1Task: WorkerFunc = () => {
+      loader.intentionalFail.set(true)
+      cache.get(
+        TEST_KEY,
+        () => loader.load(TEST_KEY)
+      )
+    }
+    val thread2Task: WorkerFunc = () => {
+      cache.get(
+        TEST_KEY,
+        () => loader.load(TEST_KEY)
+      )
+    }
+    testImpl(cache, loader, thread1Task, thread2Task)
+  }
+
+  def testImpl(
+    cache: NonFateSharingCache[String, String],
+    loader: TestCacheLoader,
+    thread1Task: WorkerFunc,
+    thread2Task: WorkerFunc): Unit = {

Review Comment:
   nit: indent these 4 spaces?



##########
core/src/test/scala/org/apache/spark/util/NonFateSharingCacheSuite.scala:
##########
@@ -0,0 +1,147 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.util;
+
+import java.util.concurrent.ExecutionException
+import java.util.concurrent.Semaphore
+import java.util.concurrent.atomic.AtomicReference
+
+import com.google.common.cache.CacheBuilder
+import com.google.common.cache.CacheLoader
+
+import org.apache.spark.SparkFunSuite
+
+object NonFateSharingCacheSuite {
+  private val TEST_KEY = "key"
+  private val FAIL_MESSAGE = "loading failed"
+  private val THREAD2_HOLDER = new AtomicReference[Thread](null)
+
+  class TestCacheLoader extends CacheLoader[String, String] {
+    var intentionalFail: ThreadLocal[Boolean] = ThreadLocal.withInitial(() => 
false)
+    var startLoading = new Semaphore(0)
+
+    def waitUntilThread2Waiting(): Unit = {
+      while (true) {
+        Thread.sleep(100)
+        val t2 = THREAD2_HOLDER.get()
+        if (t2 != null && t2.getState.equals(Thread.State.WAITING)) {
+          return
+        }
+      }
+    }
+
+    override def load(key: String): String = {
+      startLoading.release()
+      if (Thread.currentThread().getName.contains("test-executor1")) {
+        waitUntilThread2Waiting()
+      }
+      if (intentionalFail.get) throw new RuntimeException(FAIL_MESSAGE)
+      key
+    }
+  }
+}
+
+/**
+ * Test non-fate-sharing behavior
+ */
+class NonFateSharingCacheSuite extends SparkFunSuite {
+
+  type WorkerFunc = () => Unit
+
+  import NonFateSharingCacheSuite._
+
+  test("test LoadingCache") {
+    val loader = new TestCacheLoader
+    val loadingCache: NonFateSharingLoadingCache[String, String] =
+      NonFateSharingCache(CacheBuilder.newBuilder.build(loader))
+    val thread1Task: WorkerFunc = () => {
+      loader.intentionalFail.set(true)
+      loadingCache.get(TEST_KEY)
+    }
+    val thread2Task: WorkerFunc = () => {
+      loadingCache.get(TEST_KEY)
+    }
+    testImpl(loadingCache, loader, thread1Task, thread2Task)
+  }
+
+  test("test LoadingCache mix usage of default loader and provided loader") {
+    // Intentionally mix usage of default loader and provided value loader.
+    val loader = new TestCacheLoader
+    val loadingCache: NonFateSharingLoadingCache[String, String] =
+      NonFateSharingCache(CacheBuilder.newBuilder.build(loader))
+    val thread1Task: WorkerFunc = () => {
+      loader.intentionalFail.set(true)
+      loadingCache.get(
+        TEST_KEY,
+        () => loader.load(TEST_KEY)

Review Comment:
   nit: any particular reason for the line breaks?
   ```scala
   loadingCache.get(TEST_KEY, () => loader.load(TEST_KEY))
   ```



##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala:
##########
@@ -1577,7 +1577,7 @@ object CodeGenerator extends Logging {
    * automatically, in order to constrain its memory footprint.  Note that 
this cache does not use
    * weak keys/values and thus does not respond to memory pressure.
    */
-  private val cache = CacheBuilder.newBuilder()
+  private val cache = NonFateSharingCache(CacheBuilder.newBuilder()

Review Comment:
   I don't see any comment explaining why codegen needs a non-fate-sharing 
cache. What bug does it fix?



##########
core/src/main/scala/org/apache/spark/util/NonFateSharingCache.scala:
##########
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.util
+
+import java.util.concurrent.Callable
+
+import com.google.common.cache.Cache
+import com.google.common.cache.LoadingCache
+
+/**
+ * SPARK-43300: Guava cache fate-sharing behavior might lead to unexpected 
cascade failure:
+ * when multiple threads access the same key in the cache at the same time 
when the key is not in
+ * the cache, Guava cache will block all requests and load the data only once. 
If the loading fails,
+ * all requests will fail immediately without retry. Therefore individual 
failure will also fail
+ * other irrelevant queries who are waiting for the same key.
+ *
+ * This util create a delegation Cache with KeyLock to synchronize threads 
looking for the same key
+ * so that they should run individually and fail as if they had arrived one at 
a time.
+ *
+ * Instead of implementing Guava Cache and LoadingCache interface, we defined 
our own narrower APIs
+ * so that we can control at compile time what cache operations are allowed. 
Feel free to add new
+ * APIs when needed.

Review Comment:
   I don't think the "feel free to..." comment is especially helpful... can 
probably just remove it.



##########
core/src/main/scala/org/apache/spark/util/NonFateSharingCache.scala:
##########
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.util
+
+import java.util.concurrent.Callable
+
+import com.google.common.cache.Cache
+import com.google.common.cache.LoadingCache
+
+/**
+ * SPARK-43300: Guava cache fate-sharing behavior might lead to unexpected 
cascade failure:
+ * when multiple threads access the same key in the cache at the same time 
when the key is not in
+ * the cache, Guava cache will block all requests and load the data only once. 
If the loading fails,
+ * all requests will fail immediately without retry. Therefore individual 
failure will also fail
+ * other irrelevant queries who are waiting for the same key.
+ *
+ * This util create a delegation Cache with KeyLock to synchronize threads 
looking for the same key
+ * so that they should run individually and fail as if they had arrived one at 
a time.
+ *
+ * Instead of implementing Guava Cache and LoadingCache interface, we defined 
our own narrower APIs
+ * so that we can control at compile time what cache operations are allowed. 
Feel free to add new
+ * APIs when needed.
+ */
+object NonFateSharingCache {
+  def apply[K, V](cache: Cache[K, V]): NonFateSharingCache[K, V] = cache match 
{

Review Comment:
   Worth a quick doc comment to explain that this method will return a 
`NonFateSharingLoadingCache` if the user happens to pass a `LoadingCache`, as a 
courtesy to the user?



##########
core/src/main/scala/org/apache/spark/util/NonFateSharingCache.scala:
##########
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.util
+
+import java.util.concurrent.Callable
+
+import com.google.common.cache.Cache
+import com.google.common.cache.LoadingCache
+
+/**
+ * SPARK-43300: Guava cache fate-sharing behavior might lead to unexpected 
cascade failure:
+ * when multiple threads access the same key in the cache at the same time 
when the key is not in
+ * the cache, Guava cache will block all requests and load the data only once. 
If the loading fails,
+ * all requests will fail immediately without retry. Therefore individual 
failure will also fail
+ * other irrelevant queries who are waiting for the same key.
+ *
+ * This util create a delegation Cache with KeyLock to synchronize threads 
looking for the same key
+ * so that they should run individually and fail as if they had arrived one at 
a time.
+ *
+ * Instead of implementing Guava Cache and LoadingCache interface, we defined 
our own narrower APIs
+ * so that we can control at compile time what cache operations are allowed. 
Feel free to add new
+ * APIs when needed.

Review Comment:
   Meanwhile, it would probably be helpful to explain _why_ we 
don't/can't/won't expose the full Guava cache API?



##########
core/src/test/scala/org/apache/spark/util/NonFateSharingCacheSuite.scala:
##########
@@ -0,0 +1,147 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.util;
+
+import java.util.concurrent.ExecutionException
+import java.util.concurrent.Semaphore
+import java.util.concurrent.atomic.AtomicReference
+
+import com.google.common.cache.CacheBuilder
+import com.google.common.cache.CacheLoader
+
+import org.apache.spark.SparkFunSuite
+
+object NonFateSharingCacheSuite {
+  private val TEST_KEY = "key"
+  private val FAIL_MESSAGE = "loading failed"
+  private val THREAD2_HOLDER = new AtomicReference[Thread](null)
+
+  class TestCacheLoader extends CacheLoader[String, String] {
+    var intentionalFail: ThreadLocal[Boolean] = ThreadLocal.withInitial(() => 
false)
+    var startLoading = new Semaphore(0)
+
+    def waitUntilThread2Waiting(): Unit = {
+      while (true) {
+        Thread.sleep(100)
+        val t2 = THREAD2_HOLDER.get()
+        if (t2 != null && t2.getState.equals(Thread.State.WAITING)) {
+          return
+        }
+      }
+    }
+
+    override def load(key: String): String = {
+      startLoading.release()
+      if (Thread.currentThread().getName.contains("test-executor1")) {
+        waitUntilThread2Waiting()
+      }
+      if (intentionalFail.get) throw new RuntimeException(FAIL_MESSAGE)
+      key
+    }
+  }
+}
+
+/**
+ * Test non-fate-sharing behavior
+ */
+class NonFateSharingCacheSuite extends SparkFunSuite {
+
+  type WorkerFunc = () => Unit
+
+  import NonFateSharingCacheSuite._
+
+  test("test LoadingCache") {
+    val loader = new TestCacheLoader
+    val loadingCache: NonFateSharingLoadingCache[String, String] =
+      NonFateSharingCache(CacheBuilder.newBuilder.build(loader))
+    val thread1Task: WorkerFunc = () => {
+      loader.intentionalFail.set(true)
+      loadingCache.get(TEST_KEY)
+    }
+    val thread2Task: WorkerFunc = () => {
+      loadingCache.get(TEST_KEY)
+    }
+    testImpl(loadingCache, loader, thread1Task, thread2Task)
+  }
+
+  test("test LoadingCache mix usage of default loader and provided loader") {
+    // Intentionally mix usage of default loader and provided value loader.
+    val loader = new TestCacheLoader
+    val loadingCache: NonFateSharingLoadingCache[String, String] =
+      NonFateSharingCache(CacheBuilder.newBuilder.build(loader))
+    val thread1Task: WorkerFunc = () => {
+      loader.intentionalFail.set(true)
+      loadingCache.get(
+        TEST_KEY,
+        () => loader.load(TEST_KEY)
+      )
+    }
+    val thread2Task: WorkerFunc = () => {
+      loadingCache.get(TEST_KEY)
+    }
+    testImpl(loadingCache, loader, thread1Task, thread2Task)
+  }
+
+  test("test Cache") {
+    val loader = new TestCacheLoader
+    val cache = NonFateSharingCache(CacheBuilder.newBuilder.build[String, 
String])
+    val thread1Task: WorkerFunc = () => {
+      loader.intentionalFail.set(true)
+      cache.get(
+        TEST_KEY,
+        () => loader.load(TEST_KEY)
+      )
+    }
+    val thread2Task: WorkerFunc = () => {
+      cache.get(
+        TEST_KEY,
+        () => loader.load(TEST_KEY)
+      )
+    }
+    testImpl(cache, loader, thread1Task, thread2Task)
+  }
+
+  def testImpl(
+    cache: NonFateSharingCache[String, String],
+    loader: TestCacheLoader,
+    thread1Task: WorkerFunc,
+    thread2Task: WorkerFunc): Unit = {
+    val executor1 = ThreadUtils.newDaemonSingleThreadExecutor("test-executor1")
+    val executor2 = ThreadUtils.newDaemonSingleThreadExecutor("test-executor2")
+    val f1 = executor1.submit(new Runnable {
+      override def run(): Unit = {
+        thread1Task()
+      }
+    })
+    val f2 = executor2.submit(new Runnable {
+      override def run(): Unit = {
+        loader.startLoading.acquire() // wait until thread1 start loading
+        THREAD2_HOLDER.set(Thread.currentThread())
+        thread2Task()
+      }
+    })

Review Comment:
   Double check, but I'm pretty sure that this can be simplified, because 
`Runnable` is a functional interface and so a lambda can be passed directly:
   ```scala
   val f1 = executor1.submit(() => thread1Task())
   val f2 = executor2.submit { () =>
     ...
   }
   ```
   



##########
core/src/test/scala/org/apache/spark/util/NonFateSharingCacheSuite.scala:
##########
@@ -0,0 +1,147 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.util;
+
+import java.util.concurrent.ExecutionException
+import java.util.concurrent.Semaphore
+import java.util.concurrent.atomic.AtomicReference
+
+import com.google.common.cache.CacheBuilder
+import com.google.common.cache.CacheLoader
+
+import org.apache.spark.SparkFunSuite
+
+object NonFateSharingCacheSuite {
+  private val TEST_KEY = "key"
+  private val FAIL_MESSAGE = "loading failed"
+  private val THREAD2_HOLDER = new AtomicReference[Thread](null)
+
+  class TestCacheLoader extends CacheLoader[String, String] {
+    var intentionalFail: ThreadLocal[Boolean] = ThreadLocal.withInitial(() => 
false)
+    var startLoading = new Semaphore(0)
+
+    def waitUntilThread2Waiting(): Unit = {
+      while (true) {
+        Thread.sleep(100)
+        val t2 = THREAD2_HOLDER.get()
+        if (t2 != null && t2.getState.equals(Thread.State.WAITING)) {
+          return
+        }
+      }
+    }
+
+    override def load(key: String): String = {
+      startLoading.release()
+      if (Thread.currentThread().getName.contains("test-executor1")) {
+        waitUntilThread2Waiting()
+      }
+      if (intentionalFail.get) throw new RuntimeException(FAIL_MESSAGE)
+      key
+    }
+  }
+}
+
+/**
+ * Test non-fate-sharing behavior
+ */
+class NonFateSharingCacheSuite extends SparkFunSuite {
+
+  type WorkerFunc = () => Unit
+
+  import NonFateSharingCacheSuite._
+
+  test("test LoadingCache") {
+    val loader = new TestCacheLoader
+    val loadingCache: NonFateSharingLoadingCache[String, String] =
+      NonFateSharingCache(CacheBuilder.newBuilder.build(loader))
+    val thread1Task: WorkerFunc = () => {
+      loader.intentionalFail.set(true)
+      loadingCache.get(TEST_KEY)
+    }
+    val thread2Task: WorkerFunc = () => {
+      loadingCache.get(TEST_KEY)
+    }
+    testImpl(loadingCache, loader, thread1Task, thread2Task)
+  }
+
+  test("test LoadingCache mix usage of default loader and provided loader") {
+    // Intentionally mix usage of default loader and provided value loader.
+    val loader = new TestCacheLoader
+    val loadingCache: NonFateSharingLoadingCache[String, String] =
+      NonFateSharingCache(CacheBuilder.newBuilder.build(loader))
+    val thread1Task: WorkerFunc = () => {
+      loader.intentionalFail.set(true)
+      loadingCache.get(
+        TEST_KEY,
+        () => loader.load(TEST_KEY)
+      )
+    }
+    val thread2Task: WorkerFunc = () => {
+      loadingCache.get(TEST_KEY)
+    }
+    testImpl(loadingCache, loader, thread1Task, thread2Task)
+  }
+
+  test("test Cache") {

Review Comment:
   Can we use more descriptive names for test cases? 
   Ideally it explains what behavior the test actually verifies.



##########
core/src/test/scala/org/apache/spark/util/NonFateSharingCacheSuite.scala:
##########
@@ -0,0 +1,147 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.util;
+
+import java.util.concurrent.ExecutionException
+import java.util.concurrent.Semaphore
+import java.util.concurrent.atomic.AtomicReference
+
+import com.google.common.cache.CacheBuilder
+import com.google.common.cache.CacheLoader
+
+import org.apache.spark.SparkFunSuite
+
+object NonFateSharingCacheSuite {
+  private val TEST_KEY = "key"
+  private val FAIL_MESSAGE = "loading failed"
+  private val THREAD2_HOLDER = new AtomicReference[Thread](null)
+
+  class TestCacheLoader extends CacheLoader[String, String] {
+    var intentionalFail: ThreadLocal[Boolean] = ThreadLocal.withInitial(() => 
false)
+    var startLoading = new Semaphore(0)
+
+    def waitUntilThread2Waiting(): Unit = {
+      while (true) {
+        Thread.sleep(100)
+        val t2 = THREAD2_HOLDER.get()
+        if (t2 != null && t2.getState.equals(Thread.State.WAITING)) {

Review Comment:
   nit: use Option instead of null check?
   ```scala
   if (Option(THREAD2_HOLDER.get()).exists(_.getState.equals(Thread.State.WAITING))) {
   ```



##########
core/src/main/scala/org/apache/spark/util/NonFateSharingCache.scala:
##########
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.util
+
+import java.util.concurrent.Callable
+
+import com.google.common.cache.Cache
+import com.google.common.cache.LoadingCache
+
+/**
+ * SPARK-43300: Guava cache fate-sharing behavior might lead to unexpected 
cascade failure:
+ * when multiple threads access the same key in the cache at the same time 
when the key is not in
+ * the cache, Guava cache will block all requests and load the data only once. 
If the loading fails,
+ * all requests will fail immediately without retry. Therefore individual 
failure will also fail
+ * other irrelevant queries who are waiting for the same key.
+ *
+ * This util create a delegation Cache with KeyLock to synchronize threads 
looking for the same key
+ * so that they should run individually and fail as if they had arrived one at 
a time.
+ *
+ * Instead of implementing Guava Cache and LoadingCache interface, we defined 
our own narrower APIs

Review Comment:
   ```suggestion
    * Instead of implementing Guava Cache and LoadingCache interface, we expose 
a subset of APIs
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to