This is an automated email from the ASF dual-hosted git repository.

hongze pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-gluten.git


The following commit(s) were added to refs/heads/main by this push:
     new 0e9aba4a68 [CORE] Use component file to discover components (#8271)
0e9aba4a68 is described below

commit 0e9aba4a685ae041c46ca72fe01da045cacd8b44
Author: Hongze Zhang <[email protected]>
AuthorDate: Thu Dec 19 13:13:41 2024 +0800

    [CORE] Use component file to discover components (#8271)
---
 .../org.apache.gluten.component.CHIcebergComponent |   0
 .../services/org.apache.gluten.component.Component |   1 -
 ....apache.gluten.backendsapi.clickhouse.CHBackend |   0
 .../services/org.apache.gluten.backend.Backend     |   1 -
 ...g.apache.gluten.component.VeloxIcebergComponent |   0
 .../services/org.apache.gluten.component.Component |   1 -
 ...rg.apache.gluten.backendsapi.velox.VeloxBackend |   0
 .../services/org.apache.gluten.backend.Backend     |   1 -
 .../java/org/apache/gluten/utils/ResourceUtil.java | 113 +++++++++++++++++++++
 .../org/apache/gluten/component/Discovery.scala    |  86 ++++++++++++++++
 .../org/apache/gluten/component/package.scala      |   9 +-
 .../gluten/backendsapi/BackendsApiManager.scala    |  12 +--
 .../org/apache/gluten/integration/BaseMixin.java   |   3 +
 13 files changed, 209 insertions(+), 18 deletions(-)

diff --git 
a/backends-clickhouse/src-iceberg/main/resources/META-INF/gluten-components/org.apache.gluten.component.CHIcebergComponent
 
b/backends-clickhouse/src-iceberg/main/resources/META-INF/gluten-components/org.apache.gluten.component.CHIcebergComponent
new file mode 100644
index 0000000000..e69de29bb2
diff --git 
a/backends-clickhouse/src-iceberg/main/resources/META-INF/services/org.apache.gluten.component.Component
 
b/backends-clickhouse/src-iceberg/main/resources/META-INF/services/org.apache.gluten.component.Component
deleted file mode 100644
index a13f6fa739..0000000000
--- 
a/backends-clickhouse/src-iceberg/main/resources/META-INF/services/org.apache.gluten.component.Component
+++ /dev/null
@@ -1 +0,0 @@
-org.apache.gluten.component.CHIcebergComponent
diff --git 
a/backends-clickhouse/src/main/resources/META-INF/gluten-components/org.apache.gluten.backendsapi.clickhouse.CHBackend
 
b/backends-clickhouse/src/main/resources/META-INF/gluten-components/org.apache.gluten.backendsapi.clickhouse.CHBackend
new file mode 100644
index 0000000000..e69de29bb2
diff --git 
a/backends-clickhouse/src/main/resources/META-INF/services/org.apache.gluten.backend.Backend
 
b/backends-clickhouse/src/main/resources/META-INF/services/org.apache.gluten.backend.Backend
deleted file mode 100644
index bcd3cb1c03..0000000000
--- 
a/backends-clickhouse/src/main/resources/META-INF/services/org.apache.gluten.backend.Backend
+++ /dev/null
@@ -1 +0,0 @@
-org.apache.gluten.backendsapi.clickhouse.CHBackend
diff --git 
a/backends-velox/src-iceberg/main/resources/META-INF/gluten-components/org.apache.gluten.component.VeloxIcebergComponent
 
b/backends-velox/src-iceberg/main/resources/META-INF/gluten-components/org.apache.gluten.component.VeloxIcebergComponent
new file mode 100644
index 0000000000..e69de29bb2
diff --git 
a/backends-velox/src-iceberg/main/resources/META-INF/services/org.apache.gluten.component.Component
 
b/backends-velox/src-iceberg/main/resources/META-INF/services/org.apache.gluten.component.Component
deleted file mode 100644
index e9e844c6bb..0000000000
--- 
a/backends-velox/src-iceberg/main/resources/META-INF/services/org.apache.gluten.component.Component
+++ /dev/null
@@ -1 +0,0 @@
-org.apache.gluten.component.VeloxIcebergComponent
diff --git 
a/backends-velox/src/main/resources/META-INF/gluten-components/org.apache.gluten.backendsapi.velox.VeloxBackend
 
b/backends-velox/src/main/resources/META-INF/gluten-components/org.apache.gluten.backendsapi.velox.VeloxBackend
new file mode 100644
index 0000000000..e69de29bb2
diff --git 
a/backends-velox/src/main/resources/META-INF/services/org.apache.gluten.backend.Backend
 
b/backends-velox/src/main/resources/META-INF/services/org.apache.gluten.backend.Backend
deleted file mode 100644
index 7cc9b39591..0000000000
--- 
a/backends-velox/src/main/resources/META-INF/services/org.apache.gluten.backend.Backend
+++ /dev/null
@@ -1 +0,0 @@
-org.apache.gluten.backendsapi.velox.VeloxBackend
diff --git 
a/gluten-core/src/main/java/org/apache/gluten/utils/ResourceUtil.java 
b/gluten-core/src/main/java/org/apache/gluten/utils/ResourceUtil.java
new file mode 100644
index 0000000000..692a91af26
--- /dev/null
+++ b/gluten-core/src/main/java/org/apache/gluten/utils/ResourceUtil.java
@@ -0,0 +1,113 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.gluten.utils;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Enumeration;
+import java.util.List;
+import java.util.regex.Pattern;
+import java.util.zip.ZipEntry;
+import java.util.zip.ZipException;
+import java.util.zip.ZipFile;
+
+/**
+ * Code is copied from <a
+ * href="https://stackoverflow.com/questions/3923129/get-a-list-of-resources-from-classpath-directory">here</a>
+ * and then modified for Gluten's use.
+ */
+public class ResourceUtil {
+
+  private static final Logger LOG = 
LoggerFactory.getLogger(ResourceUtil.class);
+
+  /**
+   * Get a collection of resource paths by the input RegEx pattern.
+   *
+   * @param pattern The pattern to match.
+   * @return The relative resource paths in the order they are found.
+   */
+  public static List<String> getResources(final Pattern pattern) {
+    final List<String> buffer = new ArrayList<>();
+    final String classPath = System.getProperty("java.class.path");
+    final String[] classPathElements = classPath.split(File.pathSeparator);
+    for (final String element : classPathElements) {
+      getResources(element, pattern, buffer);
+    }
+    return Collections.unmodifiableList(buffer);
+  }
+
+  private static void getResources(
+      final String element, final Pattern pattern, final List<String> buffer) {
+    final File file = new File(element);
+    if (!file.exists()) {
+      LOG.info("Skip non-existing classpath: {}", element);
+      return;
+    }
+    if (file.isDirectory()) {
+      getResourcesFromDirectory(file, file, pattern, buffer);
+    } else {
+      getResourcesFromJarFile(file, pattern, buffer);
+    }
+  }
+
+  private static void getResourcesFromJarFile(
+      final File file, final Pattern pattern, final List<String> buffer) {
+    ZipFile zf;
+    try {
+      zf = new ZipFile(file);
+    } catch (final ZipException e) {
+      throw new RuntimeException(e);
+    } catch (final IOException e) {
+      throw new RuntimeException(e);
+    }
+    final Enumeration e = zf.entries();
+    while (e.hasMoreElements()) {
+      final ZipEntry ze = (ZipEntry) e.nextElement();
+      final String fileName = ze.getName();
+      final boolean accept = pattern.matcher(fileName).matches();
+      if (accept) {
+        buffer.add(fileName);
+      }
+    }
+    try {
+      zf.close();
+    } catch (final IOException e1) {
+      throw new RuntimeException(e1);
+    }
+  }
+
+  private static void getResourcesFromDirectory(
+      final File root, final File directory, final Pattern pattern, final 
List<String> buffer) {
+    final File[] fileList = directory.listFiles();
+    for (final File file : fileList) {
+      if (file.isDirectory()) {
+        getResourcesFromDirectory(root, file, pattern, buffer);
+      } else {
+        final String relative = 
root.toURI().relativize(file.toURI()).getPath();
+        final boolean accept = pattern.matcher(relative).matches();
+        if (accept) {
+          buffer.add(relative);
+        }
+      }
+    }
+  }
+}
diff --git 
a/gluten-core/src/main/scala/org/apache/gluten/component/Discovery.scala 
b/gluten-core/src/main/scala/org/apache/gluten/component/Discovery.scala
new file mode 100644
index 0000000000..2b8f060a69
--- /dev/null
+++ b/gluten-core/src/main/scala/org/apache/gluten/component/Discovery.scala
@@ -0,0 +1,86 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.gluten.component
+
+import org.apache.gluten.exception.GlutenException
+import org.apache.gluten.utils.ResourceUtil
+
+import org.apache.spark.internal.Logging
+import org.apache.spark.util.SparkReflectionUtil
+
+import scala.collection.JavaConverters._
+import scala.collection.mutable
+import scala.util.matching.Regex
+
+
+
+
+// format: off
+/**
+ * Gluten's global discovery to find all [[Component]] definitions in the classpath.
+ *
+ * We don't use [[java.util.ServiceLoader]] since it requires all the service files to have
+ * the same file name which is the class name of [[Component]], this causes the service files
+ * easily be overwritten by each other during Maven build. Typically, See code of
+ * `DefaultMavenFileFilter` used by Maven's `maven-resources-plugin`.
+ *
+ * Instead, Gluten defines its own way to register components. For example, placing the following
+ * component files to resource folder:
+ *
+ *  META-INF
+ *  \- gluten-components
+ *     |- org.apache.gluten.component.AComponent
+ *     \- org.apache.gluten.backend.BBackend
+ *
+ * Will cause the registration of component `AComponent` and backend `BBackend`.
+ *
+ * The content in a component file is not read so doesn't matter at the moment.
+ */
+// format: on
+private object Discovery extends Logging {
+  private val container: String = "META-INF/gluten-components"
+  private val componentFilePattern: Regex = s"^$container/(.+)$$".r
+
+  def discoverAll(): Seq[Component] = {
+    logInfo("Start discovering components in the current classpath... ")
+    val prev = System.currentTimeMillis()
+    val allFiles = 
ResourceUtil.getResources(componentFilePattern.pattern).asScala
+    val duration = System.currentTimeMillis() - prev
+    logInfo(s"Discovered component files: ${allFiles.mkString(", ")}. 
Duration: $duration ms.")
+    val deDup = mutable.Set[String]()
+    val out = allFiles.flatMap {
+      case componentFilePattern(className) =>
+        if (!deDup.add(className)) {
+          logWarning(s"Found duplicated component class $className in then 
classpath, ignoring.")
+          None
+        } else {
+          val clazz =
+            try {
+              SparkReflectionUtil.classForName(className)
+            } catch {
+              case e: ClassNotFoundException =>
+                throw new GlutenException(s"Component class not found: 
$className", e)
+            }
+          val instance = 
clazz.getDeclaredConstructor().newInstance().asInstanceOf[Component]
+          Some(instance)
+        }
+      case _ => None
+    }.toSeq
+    out
+  }
+}
diff --git 
a/gluten-core/src/main/scala/org/apache/gluten/component/package.scala 
b/gluten-core/src/main/scala/org/apache/gluten/component/package.scala
index f74b967294..032a32d041 100644
--- a/gluten-core/src/main/scala/org/apache/gluten/component/package.scala
+++ b/gluten-core/src/main/scala/org/apache/gluten/component/package.scala
@@ -16,15 +16,10 @@
  */
 package org.apache.gluten
 
-import org.apache.gluten.backend.Backend
-
 import org.apache.spark.internal.Logging
 
-import java.util.ServiceLoader
 import java.util.concurrent.atomic.AtomicBoolean
 
-import scala.collection.JavaConverters._
-
 package object component extends Logging {
   private val allComponentsLoaded: AtomicBoolean = new AtomicBoolean(false)
 
@@ -34,9 +29,7 @@ package object component extends Logging {
     }
 
     // Load all components in classpath.
-    val discoveredBackends = ServiceLoader.load(classOf[Backend]).asScala
-    val discoveredComponents = ServiceLoader.load(classOf[Component]).asScala
-    val all = discoveredBackends ++ discoveredComponents
+    val all = Discovery.discoverAll()
 
     // Register all components.
     all.foreach(_.ensureRegistered())
diff --git 
a/gluten-substrait/src/main/scala/org/apache/gluten/backendsapi/BackendsApiManager.scala
 
b/gluten-substrait/src/main/scala/org/apache/gluten/backendsapi/BackendsApiManager.scala
index 3b4e97afb3..4b6f674905 100644
--- 
a/gluten-substrait/src/main/scala/org/apache/gluten/backendsapi/BackendsApiManager.scala
+++ 
b/gluten-substrait/src/main/scala/org/apache/gluten/backendsapi/BackendsApiManager.scala
@@ -21,17 +21,17 @@ import org.apache.gluten.component.Component
 object BackendsApiManager {
   private lazy val backend: SubstraitBackend = initializeInternal()
 
-  /** Initialize all backends api. */
+  /** Initialize all backends apis. */
   private def initializeInternal(): SubstraitBackend = {
     val loadedSubstraitBackends = 
Component.sorted().filter(_.isInstanceOf[SubstraitBackend])
-    assert(loadedSubstraitBackends.size == 1, "More than one Substrait 
backends are loaded")
+    assert(
+      loadedSubstraitBackends.size == 1,
+      s"Zero or more than one Substrait backends are loaded: " +
+        s"${loadedSubstraitBackends.map(_.name()).mkString(", ")}")
     loadedSubstraitBackends.head.asInstanceOf[SubstraitBackend]
   }
 
-  /**
-   * Automatically detect the backend api.
-   * @return
-   */
+  /** Automatically detect the backend api. */
   def initialize(): String = {
     getBackendName
   }
diff --git 
a/tools/gluten-it/common/src/main/java/org/apache/gluten/integration/BaseMixin.java
 
b/tools/gluten-it/common/src/main/java/org/apache/gluten/integration/BaseMixin.java
index 08c55d78a6..b369fffd74 100644
--- 
a/tools/gluten-it/common/src/main/java/org/apache/gluten/integration/BaseMixin.java
+++ 
b/tools/gluten-it/common/src/main/java/org/apache/gluten/integration/BaseMixin.java
@@ -22,6 +22,7 @@ import org.apache.gluten.integration.command.SparkRunModes;
 import org.apache.gluten.integration.ds.TpcdsSuite;
 import org.apache.gluten.integration.h.TpchSuite;
 import org.apache.log4j.Level;
+import org.apache.log4j.LogManager;
 import org.apache.spark.SparkConf;
 import picocli.CommandLine;
 import scala.Predef;
@@ -120,6 +121,8 @@ public class BaseMixin {
         throw new IllegalArgumentException("Log level not found: " + logLevel);
     }
 
+    LogManager.getRootLogger().setLevel(level);
+
     scala.collection.immutable.Map<String, String> extraSparkConfScala =
         JavaConverters.mapAsScalaMapConverter(
             mergeMapSafe(extraSparkConf, 
runModeEnumeration.extraSparkConf())).asScala().toMap(


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to