This is an automated email from the ASF dual-hosted git repository.
hongze pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-gluten.git
The following commit(s) were added to refs/heads/main by this push:
new 0e9aba4a68 [CORE] Use component file to discover components (#8271)
0e9aba4a68 is described below
commit 0e9aba4a685ae041c46ca72fe01da045cacd8b44
Author: Hongze Zhang <[email protected]>
AuthorDate: Thu Dec 19 13:13:41 2024 +0800
[CORE] Use component file to discover components (#8271)
---
.../org.apache.gluten.component.CHIcebergComponent | 0
.../services/org.apache.gluten.component.Component | 1 -
....apache.gluten.backendsapi.clickhouse.CHBackend | 0
.../services/org.apache.gluten.backend.Backend | 1 -
...g.apache.gluten.component.VeloxIcebergComponent | 0
.../services/org.apache.gluten.component.Component | 1 -
...rg.apache.gluten.backendsapi.velox.VeloxBackend | 0
.../services/org.apache.gluten.backend.Backend | 1 -
.../java/org/apache/gluten/utils/ResourceUtil.java | 113 +++++++++++++++++++++
.../org/apache/gluten/component/Discovery.scala | 86 ++++++++++++++++
.../org/apache/gluten/component/package.scala | 9 +-
.../gluten/backendsapi/BackendsApiManager.scala | 12 +--
.../org/apache/gluten/integration/BaseMixin.java | 3 +
13 files changed, 209 insertions(+), 18 deletions(-)
diff --git a/backends-clickhouse/src-iceberg/main/resources/META-INF/gluten-components/org.apache.gluten.component.CHIcebergComponent b/backends-clickhouse/src-iceberg/main/resources/META-INF/gluten-components/org.apache.gluten.component.CHIcebergComponent
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/backends-clickhouse/src-iceberg/main/resources/META-INF/services/org.apache.gluten.component.Component b/backends-clickhouse/src-iceberg/main/resources/META-INF/services/org.apache.gluten.component.Component
deleted file mode 100644
index a13f6fa739..0000000000
--- a/backends-clickhouse/src-iceberg/main/resources/META-INF/services/org.apache.gluten.component.Component
+++ /dev/null
@@ -1 +0,0 @@
-org.apache.gluten.component.CHIcebergComponent
diff --git a/backends-clickhouse/src/main/resources/META-INF/gluten-components/org.apache.gluten.backendsapi.clickhouse.CHBackend b/backends-clickhouse/src/main/resources/META-INF/gluten-components/org.apache.gluten.backendsapi.clickhouse.CHBackend
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/backends-clickhouse/src/main/resources/META-INF/services/org.apache.gluten.backend.Backend b/backends-clickhouse/src/main/resources/META-INF/services/org.apache.gluten.backend.Backend
deleted file mode 100644
index bcd3cb1c03..0000000000
--- a/backends-clickhouse/src/main/resources/META-INF/services/org.apache.gluten.backend.Backend
+++ /dev/null
@@ -1 +0,0 @@
-org.apache.gluten.backendsapi.clickhouse.CHBackend
diff --git a/backends-velox/src-iceberg/main/resources/META-INF/gluten-components/org.apache.gluten.component.VeloxIcebergComponent b/backends-velox/src-iceberg/main/resources/META-INF/gluten-components/org.apache.gluten.component.VeloxIcebergComponent
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/backends-velox/src-iceberg/main/resources/META-INF/services/org.apache.gluten.component.Component b/backends-velox/src-iceberg/main/resources/META-INF/services/org.apache.gluten.component.Component
deleted file mode 100644
index e9e844c6bb..0000000000
--- a/backends-velox/src-iceberg/main/resources/META-INF/services/org.apache.gluten.component.Component
+++ /dev/null
@@ -1 +0,0 @@
-org.apache.gluten.component.VeloxIcebergComponent
diff --git a/backends-velox/src/main/resources/META-INF/gluten-components/org.apache.gluten.backendsapi.velox.VeloxBackend b/backends-velox/src/main/resources/META-INF/gluten-components/org.apache.gluten.backendsapi.velox.VeloxBackend
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/backends-velox/src/main/resources/META-INF/services/org.apache.gluten.backend.Backend b/backends-velox/src/main/resources/META-INF/services/org.apache.gluten.backend.Backend
deleted file mode 100644
index 7cc9b39591..0000000000
--- a/backends-velox/src/main/resources/META-INF/services/org.apache.gluten.backend.Backend
+++ /dev/null
@@ -1 +0,0 @@
-org.apache.gluten.backendsapi.velox.VeloxBackend
diff --git a/gluten-core/src/main/java/org/apache/gluten/utils/ResourceUtil.java b/gluten-core/src/main/java/org/apache/gluten/utils/ResourceUtil.java
new file mode 100644
index 0000000000..692a91af26
--- /dev/null
+++ b/gluten-core/src/main/java/org/apache/gluten/utils/ResourceUtil.java
@@ -0,0 +1,113 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.gluten.utils;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Enumeration;
+import java.util.List;
+import java.util.regex.Pattern;
+import java.util.zip.ZipEntry;
+import java.util.zip.ZipException;
+import java.util.zip.ZipFile;
+
+/**
+ * Code is copied from <a
+ * href="https://stackoverflow.com/questions/3923129/get-a-list-of-resources-from-classpath-directory">here</a>
+ * and then modified for Gluten's use.
+ */
+public class ResourceUtil {
+
+ private static final Logger LOG = LoggerFactory.getLogger(ResourceUtil.class);
+
+ /**
+ * Get a collection of resource paths by the input RegEx pattern.
+ *
+ * @param pattern The pattern to match.
+ * @return The relative resource paths in the order they are found.
+ */
+ public static List<String> getResources(final Pattern pattern) {
+ final List<String> buffer = new ArrayList<>();
+ final String classPath = System.getProperty("java.class.path");
+ final String[] classPathElements = classPath.split(File.pathSeparator);
+ for (final String element : classPathElements) {
+ getResources(element, pattern, buffer);
+ }
+ return Collections.unmodifiableList(buffer);
+ }
+
+ private static void getResources(
+ final String element, final Pattern pattern, final List<String> buffer) {
+ final File file = new File(element);
+ if (!file.exists()) {
+ LOG.info("Skip non-existing classpath: {}", element);
+ return;
+ }
+ if (file.isDirectory()) {
+ getResourcesFromDirectory(file, file, pattern, buffer);
+ } else {
+ getResourcesFromJarFile(file, pattern, buffer);
+ }
+ }
+
+ private static void getResourcesFromJarFile(
+ final File file, final Pattern pattern, final List<String> buffer) {
+ ZipFile zf;
+ try {
+ zf = new ZipFile(file);
+ } catch (final ZipException e) {
+ throw new RuntimeException(e);
+ } catch (final IOException e) {
+ throw new RuntimeException(e);
+ }
+ final Enumeration e = zf.entries();
+ while (e.hasMoreElements()) {
+ final ZipEntry ze = (ZipEntry) e.nextElement();
+ final String fileName = ze.getName();
+ final boolean accept = pattern.matcher(fileName).matches();
+ if (accept) {
+ buffer.add(fileName);
+ }
+ }
+ try {
+ zf.close();
+ } catch (final IOException e1) {
+ throw new RuntimeException(e1);
+ }
+ }
+
+ private static void getResourcesFromDirectory(
+ final File root, final File directory, final Pattern pattern, final List<String> buffer) {
+ final File[] fileList = directory.listFiles();
+ for (final File file : fileList) {
+ if (file.isDirectory()) {
+ getResourcesFromDirectory(root, file, pattern, buffer);
+ } else {
+ final String relative = root.toURI().relativize(file.toURI()).getPath();
+ final boolean accept = pattern.matcher(relative).matches();
+ if (accept) {
+ buffer.add(relative);
+ }
+ }
+ }
+ }
+}
diff --git a/gluten-core/src/main/scala/org/apache/gluten/component/Discovery.scala b/gluten-core/src/main/scala/org/apache/gluten/component/Discovery.scala
new file mode 100644
index 0000000000..2b8f060a69
--- /dev/null
+++ b/gluten-core/src/main/scala/org/apache/gluten/component/Discovery.scala
@@ -0,0 +1,86 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.gluten.component
+
+import org.apache.gluten.exception.GlutenException
+import org.apache.gluten.utils.ResourceUtil
+
+import org.apache.spark.internal.Logging
+import org.apache.spark.util.SparkReflectionUtil
+
+import scala.collection.JavaConverters._
+import scala.collection.mutable
+import scala.util.matching.Regex
+
+
+
+
+// format: off
+/**
+ * Gluten's global discovery to find all [[Component]] definitions in the classpath.
+ *
+ * We don't use [[java.util.ServiceLoader]] since it requires all the service files to have
+ * the same file name which is the class name of [[Component]], this causes the service files
+ * easily be overwritten by each other during Maven build. Typically, See code of
+ * `DefaultMavenFileFilter` used by Maven's `maven-resources-plugin`.
+ *
+ * Instead, Gluten defines its own way to register components. For example, placing the following
+ * component files to resource folder:
+ *
+ * META-INF
+ * \- gluten-components
+ * |- org.apache.gluten.component.AComponent
+ * \- org.apache.gluten.backend.BBackend
+ *
+ * Will cause the registration of component `AComponent` and backend `BBackend`.
+ *
+ * The content in a component file is not read so doesn't matter at the moment.
+ */
+// format: on
+private object Discovery extends Logging {
+ private val container: String = "META-INF/gluten-components"
+ private val componentFilePattern: Regex = s"^$container/(.+)$$".r
+
+ def discoverAll(): Seq[Component] = {
+ logInfo("Start discovering components in the current classpath... ")
+ val prev = System.currentTimeMillis()
+ val allFiles = ResourceUtil.getResources(componentFilePattern.pattern).asScala
+ val duration = System.currentTimeMillis() - prev
+ logInfo(s"Discovered component files: ${allFiles.mkString(", ")}. Duration: $duration ms.")
+ val deDup = mutable.Set[String]()
+ val out = allFiles.flatMap {
+ case componentFilePattern(className) =>
+ if (!deDup.add(className)) {
+ logWarning(s"Found duplicated component class $className in then classpath, ignoring.")
+ None
+ } else {
+ val clazz =
+ try {
+ SparkReflectionUtil.classForName(className)
+ } catch {
+ case e: ClassNotFoundException =>
+ throw new GlutenException(s"Component class not found: $className", e)
+ }
+ val instance = clazz.getDeclaredConstructor().newInstance().asInstanceOf[Component]
+ Some(instance)
+ }
+ case _ => None
+ }.toSeq
+ out
+ }
+}
diff --git a/gluten-core/src/main/scala/org/apache/gluten/component/package.scala b/gluten-core/src/main/scala/org/apache/gluten/component/package.scala
index f74b967294..032a32d041 100644
--- a/gluten-core/src/main/scala/org/apache/gluten/component/package.scala
+++ b/gluten-core/src/main/scala/org/apache/gluten/component/package.scala
@@ -16,15 +16,10 @@
*/
package org.apache.gluten
-import org.apache.gluten.backend.Backend
-
import org.apache.spark.internal.Logging
-import java.util.ServiceLoader
import java.util.concurrent.atomic.AtomicBoolean
-import scala.collection.JavaConverters._
-
package object component extends Logging {
private val allComponentsLoaded: AtomicBoolean = new AtomicBoolean(false)
@@ -34,9 +29,7 @@ package object component extends Logging {
}
// Load all components in classpath.
- val discoveredBackends = ServiceLoader.load(classOf[Backend]).asScala
- val discoveredComponents = ServiceLoader.load(classOf[Component]).asScala
- val all = discoveredBackends ++ discoveredComponents
+ val all = Discovery.discoverAll()
// Register all components.
all.foreach(_.ensureRegistered())
diff --git a/gluten-substrait/src/main/scala/org/apache/gluten/backendsapi/BackendsApiManager.scala b/gluten-substrait/src/main/scala/org/apache/gluten/backendsapi/BackendsApiManager.scala
index 3b4e97afb3..4b6f674905 100644
--- a/gluten-substrait/src/main/scala/org/apache/gluten/backendsapi/BackendsApiManager.scala
+++ b/gluten-substrait/src/main/scala/org/apache/gluten/backendsapi/BackendsApiManager.scala
@@ -21,17 +21,17 @@ import org.apache.gluten.component.Component
object BackendsApiManager {
private lazy val backend: SubstraitBackend = initializeInternal()
- /** Initialize all backends api. */
+ /** Initialize all backends apis. */
private def initializeInternal(): SubstraitBackend = {
val loadedSubstraitBackends = Component.sorted().filter(_.isInstanceOf[SubstraitBackend])
- assert(loadedSubstraitBackends.size == 1, "More than one Substrait backends are loaded")
+ assert(
+ loadedSubstraitBackends.size == 1,
+ s"Zero or more than one Substrait backends are loaded: " +
+ s"${loadedSubstraitBackends.map(_.name()).mkString(", ")}")
loadedSubstraitBackends.head.asInstanceOf[SubstraitBackend]
}
- /**
- * Automatically detect the backend api.
- * @return
- */
+ /** Automatically detect the backend api. */
def initialize(): String = {
getBackendName
}
diff --git a/tools/gluten-it/common/src/main/java/org/apache/gluten/integration/BaseMixin.java b/tools/gluten-it/common/src/main/java/org/apache/gluten/integration/BaseMixin.java
index 08c55d78a6..b369fffd74 100644
--- a/tools/gluten-it/common/src/main/java/org/apache/gluten/integration/BaseMixin.java
+++ b/tools/gluten-it/common/src/main/java/org/apache/gluten/integration/BaseMixin.java
@@ -22,6 +22,7 @@ import org.apache.gluten.integration.command.SparkRunModes;
import org.apache.gluten.integration.ds.TpcdsSuite;
import org.apache.gluten.integration.h.TpchSuite;
import org.apache.log4j.Level;
+import org.apache.log4j.LogManager;
import org.apache.spark.SparkConf;
import picocli.CommandLine;
import scala.Predef;
@@ -120,6 +121,8 @@ public class BaseMixin {
throw new IllegalArgumentException("Log level not found: " + logLevel);
}
+ LogManager.getRootLogger().setLevel(level);
+
scala.collection.immutable.Map<String, String> extraSparkConfScala =
JavaConverters.mapAsScalaMapConverter(
mergeMapSafe(extraSparkConf, runModeEnumeration.extraSparkConf())).asScala().toMap(
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]