yuqi1129 opened a new issue, #8361:
URL: https://github.com/apache/gravitino/issues/8361
### Version

main branch

### Describe what's wrong

- JDK 17
- `./gradlew build -x test`

```python
import logging
import os

logging.basicConfig(level=logging.INFO)

from gravitino import NameIdentifier, GravitinoClient, Catalog, Fileset, GravitinoAdminClient
from pyspark.sql import SparkSession

gravitino_url = "http://localhost:8090"
metalake_name = "test"

catalog_name = "s3_credential_catalog"
schema_name = "schema"
fileset_name = "example_with_dir"

fileset_ident = NameIdentifier.of(schema_name, fileset_name)
gravitino_admin_client = GravitinoAdminClient(uri=gravitino_url)
gravitino_client = GravitinoClient(uri=gravitino_url, metalake_name=metalake_name)

# Must be set before the first SparkSession is created in this process,
# otherwise the JVM starts without these jars.
os.environ["PYSPARK_SUBMIT_ARGS"] = (
    "--jars /Users/yuqi/project/graviton/bundles/aws/build/libs/gravitino-aws-1.0.0-SNAPSHOT.jar,"
    "/Users/yuqi/project/graviton/clients/filesystem-hadoop3-runtime/build/libs/gravitino-filesystem-hadoop3-runtime-1.0.0-SNAPSHOT.jar,"
    "/Users/yuqi/Downloads/hadoop-jars/hadoop-aws-3.2.0.jar,"
    "/Users/yuqi/.m2/repository/com/amazonaws/aws-java-sdk-bundle/1.11.375/aws-java-sdk-bundle-1.11.375.jar "
    '--conf "spark.driver.extraJavaOptions=--add-opens=java.base/sun.nio.ch=ALL-UNNAMED '
    '-agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=5005" '
    '--conf "spark.executor.extraJavaOptions=--add-opens=java.base/sun.nio.ch=ALL-UNNAMED '
    '-agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=5006" '
    "--master local[1] pyspark-shell"
)

spark = SparkSession.builder \
    .appName("s3_fileset_test") \
    .config("spark.hadoop.fs.AbstractFileSystem.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.Gvfs") \
    .config("spark.hadoop.fs.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem") \
    .config("spark.hadoop.fs.gravitino.server.uri", "http://localhost:8090") \
    .config("spark.hadoop.fs.gravitino.client.metalake", "test") \
    .config("spark.hadoop.s3-access-key-id", os.environ["S3_ACCESS_KEY_ID"]) \
    .config("spark.hadoop.s3-secret-access-key", os.environ["S3_SECRET_ACCESS_KEY"]) \
    .config("spark.hadoop.s3-endpoint", "http://s3.ap-northeast-1.amazonaws.com") \
    .config("spark.driver.memory", "2g") \
    .config("spark.driver.port", "2048") \
    .getOrCreate()

spark.sparkContext.setLogLevel("DEBUG")

data = [("Alice", 25), ("Bob", 30), ("Cathy", 45)]
columns = ["Name", "Age"]
spark_df = spark.createDataFrame(data, schema=columns)

gvfs_path = f"gvfs://fileset/{catalog_name}/{schema_name}/{fileset_name}/people"

spark_df.coalesce(1).write \
    .mode("overwrite") \
    .option("header", "true") \
    .csv(gvfs_path)
```
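A note for anyone re-running the script above: it assumes `S3_ACCESS_KEY_ID` and `S3_SECRET_ACCESS_KEY` are exported and that every jar listed in `PYSPARK_SUBMIT_ARGS` exists at the given path (the paths are machine-specific and copied from this report; they will differ on your machine). A small pre-flight check along these lines can rule out a silently missing jar before Spark ever starts:

```python
import os

# Pre-flight sketch: verify the credentials and jar paths the repro relies on.
# The jar paths are the machine-specific ones from the report above; adjust as needed.
required_env = ["S3_ACCESS_KEY_ID", "S3_SECRET_ACCESS_KEY"]
jars = [
    "/Users/yuqi/project/graviton/bundles/aws/build/libs/gravitino-aws-1.0.0-SNAPSHOT.jar",
    "/Users/yuqi/project/graviton/clients/filesystem-hadoop3-runtime/build/libs/gravitino-filesystem-hadoop3-runtime-1.0.0-SNAPSHOT.jar",
    "/Users/yuqi/Downloads/hadoop-jars/hadoop-aws-3.2.0.jar",
    "/Users/yuqi/.m2/repository/com/amazonaws/aws-java-sdk-bundle/1.11.375/aws-java-sdk-bundle-1.11.375.jar",
]

missing_env = [v for v in required_env if v not in os.environ]
missing_jars = [j for j in jars if not os.path.isfile(j)]
if missing_env:
    raise RuntimeError(f"missing environment variables: {missing_env}")
if missing_jars:
    raise RuntimeError(f"jars not found on disk: {missing_jars}")
print("all credentials and jars present")
```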
### Error message and/or stacktrace

```
py4j.protocol.Py4JJavaError: An error occurred while calling o83.csv.
: java.lang.NoClassDefFoundError: org/apache/hadoop/fs/s3a/S3AFileSystem
    at org.apache.gravitino.s3.fs.S3FileSystemProvider.getFileSystem(S3FileSystemProvider.java:73)
    at org.apache.gravitino.filesystem.hadoop.BaseGVFSOperations.getActualFileSystemByPath(BaseGVFSOperations.java:667)
    at org.apache.gravitino.filesystem.hadoop.BaseGVFSOperations.lambda$getActualFileSystemByLocationName$5(BaseGVFSOperations.java:612)
    at org.apache.gravitino.shaded.com.github.benmanes.caffeine.cache.BoundedLocalCache.lambda$doComputeIfAbsent$14(BoundedLocalCache.java:2406)
    at java.base/java.util.concurrent.ConcurrentHashMap.compute(ConcurrentHashMap.java:1916)
    at org.apache.gravitino.shaded.com.github.benmanes.caffeine.cache.BoundedLocalCache.doComputeIfAbsent(BoundedLocalCache.java:2404)
    at org.apache.gravitino.shaded.com.github.benmanes.caffeine.cache.BoundedLocalCache.computeIfAbsent(BoundedLocalCache.java:2387)
    at org.apache.gravitino.shaded.com.github.benmanes.caffeine.cache.LocalCache.computeIfAbsent(LocalCache.java:108)
    at org.apache.gravitino.shaded.com.github.benmanes.caffeine.cache.LocalManualCache.get(LocalManualCache.java:62)
    at org.apache.gravitino.filesystem.hadoop.BaseGVFSOperations.getActualFileSystemByLocationName(BaseGVFSOperations.java:590)
    at org.apache.gravitino.filesystem.hadoop.BaseGVFSOperations.getActualFileSystem(BaseGVFSOperations.java:528)
    at org.apache.gravitino.filesystem.hadoop.DefaultGVFSOperations.getFileStatus(DefaultGVFSOperations.java:148)
    at org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem.lambda$getFileStatus$5(GravitinoVirtualFileSystem.java:231)
    at org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem.runWithExceptionTranslation(GravitinoVirtualFileSystem.java:316)
    at org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem.getFileStatus(GravitinoVirtualFileSystem.java:230)
    at org.apache.hadoop.fs.FileSystem.exists(FileSystem.java:1683)
    at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:119)
    at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult$lzycompute(commands.scala:108)
    at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult(commands.scala:106)
    at org.apache.spark.sql.execution.command.DataWritingCommandExec.doExecute(commands.scala:131)
    at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:180)
    at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:218)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
    at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:215)
    at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:176)
    at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:132)
    at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:131)
    at org.apache.spark.sql.DataFrameWriter.$anonfun$runCommand$1(DataFrameWriter.scala:989)
    at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:103)
    at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:163)
    at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:90)
    at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
    at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
    at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:989)
    at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:438)
    at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:415)
    at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:293)
    at org.apache.spark.sql.DataFrameWriter.csv(DataFrameWriter.scala:979)
    at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
    at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.base/java.lang.reflect.Method.invoke(Method.java:568)
    at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
    at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
    at py4j.Gateway.invoke(Gateway.java:282)
    at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
    at py4j.commands.CallCommand.execute(CallCommand.java:79)
    at py4j.GatewayConnection.run(GatewayConnection.java:238)
    at java.base/java.lang.Thread.run(Thread.java:833)
Caused by: java.lang.ClassNotFoundException: org.apache.hadoop.fs.s3a.S3AFileSystem
    at java.base/jdk.internal.loader.BuiltinClassLoader.loadClass(BuiltinClassLoader.java:641)
    at java.base/jdk.internal.loader.ClassLoaders$AppClassLoader.loadClass(ClassLoaders.java:188)
    at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:525)
    ... 49 more
```

### How to reproduce

Please see the script above.

### Additional context
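The root-cause line (`ClassNotFoundException: org.apache.hadoop.fs.s3a.S3AFileSystem`) means the driver JVM could not load the S3A filesystem class even though `hadoop-aws-3.2.0.jar` is listed in `--jars`. A quick way to probe this from the same PySpark session, before attempting the write, is sketched below. It uses the internal `spark._jvm` Py4J view (an unsupported API), and `Class.forName` resolves via the gateway's classloader, which is not necessarily the one `S3FileSystemProvider.getFileSystem` uses, so treat the result as a hint rather than proof:

```python
# Diagnostic sketch: ask the driver JVM whether the S3A filesystem class is
# loadable at all. `spark` is the SparkSession created in the repro script;
# `_jvm` is Py4J's view into the driver JVM (internal, unsupported API).
def s3a_class_visible(spark):
    try:
        spark._jvm.java.lang.Class.forName("org.apache.hadoop.fs.s3a.S3AFileSystem")
        return True
    except Exception:
        return False

print("S3AFileSystem loadable from driver JVM:", s3a_class_visible(spark))
```

If this prints `False`, the jar never made it onto the driver classpath (for example, because `PYSPARK_SUBMIT_ARGS` was set after the JVM had already started in that process). If it prints `True`, the class is visible to the gateway's classloader but apparently not to the classloader that `S3FileSystemProvider.getFileSystem` (frame 1 of the trace) loads from.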
