yuqi1129 opened a new issue, #8041:
URL: https://github.com/apache/gravitino/issues/8041
### Version
main branch
### Describe what's wrong
Writing a DataFrame through GVFS to an S3 fileset fails while creating the underlying S3A `FileSystem`: the shaded credentials provider class bundled in `gravitino-aws` does not implement the unshaded `AWSCredentialsProvider` interface that `hadoop-aws` checks for.
```
25/08/12 16:23:19 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
25/08/12 16:23:24 WARN FileSystem: Failed to initialize fileystem s3a://xiaoyu-bucket/test/example_with_dir: java.io.IOException: Class class org.apache.gravitino.aws.shaded.org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider does not implement AWSCredentialsProvider
```
```
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "/Users/yuqi/venv/lib/python3.9/site-packages/pyspark/sql/readwriter.py", line 1864, in csv
    self._jwrite.csv(path)
  File "/Users/yuqi/venv/lib/python3.9/site-packages/py4j/java_gateway.py", line 1322, in __call__
    return_value = get_return_value(
  File "/Users/yuqi/venv/lib/python3.9/site-packages/pyspark/errors/exceptions/captured.py", line 179, in deco
    return f(*a, **kw)
  File "/Users/yuqi/venv/lib/python3.9/site-packages/py4j/protocol.py", line 326, in get_return_value
    raise Py4JJavaError(
py4j.protocol.Py4JJavaError: An error occurred while calling o66.csv.
: org.apache.gravitino.exceptions.GravitinoRuntimeException: Exception occurs when create new FileSystem for fileset: test.s3_credential_catalog.schema.example_with_dir, location: null, msg: Class class org.apache.gravitino.aws.shaded.org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider does not implement AWSCredentialsProvider
    at org.apache.gravitino.filesystem.hadoop.BaseGVFSOperations.lambda$getActualFileSystemByLocationName$1(BaseGVFSOperations.java:598)
    at org.apache.gravitino.shaded.com.github.benmanes.caffeine.cache.BoundedLocalCache.lambda$doComputeIfAbsent$14(BoundedLocalCache.java:2406)
    at java.base/java.util.concurrent.ConcurrentHashMap.compute(ConcurrentHashMap.java:1916)
    at org.apache.gravitino.shaded.com.github.benmanes.caffeine.cache.BoundedLocalCache.doComputeIfAbsent(BoundedLocalCache.java:2404)
    at org.apache.gravitino.shaded.com.github.benmanes.caffeine.cache.BoundedLocalCache.computeIfAbsent(BoundedLocalCache.java:2387)
    at org.apache.gravitino.shaded.com.github.benmanes.caffeine.cache.LocalCache.computeIfAbsent(LocalCache.java:108)
    at org.apache.gravitino.shaded.com.github.benmanes.caffeine.cache.LocalManualCache.get(LocalManualCache.java:62)
    at org.apache.gravitino.filesystem.hadoop.BaseGVFSOperations.getActualFileSystemByLocationName(BaseGVFSOperations.java:567)
    at org.apache.gravitino.filesystem.hadoop.BaseGVFSOperations.getActualFileSystem(BaseGVFSOperations.java:526)
    at org.apache.gravitino.filesystem.hadoop.DefaultGVFSOperations.getFileStatus(DefaultGVFSOperations.java:148)
    at org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem.lambda$getFileStatus$5(GravitinoVirtualFileSystem.java:231)
    at org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem.runWithExceptionTranslation(GravitinoVirtualFileSystem.java:316)
    at org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem.getFileStatus(GravitinoVirtualFileSystem.java:230)
    at org.apache.hadoop.fs.FileSystem.exists(FileSystem.java:1760)
    at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:120)
    at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult$lzycompute(commands.scala:113)
    at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult(commands.scala:111)
    at org.apache.spark.sql.execution.command.DataWritingCommandExec.executeCollect(commands.scala:125)
    at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:107)
    at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:125)
    at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:201)
    at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:108)
    at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
    at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:66)
    at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:107)
    at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:98)
    at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:461)
    at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(origin.scala:76)
    at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:461)
    at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:32)
    at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)
    at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)
    at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:32)
    at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:32)
    at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:437)
    at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:98)
    at org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:85)
    at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:83)
    at org.apache.spark.sql.execution.QueryExecution.assertCommandExecuted(QueryExecution.scala:142)
    at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:859)
    at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:388)
    at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:361)
    at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:240)
    at org.apache.spark.sql.DataFrameWriter.csv(DataFrameWriter.scala:850)
    at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
    at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.base/java.lang.reflect.Method.invoke(Method.java:568)
    at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
    at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
    at py4j.Gateway.invoke(Gateway.java:282)
    at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
    at py4j.commands.CallCommand.execute(CallCommand.java:79)
    at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
    at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
    at java.base/java.lang.Thread.run(Thread.java:833)
Caused by: java.io.IOException: Class class org.apache.gravitino.aws.shaded.org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider does not implement AWSCredentialsProvider
    at org.apache.hadoop.fs.s3a.S3AUtils.createAWSCredentialProvider(S3AUtils.java:662)
    at org.apache.hadoop.fs.s3a.S3AUtils.createAWSCredentialProviderSet(S3AUtils.java:605)
    at org.apache.hadoop.fs.s3a.S3AFileSystem.initialize(S3AFileSystem.java:257)
    at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:3469)
    at org.apache.hadoop.fs.FileSystem.access$300(FileSystem.java:174)
    at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:3574)
    at org.apache.hadoop.fs.FileSystem$Cache.getUnique(FileSystem.java:3527)
    at org.apache.hadoop.fs.FileSystem.newInstance(FileSystem.java:593)
    at org.apache.gravitino.s3.fs.S3FileSystemProvider.getFileSystem(S3FileSystemProvider.java:76)
    at org.apache.gravitino.filesystem.hadoop.BaseGVFSOperations.getActualFileSystemByPath(BaseGVFSOperations.java:644)
    at org.apache.gravitino.filesystem.hadoop.BaseGVFSOperations.lambda$getActualFileSystemByLocationName$1(BaseGVFSOperations.java:589)
    ... 55 more
```
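From the class names in the trace, the relocated provider in the `gravitino-aws` bundle presumably implements the relocated copy of `AWSCredentialsProvider`, so the `isAssignableFrom` check in the unshaded `S3AUtils.createAWSCredentialProvider` from `hadoop-aws-3.2.0` rejects it. A minimal workaround sketch (untested; it assumes GVFS forwards `fs.s3a.*` options to the inner S3A filesystem, and it uses static keys, bypassing any credential vending from the bundle):
```python
import os
from pyspark.sql import SparkSession

# Untested workaround sketch: pin S3A to the UNSHADED provider class shipped in
# hadoop-aws-3.2.0 so the AWSCredentialsProvider interface check passes.
# Static keys only; this bypasses credential vending from the gravitino-aws bundle.
spark = SparkSession.builder \
    .appName("s3_fileset_test") \
    .config("spark.hadoop.fs.s3a.aws.credentials.provider",
            "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider") \
    .config("spark.hadoop.fs.s3a.access.key", os.environ["S3_ACCESS_KEY_ID"]) \
    .config("spark.hadoop.fs.s3a.secret.key", os.environ["S3_SECRET_ACCESS_KEY"]) \
    .config("spark.hadoop.fs.s3a.endpoint", "http://s3.ap-northeast-1.amazonaws.com") \
    .getOrCreate()
```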
### Error message and/or stacktrace
Please see above.
### How to reproduce
```python
import logging
import os

logging.basicConfig(level=logging.INFO)

from gravitino import NameIdentifier, GravitinoClient, Catalog, Fileset, GravitinoAdminClient
from pyspark.sql import SparkSession

gravitino_url = "http://localhost:8090"
metalake_name = "test"

catalog_name = "s3_credential_catalog"
schema_name = "schema"
fileset_name = "example_with_dir"

fileset_ident = NameIdentifier.of(schema_name, fileset_name)
gravitino_admin_client = GravitinoAdminClient(uri=gravitino_url)
gravitino_client = GravitinoClient(uri=gravitino_url, metalake_name=metalake_name)

# Put the Gravitino AWS bundle, the GVFS hadoop3 runtime, hadoop-aws, and the
# AWS SDK bundle on the Spark classpath before the session is created.
os.environ["PYSPARK_SUBMIT_ARGS"] = (
    "--jars /Users/yuqi/project/gravitino/bundles/aws/build/libs/gravitino-aws-1.0.0-SNAPSHOT.jar,"
    "/Users/yuqi/project/gravitino/clients/filesystem-hadoop3-runtime/build/libs/gravitino-filesystem-hadoop3-runtime-1.0.0-SNAPSHOT.jar,"
    "/Users/yuqi/Downloads/hadoop-aws-3.2.0.jar,"
    "/Users/yuqi/.m2/repository/com/amazonaws/aws-java-sdk-bundle/1.11.375/aws-java-sdk-bundle-1.11.375.jar "
    "--master local[1] pyspark-shell"
)

spark = SparkSession.builder \
    .appName("s3_fileset_test") \
    .config("spark.hadoop.fs.AbstractFileSystem.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.Gvfs") \
    .config("spark.hadoop.fs.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem") \
    .config("spark.hadoop.fs.gravitino.server.uri", "http://localhost:8090") \
    .config("spark.hadoop.fs.gravitino.client.metalake", "test") \
    .config("spark.hadoop.s3-access-key-id", os.environ["S3_ACCESS_KEY_ID"]) \
    .config("spark.hadoop.s3-secret-access-key", os.environ["S3_SECRET_ACCESS_KEY"]) \
    .config("spark.hadoop.s3-endpoint", "http://s3.ap-northeast-1.amazonaws.com") \
    .config("spark.driver.memory", "2g") \
    .config("spark.driver.port", "2048") \
    .getOrCreate()

spark.sparkContext.setLogLevel("DEBUG")

data = [("Alice", 25), ("Bob", 30), ("Cathy", 45)]
columns = ["Name", "Age"]
spark_df = spark.createDataFrame(data, schema=columns)

# Writing through the gvfs:// scheme triggers the S3A FileSystem creation that fails.
gvfs_path = f"gvfs://fileset/{catalog_name}/{schema_name}/{fileset_name}/people"
spark_df.coalesce(1).write \
    .mode("overwrite") \
    .option("header", "true") \
    .csv(gvfs_path)
```
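To rule out the jar combination itself, writing directly to `s3a://` without GVFS exercises only `hadoop-aws-3.2.0` plus the AWS SDK bundle, with no shaded Gravitino classes involved. A diagnostic sketch continuing from the script above (the `smoke` subpath is illustrative, and reaching `hadoopConfiguration()` through the `_jsc` gateway is a PySpark-internal but widely used pattern):
```python
# Diagnostic sketch: if this direct s3a:// write succeeds, the classpath itself
# is fine and the failure is specific to the provider class name that the
# shaded gravitino-aws bundle configures on the inner S3A filesystem.
hadoop_conf = spark.sparkContext._jsc.hadoopConfiguration()
hadoop_conf.set("fs.s3a.access.key", os.environ["S3_ACCESS_KEY_ID"])
hadoop_conf.set("fs.s3a.secret.key", os.environ["S3_SECRET_ACCESS_KEY"])
hadoop_conf.set("fs.s3a.endpoint", "http://s3.ap-northeast-1.amazonaws.com")

direct_path = "s3a://xiaoyu-bucket/test/example_with_dir/smoke"  # illustrative subpath
spark_df.coalesce(1).write.mode("overwrite").option("header", "true").csv(direct_path)
```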
### Additional context
_No response_