zhztheplayer commented on code in PR #5447:
URL: https://github.com/apache/incubator-gluten/pull/5447#discussion_r1578851690
##########
gluten-data/src/main/scala/org/apache/gluten/utils/ArrowUtil.scala:
##########
@@ -86,4 +101,223 @@ object ArrowUtil extends Logging {
}
new Schema(fields)
}
+
+ def getFormat(format: String): FileFormat = {
+ format match {
+ case "parquet" => FileFormat.PARQUET
+ case "orc" => FileFormat.ORC
+ case "csv" => FileFormat.CSV
+ case _ => throw new IllegalArgumentException("Unrecognizable format")
+ }
+ }
+
+ def getFormat(format:
org.apache.spark.sql.execution.datasources.FileFormat): FileFormat = {
+ format match {
+ case _: ParquetFileFormat =>
+ FileFormat.PARQUET
+ case _: CSVFileFormat =>
+ FileFormat.CSV
+ case _ =>
+ throw new IllegalArgumentException("Unrecognizable format")
+ }
+ }
+
+ private def rewriteUri(encodeUri: String): String = {
+ val decodedUri = encodeUri
+ val uri = URI.create(decodedUri)
+ if (uri.getScheme == "s3" || uri.getScheme == "s3a") {
+ val s3Rewritten =
+ new URI("s3", uri.getAuthority, uri.getPath, uri.getQuery,
uri.getFragment).toString
+ return s3Rewritten
+ }
+ val sch = uri.getScheme match {
+ case "hdfs" => "hdfs"
+ case "file" => "file"
+ }
+ val ssp = uri.getScheme match {
+ case "hdfs" => uri.getSchemeSpecificPart
+ case "file" => "//" + uri.getSchemeSpecificPart
+ }
+ val rewritten = new URI(sch, ssp, uri.getFragment)
+ rewritten.toString
+ }
+
+ def makeArrowDiscovery(encodedUri: String, format: FileFormat):
FileSystemDatasetFactory = {
+ val allocator = ArrowBufferAllocators.contextInstance()
+ val factory = new FileSystemDatasetFactory(
+ allocator,
+ NativeMemoryPool.getDefault, // TODO: wait to change
Review Comment:
> We can try `NativeMemoryPool#createListenable`. However, Gluten already
has a listener named `ReservationListener`, so we will need to take some care
with naming.
This can be done in a separate PR, as long as we leave a TODO note in the code.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]