Re: [PR] [GLUTEN-5414][VL] FEAT: Support read CSV [incubator-gluten]

via GitHub Wed, 24 Apr 2024 21:32:14 -0700


zhztheplayer commented on code in PR #5447:
URL: https://github.com/apache/incubator-gluten/pull/5447#discussion_r1578831193



##########
gluten-data/src/main/scala/org/apache/gluten/utils/ArrowUtil.scala:
##########
@@ -86,4 +101,223 @@ object ArrowUtil extends Logging {
     }
     new Schema(fields)
   }
+
+  def getFormat(format: String): FileFormat = {
+    format match {
+      case "parquet" => FileFormat.PARQUET
+      case "orc" => FileFormat.ORC
+      case "csv" => FileFormat.CSV
+      case _ => throw new IllegalArgumentException("Unrecognizable format")
+    }
+  }
+
+  def getFormat(format: 
org.apache.spark.sql.execution.datasources.FileFormat): FileFormat = {
+    format match {
+      case _: ParquetFileFormat =>
+        FileFormat.PARQUET
+      case _: CSVFileFormat =>
+        FileFormat.CSV
+      case _ =>
+        throw new IllegalArgumentException("Unrecognizable format")
+    }
+  }
+
+  private def rewriteUri(encodeUri: String): String = {
+    val decodedUri = encodeUri
+    val uri = URI.create(decodedUri)
+    if (uri.getScheme == "s3" || uri.getScheme == "s3a") {
+      val s3Rewritten =
+        new URI("s3", uri.getAuthority, uri.getPath, uri.getQuery, 
uri.getFragment).toString
+      return s3Rewritten
+    }
+    val sch = uri.getScheme match {
+      case "hdfs" => "hdfs"
+      case "file" => "file"
+    }
+    val ssp = uri.getScheme match {
+      case "hdfs" => uri.getSchemeSpecificPart
+      case "file" => "//" + uri.getSchemeSpecificPart
+    }
+    val rewritten = new URI(sch, ssp, uri.getFragment)
+    rewritten.toString
+  }
+
+  def makeArrowDiscovery(encodedUri: String, format: FileFormat): 
FileSystemDatasetFactory = {
+    val allocator = ArrowBufferAllocators.contextInstance()
+    val factory = new FileSystemDatasetFactory(
+      allocator,
+      NativeMemoryPool.getDefault, // TODO: wait to change

Review Comment:
   We can try `NativeMemoryPool#createListenable`. However, Gluten already 
has a listener named `ReservationListener`, so we should take some care with 
the naming.



##########
gluten-data/src/main/scala/org/apache/gluten/utils/ArrowUtil.scala:
##########
@@ -86,4 +101,223 @@ object ArrowUtil extends Logging {
     }
     new Schema(fields)
   }
+
+  def getFormat(format: String): FileFormat = {
+    format match {
+      case "parquet" => FileFormat.PARQUET
+      case "orc" => FileFormat.ORC
+      case "csv" => FileFormat.CSV
+      case _ => throw new IllegalArgumentException("Unrecognizable format")
+    }
+  }
+
+  def getFormat(format: 
org.apache.spark.sql.execution.datasources.FileFormat): FileFormat = {
+    format match {
+      case _: ParquetFileFormat =>
+        FileFormat.PARQUET
+      case _: CSVFileFormat =>
+        FileFormat.CSV
+      case _ =>
+        throw new IllegalArgumentException("Unrecognizable format")
+    }
+  }
+
+  private def rewriteUri(encodeUri: String): String = {
+    val decodedUri = encodeUri
+    val uri = URI.create(decodedUri)
+    if (uri.getScheme == "s3" || uri.getScheme == "s3a") {
+      val s3Rewritten =
+        new URI("s3", uri.getAuthority, uri.getPath, uri.getQuery, 
uri.getFragment).toString
+      return s3Rewritten
+    }
+    val sch = uri.getScheme match {
+      case "hdfs" => "hdfs"
+      case "file" => "file"
+    }
+    val ssp = uri.getScheme match {
+      case "hdfs" => uri.getSchemeSpecificPart
+      case "file" => "//" + uri.getSchemeSpecificPart
+    }
+    val rewritten = new URI(sch, ssp, uri.getFragment)
+    rewritten.toString
+  }
+
+  def makeArrowDiscovery(encodedUri: String, format: FileFormat): 
FileSystemDatasetFactory = {
+    val allocator = ArrowBufferAllocators.contextInstance()
+    val factory = new FileSystemDatasetFactory(
+      allocator,
+      NativeMemoryPool.getDefault, // TODO: wait to change

Review Comment:
   We can try `NativeMemoryPool#createListenable`. However, Gluten already 
has a listener named `ReservationListener`, so we should take some care with 
the naming.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] [GLUTEN-5414][VL] FEAT: Support read CSV [incubator-gluten]

Reply via email to