[GitHub] [incubator-iceberg] rdblue commented on a change in pull request #624: Update SparkTableUtil to use SessionCatalog and proper MetricsConfig

GitBox Fri, 08 Nov 2019 17:15:23 -0800

rdblue commented on a change in pull request #624: Update SparkTableUtil to use 
SessionCatalog and proper MetricsConfig
URL: https://github.com/apache/incubator-iceberg/pull/624#discussion_r344420236


 ##########
 File path: spark/src/main/scala/org/apache/iceberg/spark/SparkTableUtil.scala
 ##########
 @@ -27,86 +27,153 @@ import org.apache.hadoop.fs.{Path, PathFilter}
 import org.apache.iceberg.{DataFile, DataFiles, FileFormat, ManifestFile, ManifestWriter}
 import org.apache.iceberg.{Metrics, MetricsConfig, PartitionSpec, Table}
 import org.apache.iceberg.exceptions.NoSuchTableException
-import org.apache.iceberg.hadoop.{HadoopFileIO, HadoopInputFile, HadoopTables, SerializableConfiguration}
+import org.apache.iceberg.hadoop.{HadoopFileIO, HadoopInputFile, SerializableConfiguration}
 import org.apache.iceberg.orc.OrcMetrics
 import org.apache.iceberg.parquet.ParquetUtil
-import org.apache.iceberg.spark.hacks.Hive
 import org.apache.parquet.hadoop.ParquetFileReader
 import org.apache.spark.TaskContext
-import org.apache.spark.sql.{DataFrame, SparkSession}
+import org.apache.spark.sql.SparkSession
 import org.apache.spark.sql.catalyst.TableIdentifier
-import org.apache.spark.sql.catalyst.catalog.CatalogTablePartition
+import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute
+import org.apache.spark.sql.catalyst.catalog.{CatalogTable, CatalogTablePartition}
+import org.apache.spark.sql.catalyst.expressions.Expression
 import scala.collection.JavaConverters._
+import scala.util.Try
 
 object SparkTableUtil {
   /**
-   * Returns a DataFrame with a row for each partition in the table.
-   *
-   * The DataFrame has 3 columns, partition key (a=1/b=2), partition location, and format
-   * (avro or parquet).
+   * Returns all partitions in the table.
    *
    * @param spark a Spark session
    * @param table a table name and (optional) database
-   * @return a DataFrame of the table's partitions
+   * @return all table's partitions
    */
-  def partitionDF(spark: SparkSession, table: String): DataFrame = {
-    import spark.implicits._
+  def getPartitions(spark: SparkSession, table: String): Seq[SparkPartition] = {
+    val tableIdentifier = spark.sessionState.sqlParser.parseTableIdentifier(table)
+    getPartitions(spark, tableIdentifier)
+  }
 
-    val partitions: Seq[(Map[String, String], Option[String], Option[String])] =
-      Hive.partitions(spark, table).map { p: CatalogTablePartition =>
-        (p.spec, p.storage.locationUri.map(String.valueOf(_)), p.storage.serde)
-      }
+  /**
+   * Returns all partitions in the table.
+   *
+   * @param spark a Spark session
+   * @param tableIdent a table identifier
+   * @return all table's partitions
+   */
+  def getPartitions(spark: SparkSession, tableIdent: TableIdentifier): Seq[SparkPartition] = {
 
 Review comment:
   Never mind, we use it elsewhere.

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


With regards,
Apache Git Services

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[GitHub] [incubator-iceberg] rdblue commented on a change in pull request #624: Update SparkTableUtil to use SessionCatalog and proper MetricsConfig

Reply via email to