rdblue commented on a change in pull request #1287:
URL: https://github.com/apache/iceberg/pull/1287#discussion_r464569489
##########
File path:
spark/src/test/java/org/apache/iceberg/spark/source/TestIdentityPartitionData.java
##########
@@ -110,24 +113,52 @@ public static void stopSpark() {
private Table table = null;
private Dataset<Row> logs = null;
- @Before
- public void setupTable() throws Exception {
+ /*
+ Use the Hive-based table to create identity partition columns with no
duplication of the data in the underlying
+ parquet files. This ensures that if the identity mapping fails, the test
will also fail.
+ */
+ private void setupParquet() throws Exception {
File location = temp.newFolder("logs");
+ File hiveLocation = temp.newFolder("hive");
+ String hiveTable = "hivetable";
Assert.assertTrue("Temp folder should exist", location.exists());
Map<String, String> properties =
ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format);
- this.table = TABLES.create(LOG_SCHEMA, spec, properties,
location.toString());
this.logs = spark.createDataFrame(LOGS, LogMessage.class).select("id",
"date", "level", "message");
+ spark.sql(String.format("DROP TABLE IF EXISTS %s", hiveTable));
+ logs.orderBy("date", "level", "id").write().partitionBy("date",
"level").format("parquet")
+ .option("path", hiveLocation.toString()).saveAsTable(hiveTable);
+
+ this.table = TABLES.create(SparkSchemaUtil.schemaForTable(spark,
hiveTable),
+ SparkSchemaUtil.specForTable(spark, hiveTable), properties,
location.toString());
+
+ SparkTableUtil.importSparkTable(spark, new TableIdentifier(hiveTable),
table, location.toString());
+ }
- logs.orderBy("date", "level",
"id").write().format("iceberg").mode("append").save(location.toString());
+ @Before
+ public void setupTable() throws Exception {
+ if (format.equals("parquet")) {
+ setupParquet();
+ } else {
+ File location = temp.newFolder("logs");
+ Assert.assertTrue("Temp folder should exist", location.exists());
+
+ Map<String, String> properties =
ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format);
+ this.table = TABLES.create(LOG_SCHEMA, spec, properties,
location.toString());
+ this.logs = spark.createDataFrame(LOGS, LogMessage.class).select("id",
"date", "level", "message");
+
+ logs.orderBy("date", "level",
"id").write().format("iceberg").mode("append").save(location.toString());
+ }
}
@Test
public void testFullProjection() {
List<Row> expected = logs.orderBy("id").collectAsList();
List<Row> actual = spark.read().format("iceberg")
.option("vectorization-enabled", String.valueOf(vectorized))
- .load(table.location()).orderBy("id").collectAsList();
+ .load(table.location()).orderBy("id")
+ .select("id", "date", "level", "message")
Review comment:
Isn't this the default? Why was it necessary to add `select`?
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]