szehon-ho commented on code in PR #4456:
URL: https://github.com/apache/iceberg/pull/4456#discussion_r845436551
##########
hive-metastore/src/main/java/org/apache/iceberg/hive/HiveTableOperations.java:
##########
@@ -90,6 +92,8 @@
private static final String HIVE_LOCK_CHECK_MAX_WAIT_MS = "iceberg.hive.lock-check-max-wait-ms";
private static final String HIVE_ICEBERG_METADATA_REFRESH_MAX_RETRIES = "iceberg.hive.metadata-refresh-max-retries";
private static final String HIVE_TABLE_LEVEL_LOCK_EVICT_MS = "iceberg.hive.table-level-lock-evict-ms";
+ private static final String HIVE_TABLE_PARAMETER_SIZE_MAX = "iceberg.hive.table.parameter.size.max";
+ private static final long HIVE_TABLE_PARAMETER_SIZE_MAX_DEFAULT = 32672;
Review Comment:
Should we add a comment giving some reference to when this limit was changed in Hive?
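For example, something like this (just a sketch; the exact Hive version/JIRA that changed the limit would need to be confirmed and filled in):

```java
// Hive's metastore schema limits the size of a table parameter value; the
// 32672 default appears tied to the maximum VARCHAR length of some HMS
// backing databases. TODO: cite the Hive version/JIRA where this changed.
private static final String HIVE_TABLE_PARAMETER_SIZE_MAX = "iceberg.hive.table.parameter.size.max";
private static final long HIVE_TABLE_PARAMETER_SIZE_MAX_DEFAULT = 32672;
```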
##########
hive-metastore/src/main/java/org/apache/iceberg/hive/HiveTableOperations.java:
##########
@@ -170,6 +175,7 @@ protected HiveTableOperations(Configuration conf, ClientPool metaClients, FileIO
conf.getInt(HIVE_ICEBERG_METADATA_REFRESH_MAX_RETRIES, HIVE_ICEBERG_METADATA_REFRESH_MAX_RETRIES_DEFAULT);
long tableLevelLockCacheEvictionTimeout =
    conf.getLong(HIVE_TABLE_LEVEL_LOCK_EVICT_MS, HIVE_TABLE_LEVEL_LOCK_EVICT_MS_DEFAULT);
+ this.maxHiveTableParameterSize = conf.getLong(HIVE_TABLE_PARAMETER_SIZE_MAX, HIVE_TABLE_PARAMETER_SIZE_MAX_DEFAULT);
Review Comment:
Nit: can you move it up one line, so as not to break the flow of the tableLevelLockCacheEvictionTimeout statement?
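E.g., a sketch of the suggested ordering (assuming the eviction timeout is consumed by the statement that follows it in the full constructor):

```java
this.maxHiveTableParameterSize =
    conf.getLong(HIVE_TABLE_PARAMETER_SIZE_MAX, HIVE_TABLE_PARAMETER_SIZE_MAX_DEFAULT);
// the eviction timeout declaration now flows directly into the code that uses it
long tableLevelLockCacheEvictionTimeout =
    conf.getLong(HIVE_TABLE_LEVEL_LOCK_EVICT_MS, HIVE_TABLE_LEVEL_LOCK_EVICT_MS_DEFAULT);
```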
##########
hive-metastore/src/test/java/org/apache/iceberg/hive/TestHiveCatalog.java:
##########
@@ -468,4 +478,83 @@ public void testUUIDinTableProperties() throws Exception {
catalog.dropTable(tableIdentifier);
}
}
+
+ @Test
+ public void testSnapshotStatsTableProperties() throws Exception {
+ Schema schema = new Schema(
+ required(1, "id", Types.IntegerType.get(), "unique ID"),
+ required(2, "data", Types.StringType.get())
+ );
+ TableIdentifier tableIdentifier = TableIdentifier.of(DB_NAME, "tbl");
+ String location = temp.newFolder("tbl").toString();
+
+ try {
+ catalog.buildTable(tableIdentifier, schema)
+ .withLocation(location)
+ .create();
+
+ String tableName = tableIdentifier.name();
+ org.apache.hadoop.hive.metastore.api.Table hmsTable =
+     metastoreClient.getTable(tableIdentifier.namespace().level(0), tableName);
+
+ // check whether parameters are in expected state
+ Map<String, String> parameters = hmsTable.getParameters();
+ Assert.assertEquals("0", parameters.get(TableProperties.SNAPSHOT_COUNT));
+ Assert.assertNull(parameters.get(TableProperties.CURRENT_SNAPSHOT_SUMMARY));
+ Assert.assertNull(parameters.get(TableProperties.CURRENT_SNAPSHOT_ID));
+ Assert.assertNull(parameters.get(TableProperties.CURRENT_SNAPSHOT_TIMESTAMP));
+
+ // create a snapshot
+ Table icebergTable = catalog.loadTable(tableIdentifier);
+ String fileName = UUID.randomUUID().toString();
+ DataFile file = DataFiles.builder(icebergTable.spec())
+ .withPath(FileFormat.PARQUET.addExtension(fileName))
+ .withRecordCount(2)
+ .withFileSizeInBytes(0)
+ .build();
+ icebergTable.newFastAppend().appendFile(file).commit();
+
+ // check whether parameters are in expected state
+ hmsTable = metastoreClient.getTable(tableIdentifier.namespace().level(0), tableName);
+ parameters = hmsTable.getParameters();
+ Assert.assertEquals("1", parameters.get(TableProperties.SNAPSHOT_COUNT));
+ String summary = JsonUtil.mapper().writeValueAsString(icebergTable.currentSnapshot().summary());
+ Assert.assertEquals(summary, parameters.get(TableProperties.CURRENT_SNAPSHOT_SUMMARY));
+ long snapshotId = icebergTable.currentSnapshot().snapshotId();
+ Assert.assertEquals(String.valueOf(snapshotId), parameters.get(TableProperties.CURRENT_SNAPSHOT_ID));
+ Assert.assertEquals(String.valueOf(icebergTable.currentSnapshot().timestampMillis()),
+     parameters.get(TableProperties.CURRENT_SNAPSHOT_TIMESTAMP));
+
+ } finally {
+ catalog.dropTable(tableIdentifier);
+ }
+ }
+
+ @Test
+ public void testSetSnapshotSummary() throws Exception {
+ Configuration conf = new Configuration();
+ conf.set("iceberg.hive.table.parameter.size.max", "4000");
+ HiveTableOperations spyOps = spy(new HiveTableOperations(conf, null, null, catalog.name(), DB_NAME, "tbl"));
+ Snapshot snapshot = mock(Snapshot.class);
+ Map<String, String> summary = Maps.newHashMap();
+ when(snapshot.summary()).thenReturn(summary);
+
+ // create a snapshot summary whose json string size is less than the limit
+ for (int i = 0; i < 100; i++) {
+ summary.put(String.valueOf(i), "value");
+ }
+ Assert.assertTrue(JsonUtil.mapper().writeValueAsString(summary).length() < 4000);
+ Map<String, String> parameter = Maps.newHashMap();
+ spyOps.setSnapshotSummary(parameter, snapshot);
+ Assert.assertEquals("The snapshot summary must be in parameters", 1,
parameter.size());
+
+ // create a snapshot summary whose json string size exceeds the limit
+ for (int i = 0; i < 1000; i++) {
+ summary.put(String.valueOf(i), "value");
+ }
+ long summarySize = JsonUtil.mapper().writeValueAsString(summary).length();
+ Assert.assertTrue(summarySize > 4000 && summarySize < 32672);
Review Comment:
Nit: I think there is no need for the upper limit check, right? We are not actually saving to Hive here.
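I.e., the lower bound alone should be enough (sketch):

```java
// the summary only needs to exceed the configured 4000-character limit;
// nothing is written to Hive in this unit test, so no upper bound is needed
long summarySize = JsonUtil.mapper().writeValueAsString(summary).length();
Assert.assertTrue(summarySize > 4000);
```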