[GitHub] [hive] deniskuzZ commented on a change in pull request #2879: [WIP]HIVE-25801: Custom queue setting is not honoured by Query based compaction StatsUpdater

GitBox Tue, 11 Jan 2022 06:17:55 -0800


deniskuzZ commented on a change in pull request #2879:
URL: https://github.com/apache/hive/pull/2879#discussion_r782189125




##########
File path: 
itests/hive-unit/src/test/java/org/apache/hadoop/hive/ql/txn/compactor/TestCompactor.java
##########
@@ -335,153 +330,144 @@ public void 
schemaEvolutionAddColDynamicPartitioningUpdate() throws Exception {
   }
 
   /**
-   * After each major compaction, stats need to be updated on each column of 
the
-   * table/partition which previously had stats.
-   * 1. create a bucketed ORC backed table (Orc is currently required by ACID)
-   * 2. populate 2 partitions with data
+   * After each major compaction, stats need to be updated on the table
+   * 1. create a partitioned ORC backed table (Orc is currently required by 
ACID)
+   * 2. populate with data
    * 3. compute stats
-   * 4. insert some data into the table using StreamingAPI
-   * 5. Trigger major compaction (which should update stats)
-   * 6. check that stats have been updated
+   * 4. Trigger major compaction on one of the partitions (which should update 
stats)
+   * 5. check that stats have been updated for that partition only
    *
    * @throws Exception todo:
-   *                   2. add non-partitioned test
    *                   4. add a test with sorted table?
    */
   @Test
   public void testStatsAfterCompactionPartTbl() throws Exception {
     //as of (8/27/2014) Hive 0.14, ACID/Orc requires HiveInputFormat
+    String dbName = "default";
     String tblName = "compaction_test";
-    String tblNameStg = tblName + "_stg";
-    List<String> colNames = Arrays.asList("a", "b");
     executeStatementOnDriver("drop table if exists " + tblName, driver);
-    executeStatementOnDriver("drop table if exists " + tblNameStg, driver);
     executeStatementOnDriver("CREATE TABLE " + tblName + "(a INT, b STRING) " +
       " PARTITIONED BY(bkt INT)" +
       " CLUSTERED BY(a) INTO 4 BUCKETS" + //currently ACID requires table to 
be bucketed
       " STORED AS ORC  TBLPROPERTIES ('transactional'='true')", driver);
-    executeStatementOnDriver("CREATE EXTERNAL TABLE " + tblNameStg + "(a INT, 
b STRING)" +
-      " ROW FORMAT DELIMITED FIELDS TERMINATED BY '\\t' LINES TERMINATED BY 
'\\n'" +
-      " STORED AS TEXTFILE" +
-      " LOCATION '" + stagingFolder.newFolder().toURI().getPath() + "'", 
driver);
-
-    executeStatementOnDriver("load data local inpath '" + BASIC_FILE_NAME +
-      "' overwrite into table " + tblNameStg, driver);
-    execSelectAndDumpData("select * from " + tblNameStg, driver, "Dumping data 
for " +
-      tblNameStg + " after load:");
-    executeStatementOnDriver("FROM " + tblNameStg +
-      " INSERT INTO TABLE " + tblName + " PARTITION(bkt=0) " +
-      "SELECT a, b where a < 2", driver);
-    executeStatementOnDriver("FROM " + tblNameStg +
-      " INSERT INTO TABLE " + tblName + " PARTITION(bkt=1) " +
-      "SELECT a, b where a >= 2", driver);
+    executeStatementOnDriver("INSERT INTO TABLE " + tblName + " 
PARTITION(bkt=0)" +
+      " values(55, 'London')", driver);
+    executeStatementOnDriver("INSERT INTO TABLE " + tblName + " 
PARTITION(bkt=0)" +
+      " values(56, 'Paris')", driver);
+    executeStatementOnDriver("INSERT INTO TABLE " + tblName + " 
PARTITION(bkt=1)" +
+      " values(57, 'Budapest')", driver);
+    executeStatementOnDriver("INSERT INTO TABLE " + tblName + " 
PARTITION(bkt=1)" +
+            " values(58, 'Milano')", driver);
     execSelectAndDumpData("select * from " + tblName, driver, "Dumping data 
for " +
       tblName + " after load:");
 
     TxnStore txnHandler = TxnUtils.getTxnStore(conf);
-    CompactionInfo ci = new CompactionInfo("default", tblName, "bkt=0", 
CompactionType.MAJOR);
-    Table table = msClient.getTable("default", tblName);
-    LOG.debug("List of stats columns before analyze Part1: " + 
txnHandler.findColumnsWithStats(ci));
-    Worker.StatsUpdater su = Worker.StatsUpdater.init(ci, colNames, conf,
-      System.getProperty("user.name"), 
CompactorUtil.getCompactorJobQueueName(conf, ci, table));
-    su.gatherStats();//compute stats before compaction
-    LOG.debug("List of stats columns after analyze Part1: " + 
txnHandler.findColumnsWithStats(ci));
-
-    CompactionInfo ciPart2 = new CompactionInfo("default", tblName, "bkt=1", 
CompactionType.MAJOR);
-    LOG.debug("List of stats columns before analyze Part2: " + 
txnHandler.findColumnsWithStats(ci));
-    su = Worker.StatsUpdater.init(ciPart2, colNames, conf, 
System.getProperty("user.name"),
-        CompactorUtil.getCompactorJobQueueName(conf, ciPart2, table));
-    su.gatherStats();//compute stats before compaction
-    LOG.debug("List of stats columns after analyze Part2: " + 
txnHandler.findColumnsWithStats(ci));
-
-    //now make sure we get the stats we expect for partition we are going to 
add data to later
-    Map<String, List<ColumnStatisticsObj>> stats = 
msClient.getPartitionColumnStatistics(ci.dbname,
-      ci.tableName, Arrays.asList(ci.partName), colNames, 
Constants.HIVE_ENGINE);
-    List<ColumnStatisticsObj> colStats = stats.get(ci.partName);
-    assertNotNull("No stats found for partition " + ci.partName, colStats);
-    Assert.assertEquals("Expected column 'a' at index 0", "a", 
colStats.get(0).getColName());
-    Assert.assertEquals("Expected column 'b' at index 1", "b", 
colStats.get(1).getColName());
-    LongColumnStatsData colAStats = 
colStats.get(0).getStatsData().getLongStats();
-    Assert.assertEquals("lowValue a", 1, colAStats.getLowValue());
-    Assert.assertEquals("highValue a", 1, colAStats.getHighValue());
-    Assert.assertEquals("numNulls a", 0, colAStats.getNumNulls());
-    Assert.assertEquals("numNdv a", 1, colAStats.getNumDVs());
-    StringColumnStatsData colBStats = 
colStats.get(1).getStatsData().getStringStats();
-    Assert.assertEquals("maxColLen b", 3, colBStats.getMaxColLen());
-    Assert.assertEquals("avgColLen b", 3.0, colBStats.getAvgColLen(), 0.01);
-    Assert.assertEquals("numNulls b", 0, colBStats.getNumNulls());
-    Assert.assertEquals("nunDVs", 3, colBStats.getNumDVs());
-
-    //now save stats for partition we won't modify
-    stats = msClient.getPartitionColumnStatistics(ciPart2.dbname,
-      ciPart2.tableName, Arrays.asList(ciPart2.partName), colNames, 
Constants.HIVE_ENGINE);
-    colStats = stats.get(ciPart2.partName);
-    LongColumnStatsData colAStatsPart2 = 
colStats.get(0).getStatsData().getLongStats();
-    StringColumnStatsData colBStatsPart2 = 
colStats.get(1).getStatsData().getStringStats();
+    Table table = msClient.getTable(dbName, tblName);
+
+    //compute stats before compaction
+    CompactionInfo ci = new CompactionInfo(dbName, tblName, "bkt=0", 
CompactionType.MAJOR);
+    Worker.StatsUpdater.gatherStats(ci, conf,
+            System.getProperty("user.name"), 
CompactorUtil.getCompactorJobQueueName(conf, ci, table));
+    ci = new CompactionInfo(dbName, tblName, "bkt=1", CompactionType.MAJOR);
+    Worker.StatsUpdater.gatherStats(ci, conf,
+            System.getProperty("user.name"), 
CompactorUtil.getCompactorJobQueueName(conf, ci, table));
+
+    //Check basic stats are collected
+    org.apache.hadoop.hive.ql.metadata.Table hiveTable = 
Hive.get().getTable(tblName);
+    Map<String, String> parameters = 
Hive.get().getPartitions(hiveTable).get(0).getParameters();

Review comment:
       should we check both partitions as you are loading data into both?




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]



---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[GitHub] [hive] deniskuzZ commented on a change in pull request #2879: [WIP]HIVE-25801: Custom queue setting is not honoured by Query based compaction StatsUpdater

Reply via email to