[
https://issues.apache.org/jira/browse/HIVE-25801?focusedWorklogId=706875&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-706875
]
ASF GitHub Bot logged work on HIVE-25801:
-----------------------------------------
Author: ASF GitHub Bot
Created on: 11/Jan/22 14:17
Start Date: 11/Jan/22 14:17
Worklog Time Spent: 10m
Work Description: deniskuzZ commented on a change in pull request #2879:
URL: https://github.com/apache/hive/pull/2879#discussion_r782189125
##########
File path:
itests/hive-unit/src/test/java/org/apache/hadoop/hive/ql/txn/compactor/TestCompactor.java
##########
@@ -335,153 +330,144 @@ public void
schemaEvolutionAddColDynamicPartitioningUpdate() throws Exception {
}
/**
- * After each major compaction, stats need to be updated on each column of
the
- * table/partition which previously had stats.
- * 1. create a bucketed ORC backed table (Orc is currently required by ACID)
- * 2. populate 2 partitions with data
+ * After each major compaction, stats need to be updated on the table
+ * 1. create a partitioned ORC backed table (Orc is currently required by
ACID)
+ * 2. populate with data
* 3. compute stats
- * 4. insert some data into the table using StreamingAPI
- * 5. Trigger major compaction (which should update stats)
- * 6. check that stats have been updated
+ * 4. Trigger major compaction on one of the partitions (which should update
stats)
+ * 5. check that stats have been updated for that partition only
*
* @throws Exception todo:
- * 2. add non-partitioned test
* 4. add a test with sorted table?
*/
@Test
public void testStatsAfterCompactionPartTbl() throws Exception {
//as of (8/27/2014) Hive 0.14, ACID/Orc requires HiveInputFormat
+ String dbName = "default";
String tblName = "compaction_test";
- String tblNameStg = tblName + "_stg";
- List<String> colNames = Arrays.asList("a", "b");
executeStatementOnDriver("drop table if exists " + tblName, driver);
- executeStatementOnDriver("drop table if exists " + tblNameStg, driver);
executeStatementOnDriver("CREATE TABLE " + tblName + "(a INT, b STRING) " +
" PARTITIONED BY(bkt INT)" +
" CLUSTERED BY(a) INTO 4 BUCKETS" + //currently ACID requires table to
be bucketed
" STORED AS ORC TBLPROPERTIES ('transactional'='true')", driver);
- executeStatementOnDriver("CREATE EXTERNAL TABLE " + tblNameStg + "(a INT,
b STRING)" +
- " ROW FORMAT DELIMITED FIELDS TERMINATED BY '\\t' LINES TERMINATED BY
'\\n'" +
- " STORED AS TEXTFILE" +
- " LOCATION '" + stagingFolder.newFolder().toURI().getPath() + "'",
driver);
-
- executeStatementOnDriver("load data local inpath '" + BASIC_FILE_NAME +
- "' overwrite into table " + tblNameStg, driver);
- execSelectAndDumpData("select * from " + tblNameStg, driver, "Dumping data
for " +
- tblNameStg + " after load:");
- executeStatementOnDriver("FROM " + tblNameStg +
- " INSERT INTO TABLE " + tblName + " PARTITION(bkt=0) " +
- "SELECT a, b where a < 2", driver);
- executeStatementOnDriver("FROM " + tblNameStg +
- " INSERT INTO TABLE " + tblName + " PARTITION(bkt=1) " +
- "SELECT a, b where a >= 2", driver);
+ executeStatementOnDriver("INSERT INTO TABLE " + tblName + "
PARTITION(bkt=0)" +
+ " values(55, 'London')", driver);
+ executeStatementOnDriver("INSERT INTO TABLE " + tblName + "
PARTITION(bkt=0)" +
+ " values(56, 'Paris')", driver);
+ executeStatementOnDriver("INSERT INTO TABLE " + tblName + "
PARTITION(bkt=1)" +
+ " values(57, 'Budapest')", driver);
+ executeStatementOnDriver("INSERT INTO TABLE " + tblName + "
PARTITION(bkt=1)" +
+ " values(58, 'Milano')", driver);
execSelectAndDumpData("select * from " + tblName, driver, "Dumping data
for " +
tblName + " after load:");
TxnStore txnHandler = TxnUtils.getTxnStore(conf);
- CompactionInfo ci = new CompactionInfo("default", tblName, "bkt=0",
CompactionType.MAJOR);
- Table table = msClient.getTable("default", tblName);
- LOG.debug("List of stats columns before analyze Part1: " +
txnHandler.findColumnsWithStats(ci));
- Worker.StatsUpdater su = Worker.StatsUpdater.init(ci, colNames, conf,
- System.getProperty("user.name"),
CompactorUtil.getCompactorJobQueueName(conf, ci, table));
- su.gatherStats();//compute stats before compaction
- LOG.debug("List of stats columns after analyze Part1: " +
txnHandler.findColumnsWithStats(ci));
-
- CompactionInfo ciPart2 = new CompactionInfo("default", tblName, "bkt=1",
CompactionType.MAJOR);
- LOG.debug("List of stats columns before analyze Part2: " +
txnHandler.findColumnsWithStats(ci));
- su = Worker.StatsUpdater.init(ciPart2, colNames, conf,
System.getProperty("user.name"),
- CompactorUtil.getCompactorJobQueueName(conf, ciPart2, table));
- su.gatherStats();//compute stats before compaction
- LOG.debug("List of stats columns after analyze Part2: " +
txnHandler.findColumnsWithStats(ci));
-
- //now make sure we get the stats we expect for partition we are going to
add data to later
- Map<String, List<ColumnStatisticsObj>> stats =
msClient.getPartitionColumnStatistics(ci.dbname,
- ci.tableName, Arrays.asList(ci.partName), colNames,
Constants.HIVE_ENGINE);
- List<ColumnStatisticsObj> colStats = stats.get(ci.partName);
- assertNotNull("No stats found for partition " + ci.partName, colStats);
- Assert.assertEquals("Expected column 'a' at index 0", "a",
colStats.get(0).getColName());
- Assert.assertEquals("Expected column 'b' at index 1", "b",
colStats.get(1).getColName());
- LongColumnStatsData colAStats =
colStats.get(0).getStatsData().getLongStats();
- Assert.assertEquals("lowValue a", 1, colAStats.getLowValue());
- Assert.assertEquals("highValue a", 1, colAStats.getHighValue());
- Assert.assertEquals("numNulls a", 0, colAStats.getNumNulls());
- Assert.assertEquals("numNdv a", 1, colAStats.getNumDVs());
- StringColumnStatsData colBStats =
colStats.get(1).getStatsData().getStringStats();
- Assert.assertEquals("maxColLen b", 3, colBStats.getMaxColLen());
- Assert.assertEquals("avgColLen b", 3.0, colBStats.getAvgColLen(), 0.01);
- Assert.assertEquals("numNulls b", 0, colBStats.getNumNulls());
- Assert.assertEquals("nunDVs", 3, colBStats.getNumDVs());
-
- //now save stats for partition we won't modify
- stats = msClient.getPartitionColumnStatistics(ciPart2.dbname,
- ciPart2.tableName, Arrays.asList(ciPart2.partName), colNames,
Constants.HIVE_ENGINE);
- colStats = stats.get(ciPart2.partName);
- LongColumnStatsData colAStatsPart2 =
colStats.get(0).getStatsData().getLongStats();
- StringColumnStatsData colBStatsPart2 =
colStats.get(1).getStatsData().getStringStats();
+ Table table = msClient.getTable(dbName, tblName);
+
+ //compute stats before compaction
+ CompactionInfo ci = new CompactionInfo(dbName, tblName, "bkt=0",
CompactionType.MAJOR);
+ Worker.StatsUpdater.gatherStats(ci, conf,
+ System.getProperty("user.name"),
CompactorUtil.getCompactorJobQueueName(conf, ci, table));
+ ci = new CompactionInfo(dbName, tblName, "bkt=1", CompactionType.MAJOR);
+ Worker.StatsUpdater.gatherStats(ci, conf,
+ System.getProperty("user.name"),
CompactorUtil.getCompactorJobQueueName(conf, ci, table));
+
+ //Check basic stats are collected
+ org.apache.hadoop.hive.ql.metadata.Table hiveTable =
Hive.get().getTable(tblName);
+ Map<String, String> parameters =
Hive.get().getPartitions(hiveTable).get(0).getParameters();
Review comment:
Should we check both partitions, since you are loading data into both?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
Issue Time Tracking
-------------------
Worklog Id: (was: 706875)
Time Spent: 2h 40m (was: 2.5h)
> Custom queue settings is not honoured by Query based compaction StatsUpdater
> ----------------------------------------------------------------------------
>
> Key: HIVE-25801
> URL: https://issues.apache.org/jira/browse/HIVE-25801
> Project: Hive
> Issue Type: Bug
> Reporter: László Végh
> Assignee: László Végh
> Priority: Major
> Labels: pull-request-available
> Time Spent: 2h 40m
> Remaining Estimate: 0h
>
> The {{hive.compactor.job.queue}} config limits the resources available for
> compaction, so users can limit the effects of compaction on the cluster.
> However, this setting does not affect stats collection, which uses Driver.
> HIVE-25595 addresses the above issue for MR-based compaction. We need to
> apply the same fix to query-based compaction.
--
This message was sent by Atlassian Jira
(v8.20.1#820001)