[
https://issues.apache.org/jira/browse/HIVE-25441?focusedWorklogId=636952&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-636952
]
ASF GitHub Bot logged work on HIVE-25441:
-----------------------------------------
Author: ASF GitHub Bot
Created on: 11/Aug/21 15:50
Start Date: 11/Aug/21 15:50
Worklog Time Spent: 10m
Work Description: deniskuzZ commented on a change in pull request #2579:
URL: https://github.com/apache/hive/pull/2579#discussion_r686960430
##########
File path:
itests/hive-unit/src/test/java/org/apache/hadoop/hive/ql/txn/compactor/TestCompactor.java
##########
@@ -781,6 +782,67 @@ public void
autoCompactOnStreamingIngestWithDynamicPartition() throws Exception
}
}
+ @Test
+ public void testNoDataLossWhenMaxNumDeltaIsUsed() throws Exception {
+ String dbName = "default";
+ String tblName = "cws";
+ executeStatementOnDriver("drop table if exists " + tblName, driver);
+
+ executeStatementOnDriver("CREATE TABLE " + tblName + "(a INT, b STRING) " +
+ " STORED AS ORC TBLPROPERTIES ('transactional'='true')", driver);
+ executeStatementOnDriver("insert into " + tblName + " values (1, 'a')",
driver);
+ executeStatementOnDriver("insert into " + tblName + " values (3, 'b')",
driver);
+
+ runMajorCompaction(dbName, tblName);
+ runCleaner(conf);
+
+ for (int i = 0; i < 3; i++) {
+ executeStatementOnDriver("MERGE INTO " + tblName + " AS T USING (" +
+ "select * from " + tblName + " union all select a+1, b from " +
tblName + ") AS S " +
+ "ON T.a=s.a " +
+ "WHEN MATCHED THEN DELETE " +
+ "WHEN not MATCHED THEN INSERT values (s.a, s.b)", driver);
+ }
+
+ driver.run("select a from " + tblName);
+ List<String> res = new ArrayList<>();
+ driver.getFetchTask().fetch(res);
+ Assert.assertEquals(res, Arrays.asList("4", "6"));
+
+ conf.setIntVar(HiveConf.ConfVars.COMPACTOR_MAX_NUM_DELTA, 5);
+ runMajorCompaction(dbName, tblName);
+
+ List<String> matchesNotFound = new ArrayList<>(5);
+ matchesNotFound.add(AcidUtils.deleteDeltaSubdir(3, 4) +
VISIBILITY_PATTERN);
+ matchesNotFound.add(AcidUtils.deltaSubdir(3, 4) + VISIBILITY_PATTERN);
+ matchesNotFound.add(AcidUtils.deleteDeltaSubdir(5, 5, 0));
+ matchesNotFound.add(AcidUtils.deltaSubdir(5, 5, 1));
+ matchesNotFound.add(AcidUtils.baseDir(5) + VISIBILITY_PATTERN);
+
+ IMetaStoreClient msClient = new HiveMetaStoreClient(conf);
+ Table table = msClient.getTable(dbName, tblName);
+ msClient.close();
+
+ FileSystem fs = FileSystem.get(conf);
+ FileStatus[] stat = fs.listStatus(new Path(table.getSd().getLocation()));
+
+ for (FileStatus f : stat) {
+ for (int j = 0; j < matchesNotFound.size(); j++) {
+ if (f.getPath().getName().matches(matchesNotFound.get(j))) {
+ matchesNotFound.remove(j);
+ break;
+ }
+ }
+ }
+ Assert.assertEquals("Matches Not Found: " + matchesNotFound.toArray(), 0,
matchesNotFound.size());
Review comment:
fixed
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
Issue Time Tracking
-------------------
Worklog Id: (was: 636952)
Time Spent: 50m (was: 40m)
> Incorrect deltas split for sub-compactions when using
> `hive.compactor.max.num.delta`
> ------------------------------------------------------------------------------------
>
> Key: HIVE-25441
> URL: https://issues.apache.org/jira/browse/HIVE-25441
> Project: Hive
> Issue Type: Task
> Reporter: Denys Kuzmenko
> Priority: Major
> Labels: pull-request-available
> Time Spent: 50m
> Remaining Estimate: 0h
>
> {code}
> #Repro steps:
> #1./ set hive.compactor.max.num.delta to 5 on HMS
> #2./ Set up the table
> set hive.merge.cardinality.check=false;
> create table test (k int);
> ALTER TABLE test SET TBLPROPERTIES ('NO_AUTO_COMPACTION'='true');
> insert into test values (1);
> alter table test compact 'major' and wait;
> dfs -ls '/warehouse/tablespace/managed/hive/test';
> # drwxrwx---+ - hive hive 0 2021-08-09 12:26
> /warehouse/tablespace/managed/hive/test/base_0000008_v0000416
> select * from test;
> # k=1
> #run 3 times so there are enough delta dirs, i.e. 6 (each run should just increase k by 1)
> #basically just removes the row and adds a new row with k+1 value
> MERGE INTO test AS T USING (select * from test union all select k+1 from
> test) AS S
> ON T.k=s.k
> WHEN MATCHED THEN DELETE
> WHEN not MATCHED THEN INSERT values (s.k);
> select * from test;
> #k=4
> dfs -ls '/warehouse/tablespace/managed/hive/test';
> #drwxrwx---+ - hive hive 0 2021-08-09 12:26
> /warehouse/tablespace/managed/hive/test/base_0000008_v0000416
> #drwxrwx---+ - hive hive 0 2021-08-09 12:28
> /warehouse/tablespace/managed/hive/test/delete_delta_0000009_0000009_0001
> #drwxrwx---+ - hive hive 0 2021-08-09 12:29
> /warehouse/tablespace/managed/hive/test/delete_delta_0000010_0000010_0001
> #drwxrwx---+ - hive hive 0 2021-08-09 12:29
> /warehouse/tablespace/managed/hive/test/delete_delta_0000011_0000011_0001
> #drwxrwx---+ - hive hive 0 2021-08-09 12:28
> /warehouse/tablespace/managed/hive/test/delta_0000009_0000009_0003
> #drwxrwx---+ - hive hive 0 2021-08-09 12:29
> /warehouse/tablespace/managed/hive/test/delta_0000010_0000010_0003
> #drwxrwx---+ - hive hive 0 2021-08-09 12:29
> /warehouse/tablespace/managed/hive/test/delta_0000011_0000011_0003
> alter table test compact 'major' and wait;
> select * from test;
> #result is empty
> dfs -ls '/warehouse/tablespace/managed/hive/test';
> #2 drwxrwx---+ - hive hive 0 2021-08-09 12:31
> /warehouse/tablespace/managed/hive/test/base_0000011_v0000428
> {code}
> Some logs from the above example:
> {code}
> 2021-08-09 12:30:37,532 WARN
> org.apache.hadoop.hive.ql.txn.compactor.CompactorMR:
> [nightly-7x-us-2-2.nightly-7x-us-2.root.hwx.site-49_executor]: 6 delta files
> found for default.test located at
> hdfs://nightly-7x-us-2-2.nightly-7x-us-2.root.hwx.site:8020/warehouse/tablespace/managed/hive/test!
> This is likely a sign of misconfiguration, especially if this message
> repeats. Check that compaction is running properly. Check for any
> runaway/mis-configured process writing to ACID tables, especially using
> Streaming Ingest API.
> 2021-08-09 12:30:37,533 INFO
> org.apache.hadoop.hive.ql.txn.compactor.CompactorMR:
> [nightly-7x-us-2-2.nightly-7x-us-2.root.hwx.site-49_executor]: Submitting
> MINOR compaction job
> 'nightly-7x-us-2-2.nightly-7x-us-2.root.hwx.site-49-compactor-default.test_0'
> to default queue. (current delta dirs count=5, obsolete delta dirs count=-1.
> TxnIdRange[9,11]
> 2021-08-09 12:30:38,003 INFO
> org.apache.hadoop.hive.ql.txn.compactor.CompactorMR:
> [nightly-7x-us-2-2.nightly-7x-us-2.root.hwx.site-49_executor]: Submitted
> compaction job
> 'nightly-7x-us-2-2.nightly-7x-us-2.root.hwx.site-49-compactor-default.test_0'
> with jobID=job_1628497133224_0051 compaction ID=23
> #From app logs of the minor compaction, note that delta_0000011_0000011_0003
> is missing from the list
> 2021-08-09 12:30:47,399 INFO [main] org.apache.hadoop.mapred.MapTask:
> Processing split: CompactorInputSplit{base: null, bucket: 0, length: 3231,
> deltas: [delete_delta_0000009_0000009_0001, delta_0000009_0000009_0003,
> delete_delta_0000010_0000010_0001, delta_0000010_0000010_0003,
> delete_delta_0000011_0000011_0001]}
> 2021-08-09 12:30:53,061 INFO
> org.apache.hadoop.hive.ql.txn.compactor.CompactorMR:
> [nightly-7x-us-2-2.nightly-7x-us-2.root.hwx.site-49_executor]: Submitting
> MAJOR compaction job
> 'nightly-7x-us-2-2.nightly-7x-us-2.root.hwx.site-49-compactor-default.test'
> to default queue. (current delta dirs count=2, obsolete delta dirs count=6.
> TxnIdRange[9,11]
> 2021-08-09 12:30:53,501 INFO
> org.apache.hadoop.hive.ql.txn.compactor.CompactorMR:
> [nightly-7x-us-2-2.nightly-7x-us-2.root.hwx.site-49_executor]: Submitted
> compaction job
> 'nightly-7x-us-2-2.nightly-7x-us-2.root.hwx.site-49-compactor-default.test'
> with jobID=job_1628497133224_0052 compaction ID=23
> 2021-08-09 12:31:03,493 INFO [main] org.apache.hadoop.mapred.MapTask:
> Processing split: CompactorInputSplit{base:
> hdfs://nightly-7x-us-2-2.nightly-7x-us-2.root.hwx.site:8020/warehouse/tablespace/managed/hive/test/base_0000008_v0000416,
> bucket: 0, length: 1697, deltas: [delete_delta_0000009_0000011_v0000428,
> delta_0000009_0000011_v0000428]}
> {code}
--
This message was sent by Atlassian Jira
(v8.3.4#803005)