milleruntime commented on code in PR #2792:
URL: https://github.com/apache/accumulo/pull/2792#discussion_r956139980
##########
server/gc/src/test/java/org/apache/accumulo/gc/GarbageCollectionTest.java:
##########
@@ -778,4 +838,152 @@ public void bulkImportReplicationRecordsPreventDeletion()
throws Exception {
assertEquals(1, gce.deletes.size());
assertEquals("hdfs://foo.com:6000/accumulo/tables/2/t-00002/A000002.rf",
gce.deletes.get(0));
}
+
+ @Test
+ public void testMissingTableIds() throws Exception {
Review Comment:
```suggestion
public void testMissingTableIds() {
```
##########
server/gc/src/test/java/org/apache/accumulo/gc/GarbageCollectionTest.java:
##########
@@ -778,4 +838,152 @@ public void bulkImportReplicationRecordsPreventDeletion()
throws Exception {
assertEquals(1, gce.deletes.size());
assertEquals("hdfs://foo.com:6000/accumulo/tables/2/t-00002/A000002.rf",
gce.deletes.get(0));
}
+
+ @Test
+ public void testMissingTableIds() throws Exception {
+ GarbageCollectionAlgorithm gca = new GarbageCollectionAlgorithm();
+
+ TestGCE gce = new TestGCE();
+
+ gce.candidates.add("hdfs://foo.com:6000/user/foo/tables/a/t-0/F00.rf");
+
+ gce.addFileReference("a", null,
"hdfs://foo.com:6000/user/foo/tables/a/t-0/F00.rf");
+ gce.addFileReference("c", null,
"hdfs://foo.com:6000/user/foo/tables/c/t-0/F00.rf");
+
+ // add 2 more table references that will not be seen by in the scan
+ gce.tableIds.addAll(Arrays.asList(TableId.of("b"), TableId.of("d")));
+
+ String msg = assertThrows(RuntimeException.class, () ->
gca.collect(gce)).getMessage();
+ assertTrue((msg.contains("[b, d]") || msg.contains("[d, b]"))
+ && msg.contains("Saw table IDs in ZK that were not in metadata
table:"), msg);
+ }
+
+ // below are tests for potential failure conditions of the GC process. Some
of these cases were
+ // observed on clusters. Some were hypothesis based on observations. The
result was that
+ // candidate entries were not removed when they should have been and
therefore files were
+ // removed from HDFS that were actually still in use
+
+ private Set<TableId> makeUnmodifiableSet(String... args) {
+ Set<TableId> t = new HashSet<>();
+ for (String arg : args) {
+ t.add(TableId.of(arg));
+ }
+ return Collections.unmodifiableSet(t);
+ }
+
+ @Test
+ public void testNormalGCRun() {
+ // happy path, no tables added or removed during this portion and all the
tables checked
+ Set<TableId> tablesBefore = makeUnmodifiableSet("1", "2", "3");
+ Set<TableId> tablesSeen = makeUnmodifiableSet("2", "1", "3");
+ Set<TableId> tablesAfter = makeUnmodifiableSet("1", "3", "2");
+
+ new GarbageCollectionAlgorithm().ensureAllTablesChecked(tablesBefore,
tablesSeen, tablesAfter);
+ }
+
+ @Test
+ public void testTableAddedInMiddle() {
+ // table was added during this portion and we don't see it, should be fine
+ Set<TableId> tablesBefore = makeUnmodifiableSet("1", "2", "3");
+ Set<TableId> tablesSeen = makeUnmodifiableSet("2", "1", "3");
+ Set<TableId> tablesAfter = makeUnmodifiableSet("1", "3", "2", "4");
+
+ new GarbageCollectionAlgorithm().ensureAllTablesChecked(tablesBefore,
tablesSeen, tablesAfter);
+ }
+
+ @Test
+ public void testTableAddedInMiddleTwo() {
+ // table was added during this portion and we DO see it
+ // Means table was added after candidates were grabbed, so there should be
nothing to remove
+ Set<TableId> tablesBefore = makeUnmodifiableSet("1", "2", "3");
+ Set<TableId> tablesSeen = makeUnmodifiableSet("2", "1", "3", "4");
+ Set<TableId> tablesAfter = makeUnmodifiableSet("1", "3", "2", "4");
+
+ new GarbageCollectionAlgorithm().ensureAllTablesChecked(tablesBefore,
tablesSeen, tablesAfter);
+ }
+
+ @Test
+ public void testTableDeletedInMiddle() {
+ // table was deleted during this portion and we don't see it
+ // this mean any candidates from the deleted table wil stay on the
candidate list
+ // and during the delete step they will try to removed
+ Set<TableId> tablesBefore = makeUnmodifiableSet("1", "2", "3", "4");
+ Set<TableId> tablesSeen = makeUnmodifiableSet("2", "1", "4");
+ Set<TableId> tablesAfter = makeUnmodifiableSet("1", "2", "4");
+
+ new GarbageCollectionAlgorithm().ensureAllTablesChecked(tablesBefore,
tablesSeen, tablesAfter);
+ }
+
+ @Test
+ public void testTableDeletedInMiddleTwo() {
+ // table was deleted during this portion and we DO see it
+ // this mean candidates from the deleted table may get removed from the
candidate list
+ // which should be ok, as the delete table function should be responsible
for removing those
+ Set<TableId> tablesBefore = makeUnmodifiableSet("1", "2", "3", "4");
+ Set<TableId> tablesSeen = makeUnmodifiableSet("2", "1", "4", "3");
+ Set<TableId> tablesAfter = makeUnmodifiableSet("1", "2", "4");
+
+ new GarbageCollectionAlgorithm().ensureAllTablesChecked(tablesBefore,
tablesSeen, tablesAfter);
+ }
+
+ @Test
+ public void testMissEntireTable() {
+ // this test simulates missing an entire table when looking for what files
are in use
+ // if you add custom splits to the metadata at able boundaries, this can
happen with a failed
+ // scan
+ // recall the ~tab:~pr for this first entry of a new table is empty, so
there is now way to
+ // check the prior row. If you split a couple of tables in the metadata
the table boundary
+ // , say table ids 2,3,4, and then miss scanning table 3 but get 4, it is
possible other
+ // consistency checks will miss this
Review Comment:
Each of these comments describing what the tests do could be put in the
javadoc of the test. I think this would help fix the formatting as well.
##########
server/gc/src/main/java/org/apache/accumulo/gc/GarbageCollectionAlgorithm.java:
##########
@@ -216,6 +225,46 @@ private long
removeBlipCandidates(GarbageCollectionEnvironment gce,
return blipCount;
}
+ @VisibleForTesting
+ /**
+ *
+ */
Review Comment:
Should put something here...
```suggestion
/**
* Double check no tables were missed during GC.
*/
```
##########
server/gc/src/test/java/org/apache/accumulo/gc/GarbageCollectionTest.java:
##########
@@ -778,4 +838,152 @@ public void bulkImportReplicationRecordsPreventDeletion()
throws Exception {
assertEquals(1, gce.deletes.size());
assertEquals("hdfs://foo.com:6000/accumulo/tables/2/t-00002/A000002.rf",
gce.deletes.get(0));
}
+
+ @Test
+ public void testMissingTableIds() throws Exception {
+ GarbageCollectionAlgorithm gca = new GarbageCollectionAlgorithm();
+
+ TestGCE gce = new TestGCE();
+
+ gce.candidates.add("hdfs://foo.com:6000/user/foo/tables/a/t-0/F00.rf");
+
+ gce.addFileReference("a", null,
"hdfs://foo.com:6000/user/foo/tables/a/t-0/F00.rf");
+ gce.addFileReference("c", null,
"hdfs://foo.com:6000/user/foo/tables/c/t-0/F00.rf");
+
+ // add 2 more table references that will not be seen by in the scan
Review Comment:
```suggestion
// add 2 more table references that will not be seen by the scan
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]