This is an automated email from the ASF dual-hosted git repository.
ddanielr pushed a commit to branch 2.1
in repository https://gitbox.apache.org/repos/asf/accumulo.git
The following commit(s) were added to refs/heads/2.1 by this push:
new 92331ea113 Throw error when non-standard chars exist (#4348)
92331ea113 is described below
commit 92331ea113b12c0a833a0a5ea714294832c6d822
Author: Daniel Roberts <[email protected]>
AuthorDate: Tue Mar 12 11:12:53 2024 -0400
Throw error when non-standard chars exist (#4348)
* Throw error when non-standard chars exist
Adds an explicit error message when base64 coding should have been used.
Removes specical handling of `\` character and replaces it with an
allowed character set.
---------
Co-authored-by: Keith Turner <[email protected]>
---
.../accumulo/core/file/rfile/GenerateSplits.java | 32 +++++++++--
.../core/file/rfile/GenerateSplitsTest.java | 64 +++++++++++++++++++---
2 files changed, 82 insertions(+), 14 deletions(-)
diff --git
a/core/src/main/java/org/apache/accumulo/core/file/rfile/GenerateSplits.java
b/core/src/main/java/org/apache/accumulo/core/file/rfile/GenerateSplits.java
index 1812358a59..865210a970 100644
--- a/core/src/main/java/org/apache/accumulo/core/file/rfile/GenerateSplits.java
+++ b/core/src/main/java/org/apache/accumulo/core/file/rfile/GenerateSplits.java
@@ -30,6 +30,7 @@ import java.util.ArrayList;
import java.util.Arrays;
import java.util.Base64;
import java.util.Collections;
+import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
@@ -72,6 +73,10 @@ import edu.umd.cs.findbugs.annotations.SuppressFBWarnings;
public class GenerateSplits implements KeywordExecutable {
private static final Logger log =
LoggerFactory.getLogger(GenerateSplits.class);
+ private static final Set<Character> allowedChars = new HashSet<>();
+
+ private static final String encodeFlag = "-b64";
+
static class Opts extends ConfigOpts {
@Parameter(names = {"-n", "--num"},
description = "The number of split points to generate. Can be used to
create n+1 tablets. Cannot use with the split size option.")
@@ -81,7 +86,8 @@ public class GenerateSplits implements KeywordExecutable {
description = "The minimum split size in uncompressed bytes. Cannot
use with num splits option.")
public long splitSize = 0;
- @Parameter(names = {"-b64", "--base64encoded"}, description = "Base 64
encode the split points")
+ @Parameter(names = {encodeFlag, "--base64encoded"},
+ description = "Base 64 encode the split points")
public boolean base64encode = false;
@Parameter(names = {"-sf", "--splits-file"}, description = "Output the
splits to a file")
@@ -89,6 +95,7 @@ public class GenerateSplits implements KeywordExecutable {
@Parameter(description = "<file|directory>[ <file|directory>...] -n <num>
| -ss <split_size>")
public List<String> files = new ArrayList<>();
+
}
@Override
@@ -145,6 +152,20 @@ public class GenerateSplits implements KeywordExecutable {
log.trace("Found the following files: {}", filePaths);
}
+ if (!encode) {
+ // Generate the allowed Character set
+ for (int i = 0; i < 10; i++) {
+ // 0-9
+ allowedChars.add((char) (i + 48));
+ }
+ for (int i = 0; i < 26; i++) {
+ // Uppercase A-Z
+ allowedChars.add((char) (i + 65));
+ // Lowercase a-z
+ allowedChars.add((char) (i + 97));
+ }
+ }
+
// if no size specified look at indexed keys first
if (opts.splitSize == 0) {
splits = getIndexKeys(siteConf, hadoopConf, fs, filePaths,
requestedNumSplits, encode,
@@ -256,16 +277,15 @@ public class GenerateSplits implements KeywordExecutable {
if (encode) {
return Base64.getEncoder().encodeToString(bytes);
} else {
- // drop non printable characters
StringBuilder sb = new StringBuilder();
for (byte aByte : bytes) {
int c = 0xff & aByte;
- if (c == '\\') {
- sb.append("\\\\");
- } else if (c >= 32 && c <= 126) {
+ if (allowedChars.contains((char) c)) {
sb.append((char) c);
} else {
- log.debug("Dropping non printable char: \\x{}",
Integer.toHexString(c));
+ // Fail if non-printable characters are detected.
+ throw new UnsupportedOperationException("Non printable char: \\x" +
Integer.toHexString(c)
+ + " detected. Must use Base64 encoded output. The behavior
around non printable chars changed in 2.1.3 to throw an error, the previous
behavior was likely to cause bugs.");
}
}
return sb.toString();
diff --git
a/core/src/test/java/org/apache/accumulo/core/file/rfile/GenerateSplitsTest.java
b/core/src/test/java/org/apache/accumulo/core/file/rfile/GenerateSplitsTest.java
index d2b7a8577e..b0111bbe2a 100644
---
a/core/src/test/java/org/apache/accumulo/core/file/rfile/GenerateSplitsTest.java
+++
b/core/src/test/java/org/apache/accumulo/core/file/rfile/GenerateSplitsTest.java
@@ -33,10 +33,13 @@ import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
+import java.util.Arrays;
+import java.util.Base64;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.TreeSet;
+import java.util.stream.Collectors;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
@@ -91,13 +94,13 @@ public class GenerateSplitsTest {
List<String> args = List.of(rfilePath, "--num", "2", "-sf",
splitsFilePath);
log.info("Invoking GenerateSplits with {}", args);
GenerateSplits.main(args.toArray(new String[0]));
- verifySplitsFile("r3", "r6");
+ verifySplitsFile(false, "r6", "r3");
// test more splits requested than indices
args = List.of(rfilePath, "--num", "4", "-sf", splitsFilePath);
log.info("Invoking GenerateSplits with {}", args);
GenerateSplits.main(args.toArray(new String[0]));
- verifySplitsFile("r2", "r3", "r4", "r5");
+ verifySplitsFile(false, "r2", "r3", "r4", "r5");
}
@Test
@@ -105,15 +108,25 @@ public class GenerateSplitsTest {
List<String> args = List.of(rfilePath, "-ss", "21", "-sf", splitsFilePath);
log.info("Invoking GenerateSplits with {}", args);
GenerateSplits.main(args.toArray(new String[0]));
- verifySplitsFile("r2", "r4", "r6");
+ verifySplitsFile(false, "r2", "r4", "r6");
}
- private void verifySplitsFile(String... splits) throws IOException {
- String splitsFile = Files.readString(Paths.get(splitsFilePath));
- assertEquals(splits.length, splitsFile.split("\n").length);
- for (String s : splits) {
- assertTrue(splitsFile.contains(s), "Did not find " + s + " in: " +
splitsFile);
+ private void verifySplitsFile(boolean encoded, String... splits) throws
IOException {
+ String[] gSplits = Files.readString(Paths.get(splitsFilePath)).split("\n");
+ assertEquals(splits.length, gSplits.length);
+ TreeSet<String> expectedSplits =
+ Arrays.stream(splits).collect(Collectors.toCollection(TreeSet::new));
+ TreeSet<String> generatedSplits = Arrays.stream(gSplits).map(s ->
decode(encoded, s))
+ .map(String::new).collect(Collectors.toCollection(TreeSet::new));
+ assertEquals(expectedSplits, generatedSplits,
+ "Failed to find expected splits in " + generatedSplits);
+ }
+
+ private String decode(boolean decode, String string) {
+ if (decode) {
+ return new String(Base64.getDecoder().decode(string));
}
+ return string;
}
@Test
@@ -156,6 +169,41 @@ public class GenerateSplitsTest {
assertEquals(Set.of("001", "002", "003", "004", "005", "006", "007",
"008", "009"), desired);
}
+ @Test
+ public void testNullValues() throws Exception {
+ trf.openWriter(false);
+ trf.writer.startNewLocalityGroup("lg1", newColFamByteSequence("cf1",
"cf2"));
+ trf.writer.append(newKey("r1\0a", "cf1", "cq1", "L1", 55),
newValue("foo1"));
+ trf.writer.append(newKey("r2\0b", "cf2", "cq1", "L1", 55),
newValue("foo2"));
+ trf.writer.append(newKey("r3\0c", "cf2", "cq1", "L1", 55),
newValue("foo3"));
+ trf.writer.startNewLocalityGroup("lg2", newColFamByteSequence("cf3",
"cf4"));
+ trf.writer.append(newKey("r4\0d", "cf3", "cq1", "L1", 55),
newValue("foo4"));
+ trf.writer.append(newKey("r5\0e", "cf4", "cq1", "L1", 55),
newValue("foo5"));
+ trf.writer.append(newKey("r6\0f", "cf4", "cq1", "L1", 55),
newValue("foo6"));
+ trf.closeWriter();
+
+ File file = new File(tempDir, "testGenerateSplitsWithNulls.rf");
+ assertTrue(file.createNewFile(), "Failed to create file: " + file);
+ try (var fileOutputStream = new FileOutputStream(file)) {
+ fileOutputStream.write(trf.baos.toByteArray());
+ }
+ rfilePath = "file:" + file.getAbsolutePath();
+ log.info("Wrote to file {}", rfilePath);
+
+ File splitsFile = new File(tempDir, "testSplitsFileWithNulls");
+ assertTrue(splitsFile.createNewFile(), "Failed to create file: " +
splitsFile);
+ splitsFilePath = splitsFile.getAbsolutePath();
+
+ List<String> finalArgs = List.of(rfilePath, "--num", "2", "-sf",
splitsFilePath);
+ assertThrows(UnsupportedOperationException.class,
+ () -> GenerateSplits.main(finalArgs.toArray(new String[0])));
+
+ List<String> args = List.of(rfilePath, "--num", "2", "-sf",
splitsFilePath, "-b64");
+ GenerateSplits.main(args.toArray(new String[0]));
+ // Validate that base64 split points are working
+ verifySplitsFile(true, "r6\0f", "r3\0c");
+ }
+
/**
* Create the requested number of splits. Works up to 3 digits or max 999.
*/