Repository: avro Updated Branches: refs/heads/master d9338a4cf -> f8c70a3a9
AVRO-1787 Add support of directories & globs to concat and cat. Contributed by Clément MATHIEU. Signed-off-by: Sean Busbey <[email protected]> Project: http://git-wip-us.apache.org/repos/asf/avro/repo Commit: http://git-wip-us.apache.org/repos/asf/avro/commit/f8c70a3a Tree: http://git-wip-us.apache.org/repos/asf/avro/tree/f8c70a3a Diff: http://git-wip-us.apache.org/repos/asf/avro/diff/f8c70a3a Branch: refs/heads/master Commit: f8c70a3a9fe5a410363544493a03dc2d10340cbd Parents: d9338a4 Author: Clément MATHIEU <[email protected]> Authored: Tue Jan 24 11:09:03 2017 -0600 Committer: Sean Busbey <[email protected]> Committed: Fri Jan 27 09:59:21 2017 -0600 ---------------------------------------------------------------------- .../java/org/apache/avro/tool/ConcatTool.java | 28 ++++++- .../main/java/org/apache/avro/tool/Util.java | 39 ++++++---- .../java/org/apache/avro/tool/TestCatTool.java | 14 ++++ .../org/apache/avro/tool/TestConcatTool.java | 79 ++++++++++++++++++++ 4 files changed, 143 insertions(+), 17 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/avro/blob/f8c70a3a/lang/java/tools/src/main/java/org/apache/avro/tool/ConcatTool.java ---------------------------------------------------------------------- diff --git a/lang/java/tools/src/main/java/org/apache/avro/tool/ConcatTool.java b/lang/java/tools/src/main/java/org/apache/avro/tool/ConcatTool.java index 6026796..e782321 100644 --- a/lang/java/tools/src/main/java/org/apache/avro/tool/ConcatTool.java +++ b/lang/java/tools/src/main/java/org/apache/avro/tool/ConcatTool.java @@ -17,9 +17,11 @@ */ package org.apache.avro.tool; +import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.io.PrintStream; +import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.Map; @@ -33,6 +35,7 @@ import org.apache.avro.file.DataFileWriter; import org.apache.avro.generic.GenericDatumReader; import org.apache.avro.generic.GenericDatumWriter; import org.apache.avro.generic.GenericRecord; +import org.apache.hadoop.fs.Path; /** * Tool to concatenate avro files with the same schema and non-reserved @@ -65,7 +68,7 @@ public class ConcatTool implements Tool { Map<String, byte[]> metadata = new TreeMap<String, byte[]>(); String inputCodec = null; - for (String inFile : args) { + for (String inFile : expandsInputFiles(args)) { InputStream input = Util.fileOrStdin(inFile, in); DataFileStream<GenericRecord> reader = new DataFileStream<GenericRecord>( input, new GenericDatumReader<GenericRecord>()); @@ -124,6 +127,24 @@ public class ConcatTool implements Tool { return 0; } + /** Processes a list of input files to expand directories if needed. */ + private static List<String> expandsInputFiles(List<String> args) throws IOException { + List<String> files = new ArrayList<String>(); + + for (String arg : args) { + if (arg.equals("-")) { + files.add(arg); + } else { + List<Path> paths = Util.getFiles(arg); + for (Path path : paths) { + files.add(path.toString()); + } + } + } + + return files; + } + private void printHelp(PrintStream out) { out.println("concat [input-file...] output-file"); out.println(); @@ -136,8 +157,9 @@ public class ConcatTool implements Tool { out.println(" 3 if the codecs don't match"); out.println("If no input files are given stdin will be used. The tool"); out.println("0 on success. A dash ('-') can be given as an input file"); - out.println("to use stdin, and as an output file to use stdout."); - + out.println("to use stdin, and as an output file to use stdout. If a directory"); + out.println("is given as an input-file all the files within this directory"); + out.println("are used."); } @Override http://git-wip-us.apache.org/repos/asf/avro/blob/f8c70a3a/lang/java/tools/src/main/java/org/apache/avro/tool/Util.java ---------------------------------------------------------------------- diff --git a/lang/java/tools/src/main/java/org/apache/avro/tool/Util.java b/lang/java/tools/src/main/java/org/apache/avro/tool/Util.java index 708bb41..9f1cae1 100644 --- a/lang/java/tools/src/main/java/org/apache/avro/tool/Util.java +++ b/lang/java/tools/src/main/java/org/apache/avro/tool/Util.java @@ -22,6 +22,7 @@ import static org.apache.avro.file.DataFileConstants.DEFLATE_CODEC; import java.io.BufferedInputStream; import java.io.BufferedOutputStream; import java.io.File; +import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; @@ -168,42 +169,52 @@ class Util { } } - /**If pathname is a file, this method returns a list with a single absolute Path to that file, - * if pathname is a directory, this method returns a list of Pathes to all the files within - * this directory. - * Only files inside that directory are included, no subdirectories or files in subdirectories - * will be added. + /** + * If pathname is a file, this method returns a list with a single absolute Path to that file. + * If pathname is a directory, this method returns a list of Pathes to all the files within + * this directory. Only files inside that directory are included, no subdirectories or files + * in subdirectories will be added. + * If pathname is a glob pattern, all files matching the pattern are included. + * * The List is sorted alphabetically. - * @param fileOrDirName filename or directoryname + * @param fileOrDirName filename, directoryname or a glob pattern * @return A Path List * @throws IOException */ - static List<Path> getFiles(String fileOrDirName) - throws IOException { + static List<Path> getFiles(String fileOrDirName) throws IOException { List<Path> pathList = new ArrayList<Path>(); Path path = new Path(fileOrDirName); FileSystem fs = path.getFileSystem(new Configuration()); if (fs.isFile(path)) { pathList.add(path); - } - else if (fs.getFileStatus(path).isDir()) { + } else if (fs.isDirectory(path)) { for (FileStatus status : fs.listStatus(path)) { if(!status.isDir()) { pathList.add(status.getPath()); } } + } else { + FileStatus[] fileStatuses = fs.globStatus(path); + if (fileStatuses != null) { + for (FileStatus status : fileStatuses) { + pathList.add(status.getPath()); + } + } else { + throw new FileNotFoundException(fileOrDirName); + } } Collections.sort(pathList); return pathList; } /** - * This method returns a list which contains a path to every given file - * in the input and a path to every file inside a given directory. + * Concatenate the result of {@link #getFiles(String)} applied to all file or directory names. * The list is sorted alphabetically and contains no subdirectories or files within those. - * @param fileOrDirNames A list of filenames and directorynames - * @return A list of Pathes, one for each file + * + * The list is sorted alphabetically. + * @param fileOrDirNames A list of filenames, directorynames or glob patterns + * @return A list of Paths, one for each file * @throws IOException */ static List<Path> getFiles(List<String> fileOrDirNames) http://git-wip-us.apache.org/repos/asf/avro/blob/f8c70a3a/lang/java/tools/src/test/java/org/apache/avro/tool/TestCatTool.java ---------------------------------------------------------------------- diff --git a/lang/java/tools/src/test/java/org/apache/avro/tool/TestCatTool.java b/lang/java/tools/src/test/java/org/apache/avro/tool/TestCatTool.java index 312bd76..39e45de 100644 --- a/lang/java/tools/src/test/java/org/apache/avro/tool/TestCatTool.java +++ b/lang/java/tools/src/test/java/org/apache/avro/tool/TestCatTool.java @@ -180,6 +180,20 @@ public class TestCatTool { args); assertEquals(0, returnCode); assertEquals(LIMIT_WITHIN_INPUT_BOUNDS, numRowsInFile(output)); + +// glob input + args = asList( + new File(input1.getParentFile(), "/*").getAbsolutePath(), + output.getAbsolutePath(), + "--offset" , String.valueOf(OFFSET), + "--limit" , String.valueOf(LIMIT_WITHIN_INPUT_BOUNDS)); + returnCode = new CatTool().run( + System.in, + System.out, + System.err, + args); + assertEquals(0, returnCode); + assertEquals(LIMIT_WITHIN_INPUT_BOUNDS, numRowsInFile(output)); } http://git-wip-us.apache.org/repos/asf/avro/blob/f8c70a3a/lang/java/tools/src/test/java/org/apache/avro/tool/TestConcatTool.java ---------------------------------------------------------------------- diff --git a/lang/java/tools/src/test/java/org/apache/avro/tool/TestConcatTool.java b/lang/java/tools/src/test/java/org/apache/avro/tool/TestConcatTool.java index af31ccb..6fdbddf 100644 --- a/lang/java/tools/src/test/java/org/apache/avro/tool/TestConcatTool.java +++ b/lang/java/tools/src/test/java/org/apache/avro/tool/TestConcatTool.java @@ -25,7 +25,9 @@ import static org.junit.Assert.assertTrue; import java.io.ByteArrayOutputStream; import java.io.File; import java.io.FileInputStream; +import java.io.FileNotFoundException; import java.io.PrintStream; +import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; import java.util.Iterator; @@ -108,6 +110,83 @@ public class TestConcatTool { } @Test + public void testDirConcat() throws Exception { + Map<String, String> metadata = new HashMap<String, String>(); + + File dir = AvroTestUtil.tempDirectory(getClass(), "input"); + + for (int i = 0; i < 3; i++) { + String filename = "input" + i + ".avro"; + File input = generateData(filename, Type.STRING, metadata, DEFLATE); + boolean ok = input.renameTo(new File(dir, input.getName())); + assertTrue(ok); + } + + File output = AvroTestUtil.tempFile(getClass(), "default-output.avro"); + output.deleteOnExit(); + + List<String> args = asList( + dir.getAbsolutePath(), + output.getAbsolutePath()); + int returnCode = new ConcatTool().run( + System.in, + System.out, + System.err, + args); + assertEquals(0, returnCode); + + assertEquals(ROWS_IN_INPUT_FILES * 3, numRowsInFile(output)); + } + + @Test + public void testGlobPatternConcat() throws Exception { + Map<String, String> metadata = new HashMap<String, String>(); + + File dir = AvroTestUtil.tempDirectory(getClass(), "input"); + + for (int i = 0; i < 3; i++) { + String filename = "input" + i + ".avro"; + File input = generateData(filename, Type.STRING, metadata, DEFLATE); + boolean ok = input.renameTo(new File(dir, input.getName())); + assertTrue(ok); + } + + File output = AvroTestUtil.tempFile(getClass(), "default-output.avro"); + output.deleteOnExit(); + + List<String> args = asList( + new File(dir, "/*").getAbsolutePath(), + output.getAbsolutePath()); + int returnCode = new ConcatTool().run( + System.in, + System.out, + System.err, + args); + assertEquals(0, returnCode); + + assertEquals(ROWS_IN_INPUT_FILES * 3, numRowsInFile(output)); + } + + @Test(expected = FileNotFoundException.class) + public void testFileDoesNotExist() throws Exception { + Map<String, String> metadata = new HashMap<String, String>(); + + File dir = AvroTestUtil.tempDirectory(getClass(), "input"); + + File output = AvroTestUtil.tempFile(getClass(), "default-output.avro"); + output.deleteOnExit(); + + List<String> args = asList( + new File(dir, "/doNotExist").getAbsolutePath(), + output.getAbsolutePath()); + new ConcatTool().run( + System.in, + System.out, + System.err, + args); + } + + @Test public void testConcat() throws Exception { Map<String, String> metadata = new HashMap<String, String>(); metadata.put("myMetaKey", "myMetaValue");
