This is an automated email from the ASF dual-hosted git repository. dkulp pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/avro.git
The following commit(s) were added to refs/heads/master by this push: new 4318701 AVRO-1720: Add tool to count records in avro files 4318701 is described below commit 4318701a20e8b0131374e63ccd263338ea57891c Author: priesnit <vincenz.priesn...@researchgate.net> AuthorDate: Mon May 11 22:02:37 2020 +0200 AVRO-1720: Add tool to count records in avro files --- .../src/main/java/org/apache/avro/tool/Main.java | 15 ++-- .../java/org/apache/avro/tool/RecordCountTool.java | 89 +++++++++++++++++++ .../org/apache/avro/tool/TestRecordCountTool.java | 99 ++++++++++++++++++++++ 3 files changed, 196 insertions(+), 7 deletions(-) diff --git a/lang/java/tools/src/main/java/org/apache/avro/tool/Main.java b/lang/java/tools/src/main/java/org/apache/avro/tool/Main.java index 96ad87b..59ae796 100644 --- a/lang/java/tools/src/main/java/org/apache/avro/tool/Main.java +++ b/lang/java/tools/src/main/java/org/apache/avro/tool/Main.java @@ -36,13 +36,14 @@ public class Main { Main() { tools = new TreeMap<>(); - for (Tool tool : new Tool[] { new CatTool(), new SpecificCompilerTool(), new InduceSchemaTool(), - new JsonToBinaryFragmentTool(), new BinaryFragmentToJsonTool(), new CreateRandomFileTool(), - new DataFileReadTool(), new DataFileWriteTool(), new DataFileGetMetaTool(), new DataFileGetSchemaTool(), - new DataFileRepairTool(), new IdlTool(), new IdlToSchemataTool(), new RecodecTool(), new ConcatTool(), - new RpcReceiveTool(), new RpcSendTool(), new RpcProtocolTool(), new FromTextTool(), new ToTextTool(), - new ToTrevniTool(), new TetherTool(), new TrevniCreateRandomTool(), new TrevniMetadataTool(), - new TrevniToJsonTool(), new SchemaNormalizationTool(), new SchemaFingerprintTool() }) { + for (Tool tool : new Tool[] { new CatTool(), new RecordCountTool(), new SpecificCompilerTool(), + new InduceSchemaTool(), new JsonToBinaryFragmentTool(), new BinaryFragmentToJsonTool(), + new CreateRandomFileTool(), new DataFileReadTool(), new DataFileWriteTool(), new DataFileGetMetaTool(), + new DataFileGetSchemaTool(), new DataFileRepairTool(), new IdlTool(), new IdlToSchemataTool(), + new RecodecTool(), new ConcatTool(), new RpcReceiveTool(), new RpcSendTool(), new RpcProtocolTool(), + new FromTextTool(), new ToTextTool(), new ToTrevniTool(), new TetherTool(), new TrevniCreateRandomTool(), + new TrevniMetadataTool(), new TrevniToJsonTool(), new SchemaNormalizationTool(), + new SchemaFingerprintTool() }) { Tool prev = tools.put(tool.getName(), tool); if (prev != null) { throw new AssertionError("Two tools with identical names: " + tool + ", " + prev); diff --git a/lang/java/tools/src/main/java/org/apache/avro/tool/RecordCountTool.java b/lang/java/tools/src/main/java/org/apache/avro/tool/RecordCountTool.java new file mode 100644 index 0000000..2b6f516 --- /dev/null +++ b/lang/java/tools/src/main/java/org/apache/avro/tool/RecordCountTool.java @@ -0,0 +1,89 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.avro.tool; + +import com.google.common.collect.ImmutableList; +import joptsimple.OptionParser; +import joptsimple.OptionSet; +import org.apache.avro.file.DataFileStream; +import org.apache.avro.generic.GenericDatumReader; +import org.apache.hadoop.fs.Path; + +import java.io.InputStream; +import java.io.PrintStream; +import java.util.List; + +/** Counts the records in avro files or folders */ +public class RecordCountTool implements Tool { + + @Override + public String getName() { + return "count"; + } + + @Override + public String getShortDescription() { + return "Counts the records in avro files or folders"; + } + + @Override + public int run(InputStream stdin, PrintStream out, PrintStream err, List<String> args) throws Exception { + OptionParser optionParser = new OptionParser(); + OptionSet optionSet = optionParser.parse(args.toArray(new String[0])); + List<String> nargs = (List<String>) optionSet.nonOptionArguments(); + + if (nargs.isEmpty()) { + printHelp(err); + err.println(); + optionParser.printHelpOn(err); + return 0; + } + + long count = 0L; + if (ImmutableList.of("-").equals(nargs)) { + count = countRecords(stdin); + } else { + for (Path file : Util.getFiles(nargs)) { + try (final InputStream inStream = Util.openFromFS(file)) { + count += countRecords(inStream); + } + } + } + out.println(count); + out.flush(); + return 0; + } + + private long countRecords(InputStream inStream) throws java.io.IOException { + long count = 0L; + try (DataFileStream<Object> streamReader = new DataFileStream<>(inStream, new GenericDatumReader<>())) { + while (streamReader.hasNext()) { + count = count + streamReader.getBlockCount(); + streamReader.nextBlock(); + } + } + return count; + } + + private void printHelp(PrintStream ps) { + ps.println(getName() + " [input-files...]"); + ps.println(); + ps.println(getShortDescription()); + ps.println("A dash ('-') can be given as an input-file to use stdin"); + } +} diff --git a/lang/java/tools/src/test/java/org/apache/avro/tool/TestRecordCountTool.java b/lang/java/tools/src/test/java/org/apache/avro/tool/TestRecordCountTool.java new file mode 100644 index 0000000..af3b887 --- /dev/null +++ b/lang/java/tools/src/test/java/org/apache/avro/tool/TestRecordCountTool.java @@ -0,0 +1,99 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.avro.tool; + +import org.apache.avro.Schema; +import org.apache.avro.Schema.Type; +import org.apache.avro.file.DataFileWriter; +import org.apache.avro.generic.GenericDatumWriter; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; +import org.junit.rules.TestName; + +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.PrintStream; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +import static org.junit.Assert.assertEquals; + +public class TestRecordCountTool { + + @Rule + public TestName name = new TestName(); + @Rule + public TemporaryFolder temporaryFolder = new TemporaryFolder(); + + private File generateData(int numRecords) throws Exception { + final File tempFile = temporaryFolder.newFile(); + + Schema schema = Schema.create(Type.STRING); + try (DataFileWriter<Object> writer = new DataFileWriter<>(new GenericDatumWriter<>(schema))) { + writer.create(schema, tempFile); + + // ~10 records per block + writer.setSyncInterval(60); + for (int i = 0; i < numRecords; i++) { + writer.append("foobar"); + } + } + return tempFile; + } + + @Test(expected = FileNotFoundException.class) + public void testFileDoesNotExist() throws Exception { + List<String> args = Collections + .singletonList(new File(temporaryFolder.getRoot(), "nonExistingFile").getAbsolutePath()); + int returnCode = new RecordCountTool().run(System.in, System.out, System.err, args); + assertEquals(1, returnCode); + } + + @Test + public void testBasic() throws Exception { + final List<Integer> inputSizes = IntStream.range(0, 20).boxed().collect(Collectors.toList()); + for (Integer inputSize : inputSizes) { + File inputFile = generateData(inputSize); + List<String> args = Collections.singletonList(inputFile.getAbsolutePath()); + final ByteArrayOutputStream out = new ByteArrayOutputStream(); + int returnCode = new RecordCountTool().run(System.in, new PrintStream(out), System.err, args); + + assertEquals(0, returnCode); + assertEquals(inputSize.toString() + System.lineSeparator(), out.toString()); + } + } + + @Test + public void testMultipleFiles() throws Exception { + File f1 = generateData(20); + File f2 = generateData(200); + + List<String> args = Arrays.asList(f1.getAbsolutePath(), f2.getAbsolutePath()); + final ByteArrayOutputStream out = new ByteArrayOutputStream(); + int returnCode = new RecordCountTool().run(System.in, new PrintStream(out), System.err, args); + + assertEquals(0, returnCode); + assertEquals("220" + System.lineSeparator(), out.toString()); + } + +}