Author: gates
Date: Fri Dec 5 21:13:34 2014
New Revision: 1643436

URL: http://svn.apache.org/r1643436
Log:
HIVE-7896 orcfiledump should be able to dump data (Alan Gates, reviewed by Prasanth Jayachandran)
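This patch gives orcfiledump three options: -d/--data dumps the rows as line-delimited JSON, --rowindex takes a comma-separated list of column ids whose row indexes should be printed, and -h/--help prints usage. Typical invocations, assuming the standard hive CLI wrapper for the tool (file paths are illustrative):

    hive --orcfiledump -d /path/to/file.orc
    hive --orcfiledump --rowindex 1,2,3 /path/to/file.orc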
Modified:
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/FileDump.java
    hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestFileDump.java

Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/FileDump.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/FileDump.java?rev=1643436&r1=1643435&r2=1643436&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/FileDump.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/FileDump.java Fri Dec 5 21:13:34 2014
@@ -17,20 +17,34 @@
  */
 package org.apache.hadoop.hive.ql.io.orc;
 
+import java.io.OutputStreamWriter;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.List;
 import java.io.IOException;
 import java.text.DecimalFormat;
-import java.util.List;
+import java.util.Map;
 
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.GnuParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.OptionBuilder;
+import org.apache.commons.cli.Options;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hive.ql.io.orc.OrcProto.RowIndex;
 import org.apache.hadoop.hive.ql.io.orc.OrcProto.RowIndexEntry;
-import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf;
-import org.apache.hadoop.hive.ql.io.sarg.SearchArgument.TruthValue;
+import org.apache.hadoop.hive.serde2.io.ByteWritable;
+import org.apache.hadoop.hive.serde2.io.DoubleWritable;
+import org.apache.hadoop.hive.serde2.io.ShortWritable;
+import org.apache.hadoop.io.BooleanWritable;
+import org.apache.hadoop.io.FloatWritable;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.LongWritable;
+import org.codehaus.jettison.json.JSONException;
+import org.codehaus.jettison.json.JSONWriter;
 
 /**
  * A tool for printing out the file structure of ORC files.
@@ -43,24 +57,40 @@ public final class FileDump {
   public static void main(String[] args) throws Exception {
     Configuration conf = new Configuration();
-    List<String> files = new ArrayList<String>();
     List<Integer> rowIndexCols = null;
-    for (String arg : args) {
-      if (arg.startsWith("--")) {
-        if (arg.startsWith(ROWINDEX_PREFIX)) {
-          String[] colStrs = arg.substring(ROWINDEX_PREFIX.length()).split(",");
-          rowIndexCols = new ArrayList<Integer>(colStrs.length);
-          for (String colStr : colStrs) {
-            rowIndexCols.add(Integer.parseInt(colStr));
-          }
-        } else {
-          System.err.println("Unknown argument " + arg);
-        }
-      } else {
-        files.add(arg);
+    Options opts = createOptions();
+    CommandLine cli = new GnuParser().parse(opts, args);
+
+    if (cli.hasOption('h')) {
+      HelpFormatter formatter = new HelpFormatter();
+      formatter.printHelp("orcfiledump", opts);
+      return;
+    }
+
+    boolean dumpData = cli.hasOption('d');
+    if (cli.hasOption("rowindex")) {
+      String[] colStrs = cli.getOptionValue("rowindex").split(",");
+      rowIndexCols = new ArrayList<Integer>(colStrs.length);
+      for (String colStr : colStrs) {
+        rowIndexCols.add(Integer.parseInt(colStr));
       }
     }
+
+    String[] files = cli.getArgs();
+    if (dumpData) printData(Arrays.asList(files), conf);
+    else printMetaData(Arrays.asList(files), conf, rowIndexCols);
+  }
+
+  private static void printData(List<String> files, Configuration conf) throws IOException,
+      JSONException {
+    for (String file : files) {
+      printJsonData(conf, file);
+    }
+  }
+
+  private static void printMetaData(List<String> files, Configuration conf,
+      List<Integer> rowIndexCols) throws IOException {
     for (String filename : files) {
       System.out.println("Structure for " + filename);
       Path path = new Path(filename);
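The option handling above is stock Apache Commons CLI. A minimal, self-contained sketch of the same parse flow (the class name and printed strings here are illustrative, not part of the patch):

    import java.util.Arrays;

    import org.apache.commons.cli.CommandLine;
    import org.apache.commons.cli.GnuParser;
    import org.apache.commons.cli.OptionBuilder;
    import org.apache.commons.cli.Options;

    public class OrcDumpCliSketch {
      public static void main(String[] args) throws Exception {
        Options opts = new Options();
        // Boolean flag with both short and long forms: -d / --data
        opts.addOption(OptionBuilder.withLongOpt("data")
            .withDescription("Should the data be printed").create('d'));
        // Long-only option that takes a value: --rowindex 1,2,3
        opts.addOption(OptionBuilder.withLongOpt("rowindex")
            .hasArg().withDescription("Dump stats for column number(s)").create());

        // GnuParser accepts both "--rowindex 1,2" and "--rowindex=1,2"
        CommandLine cli = new GnuParser().parse(opts, args);
        if (cli.hasOption('d')) {
          System.out.println("would dump data");
        }
        if (cli.hasOption("rowindex")) {
          System.out.println("row index columns: " + cli.getOptionValue("rowindex"));
        }
        // Anything left over after option parsing is treated as a file name
        System.out.println("files: " + Arrays.toString(cli.getArgs()));
      }
    }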
@@ -181,4 +211,149 @@ public final class FileDump {
     }
     return paddedBytes;
   }
+
+  static Options createOptions() {
+    Options result = new Options();
+
+    // add -d and --data to print the rows
+    result.addOption(OptionBuilder
+        .withLongOpt("data")
+        .withDescription("Should the data be printed")
+        .create('d'));
+
+    result.addOption(OptionBuilder
+        .withLongOpt("help")
+        .withDescription("print help message")
+        .create('h'));
+
+    result.addOption(OptionBuilder
+        .withLongOpt("rowindex")
+        .withArgName("comma separated list of column ids for which row index should be printed")
+        .withDescription("Dump stats for column number(s)")
+        .hasArg()
+        .create());
+
+    return result;
+  }
+
+  private static void printMap(JSONWriter writer,
+      Map<Object, Object> obj,
+      List<OrcProto.Type> types,
+      OrcProto.Type type) throws IOException, JSONException {
+    writer.array();
+    int keyType = type.getSubtypes(0);
+    int valueType = type.getSubtypes(1);
+    for(Map.Entry<Object,Object> item: obj.entrySet()) {
+      writer.object();
+      writer.key("_key");
+      printObject(writer, item.getKey(), types, keyType);
+      writer.key("_value");
+      printObject(writer, item.getValue(), types, valueType);
+      writer.endObject();
+    }
+    writer.endArray();
+  }
+
+  private static void printList(JSONWriter writer,
+      List<Object> obj,
+      List<OrcProto.Type> types,
+      OrcProto.Type type) throws IOException, JSONException {
+    int subtype = type.getSubtypes(0);
+    writer.array();
+    for(Object item: obj) {
+      printObject(writer, item, types, subtype);
+    }
+    writer.endArray();
+  }
+
+  private static void printUnion(JSONWriter writer,
+      OrcUnion obj,
+      List<OrcProto.Type> types,
+      OrcProto.Type type) throws IOException, JSONException {
+    int subtype = type.getSubtypes(obj.getTag());
+    printObject(writer, obj.getObject(), types, subtype);
+  }
+
+  static void printStruct(JSONWriter writer,
+      OrcStruct obj,
+      List<OrcProto.Type> types,
+      OrcProto.Type type) throws IOException, JSONException {
+    writer.object();
+    List<Integer> fieldTypes = type.getSubtypesList();
+    for(int i=0; i < fieldTypes.size(); ++i) {
+      writer.key(type.getFieldNames(i));
+      printObject(writer, obj.getFieldValue(i), types, fieldTypes.get(i));
+    }
+    writer.endObject();
+  }
+
+  static void printObject(JSONWriter writer,
+      Object obj,
+      List<OrcProto.Type> types,
+      int typeId) throws IOException, JSONException {
+    OrcProto.Type type = types.get(typeId);
+    if (obj == null) {
+      writer.value(null);
+    } else {
+      switch (type.getKind()) {
+        case STRUCT:
+          printStruct(writer, (OrcStruct) obj, types, type);
+          break;
+        case UNION:
+          printUnion(writer, (OrcUnion) obj, types, type);
+          break;
+        case LIST:
+          printList(writer, (List<Object>) obj, types, type);
+          break;
+        case MAP:
+          printMap(writer, (Map<Object, Object>) obj, types, type);
+          break;
+        case BYTE:
+          writer.value(((ByteWritable) obj).get());
+          break;
+        case SHORT:
+          writer.value(((ShortWritable) obj).get());
+          break;
+        case INT:
+          writer.value(((IntWritable) obj).get());
+          break;
+        case LONG:
+          writer.value(((LongWritable) obj).get());
+          break;
+        case FLOAT:
+          writer.value(((FloatWritable) obj).get());
+          break;
+        case DOUBLE:
+          writer.value(((DoubleWritable) obj).get());
+          break;
+        case BOOLEAN:
+          writer.value(((BooleanWritable) obj).get());
+          break;
+        default:
+          writer.value(obj.toString());
+          break;
+      }
+    }
+  }
+
+  static void printJsonData(Configuration conf,
+      String filename) throws IOException, JSONException {
+    Path path = new Path(filename);
+    Reader reader = OrcFile.createReader(path.getFileSystem(conf), path);
+    OutputStreamWriter out = new OutputStreamWriter(System.out, "UTF-8");
+    RecordReader rows = reader.rows(null);
+    Object row = null;
+    List<OrcProto.Type> types = reader.getTypes();
+    while (rows.hasNext()) {
+      row = rows.next(row);
+      JSONWriter writer = new JSONWriter(out);
+      printObject(writer, row, types, 0);
+      out.write("\n");
+      out.flush();
+    }
+  }
 }
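printJsonData emits one JSON object per row, one row per line. Two encoding choices above are worth noting: maps are written as arrays of {"_key": ..., "_value": ...} objects, since ORC map keys need not be strings while JSON keys must be, and a union is collapsed to whichever branch is currently active. For a hypothetical row with an int column i, a map m, and a list a, a dumped line would look like:

    {"i":10,"m":[{"_key":"k1","_value":"v1"}],"a":[100,200]}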
Modified: hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestFileDump.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestFileDump.java?rev=1643436&r1=1643435&r2=1643436&view=diff
==============================================================================
--- hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestFileDump.java (original)
+++ hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestFileDump.java Fri Dec 5 21:13:34 2014
@@ -22,19 +22,32 @@ import static org.junit.Assert.assertEqu
 import static org.junit.Assert.assertNull;
 
 import java.io.BufferedReader;
+import java.io.ByteArrayOutputStream;
 import java.io.File;
 import java.io.FileOutputStream;
 import java.io.FileReader;
 import java.io.PrintStream;
+import java.math.BigDecimal;
+import java.sql.Date;
+import java.sql.Timestamp;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
 import java.util.Random;
 
+import junit.framework.Assert;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.common.type.HiveChar;
+import org.apache.hadoop.hive.common.type.HiveDecimal;
+import org.apache.hadoop.hive.common.type.HiveVarchar;
 import org.apache.hadoop.hive.conf.HiveConf;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
 import org.apache.hive.common.util.HiveTestUtils;
 import org.junit.Before;
 import org.junit.Test;
 
@@ -65,6 +78,55 @@ public class TestFileDump {
     }
   }
 
+  static class AllTypesRecord {
+    static class Struct {
+      int i;
+      String s;
+
+      Struct(int i, String s) {
+        this.i = i;
+        this.s = s;
+      }
+    }
+    boolean b;
+    byte bt;
+    short s;
+    int i;
+    long l;
+    float f;
+    double d;
+    HiveDecimal de;
+    Timestamp t;
+    Date dt;
+    String str;
+    HiveChar c;
+    HiveVarchar vc;
+    Map<String, String> m;
+    List<Integer> a;
+    Struct st;
+
+    AllTypesRecord(boolean b, byte bt, short s, int i, long l, float f, double d, HiveDecimal de,
+        Timestamp t, Date dt, String str, HiveChar c, HiveVarchar vc,
+        Map<String, String> m, List<Integer> a, Struct st) {
+      this.b = b;
+      this.bt = bt;
+      this.s = s;
+      this.i = i;
+      this.l = l;
+      this.f = f;
+      this.d = d;
+      this.de = de;
+      this.t = t;
+      this.dt = dt;
+      this.str = str;
+      this.c = c;
+      this.vc = vc;
+      this.m = m;
+      this.a = a;
+      this.st = st;
+    }
+  }
+
   private static void checkOutput(String expected, String actual) throws Exception {
     BufferedReader eStream =
@@ -124,6 +186,72 @@ public class TestFileDump {
     checkOutput(outputFilename, workDir + File.separator + outputFilename);
   }
 
+  @Test
+  public void testDataDump() throws Exception {
+    ObjectInspector inspector;
+    synchronized (TestOrcFile.class) {
+      inspector = ObjectInspectorFactory.getReflectionObjectInspector
+          (AllTypesRecord.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
+    }
+    Writer writer = OrcFile.createWriter(fs, testFilePath, conf, inspector,
+        100000, CompressionKind.NONE, 10000, 1000);
+    Map<String, String> m = new HashMap<String, String>(2);
+    m.put("k1", "v1");
+    writer.addRow(new AllTypesRecord(
+        true,
+        (byte) 10,
+        (short) 100,
+        1000,
+        10000L,
+        4.0f,
+        20.0,
+        HiveDecimal.create(new BigDecimal(4.2222)),
+        new Timestamp(1416967764000L),
+        new Date(1416967764000L),
+        "string",
+        new HiveChar("hello", 5),
+        new HiveVarchar("hello", 10),
+        m,
+        Arrays.asList(100, 200),
+        new AllTypesRecord.Struct(10, "foo")));
+    m.clear();
+    m.put("k3", "v3");
+    writer.addRow(new AllTypesRecord(
+        false,
+        (byte) 20,
+        (short) 200,
+        2000,
+        20000L,
+        8.0f,
+        40.0,
+        HiveDecimal.create(new BigDecimal(2.2222)),
+        new Timestamp(1416967364000L),
+        new Date(1411967764000L),
+        "abcd",
+        new HiveChar("world", 5),
+        new HiveVarchar("world", 10),
+        m,
+        Arrays.asList(200, 300),
+        new AllTypesRecord.Struct(20, "bar")));
+
+    writer.close();
+
+    PrintStream origOut = System.out;
+    String outputFilename = "orc-file-dump.out";
+    ByteArrayOutputStream myOut = new ByteArrayOutputStream();
+
+    // replace stdout and run command
+    System.setOut(new PrintStream(myOut));
+    FileDump.main(new String[]{testFilePath.toString(), "-d"});
+    System.out.flush();
+    System.setOut(origOut);
+
+    String[] lines = myOut.toString().split("\n");
+    // Don't be fooled by the big space in the middle, this line is quite long
+    assertEquals("{\"b\":true,\"bt\":10,\"s\":100,\"i\":1000,\"l\":10000,\"f\":4,\"d\":20,\"de\":\"4.222199999999999953\",\"t\":\"2014-11-25 18:09:24\",\"dt\":\"2014-11-25\",\"str\":\"string\",\"c\":\"hello \",\"vc\":\"hello\",\"m\":[{\"_key\":\"k1\",\"_value\":\"v1\"}],\"a\":[100,200],\"st\":{\"i\":10,\"s\":\"foo\"}}", lines[0]);
\",\"vc\":\"world\",\"m\":[{\"_key\":\"k3\",\"_value\":\"v3\"}],\"a\":[200,300],\"st\":{\"i\":20,\"s\":\"bar\"}}", lines[1]); + + } + // Test that if the fraction of rows that have distinct strings is greater than the configured // threshold dictionary encoding is turned off. If dictionary encoding is turned off the length // of the dictionary stream for the column will be 0 in the ORC file dump.