This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch branch-2.0
in repository https://gitbox.apache.org/repos/asf/orc.git
The following commit(s) were added to refs/heads/branch-2.0 by this push:
new 16ac0e3b5 ORC-1742: Support printing the id, name and type of each column in dump tool
16ac0e3b5 is described below
commit 16ac0e3b574ca0b7453de6c9f096e70dfff04ec3
Author: sychen <[email protected]>
AuthorDate: Thu Jul 11 08:47:13 2024 -0700
ORC-1742: Support printing the id, name and type of each column in dump tool
### What changes were proposed in this pull request?
This PR aims to support printing the id, name and type of each column in the dump tool.
### Why are the changes needed?
When we dump an ORC file with a complex structure, only the column id of each column is printed, so we cannot tell which column name and type a given column id corresponds to.
The JSON format already outputs the id, name and type of each column.
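For illustration, a minimal sketch of driving the dump tool with the new flag (the file path and wrapper class are hypothetical; the call itself mirrors the unit test added in this patch, and the `meta` CLI comment assumes the standard orc-tools uber jar):

```java
import org.apache.orc.tools.FileDump;

public class ColumnTypeDumpExample {
  public static void main(String[] args) throws Exception {
    // Roughly equivalent to: java -jar orc-tools-*.jar meta --column-type /tmp/example.orc
    // Prints the usual text dump plus a "Columns type:" section, e.g.
    //   Column 1: field: a type: boolean
    //   Column 12: field: l type: decimal(20,5)
    FileDump.main(new String[]{"/tmp/example.orc", "--column-type"});
  }
}
```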
### How was this patch tested?
Add UT
### Was this patch authored or co-authored using generative AI tooling?
No
Closes #1974 from cxzl25/ORC-1742.
Authored-by: sychen <[email protected]>
Signed-off-by: Dongjoon Hyun <[email protected]>
(cherry picked from commit 8ca3a23975d9018ea3311ba11431303ae091ec5a)
Signed-off-by: Dongjoon Hyun <[email protected]>
---
.../src/java/org/apache/orc/tools/FileDump.java | 28 ++++-
.../test/org/apache/orc/tools/TestFileDump.java | 70 ++++++++++++
.../test/resources/orc-file-dump-column-type.out | 121 +++++++++++++++++++++
site/_docs/java-tools.md | 3 +
4 files changed, 218 insertions(+), 4 deletions(-)
diff --git a/java/tools/src/java/org/apache/orc/tools/FileDump.java b/java/tools/src/java/org/apache/orc/tools/FileDump.java
index 55016ddcd..790352e66 100644
--- a/java/tools/src/java/org/apache/orc/tools/FileDump.java
+++ b/java/tools/src/java/org/apache/orc/tools/FileDump.java
@@ -134,7 +134,9 @@ public final class FileDump {
boolean prettyPrint = cli.hasOption('p');
JsonFileDump.printJsonMetaData(filesInPath, conf, rowIndexCols, prettyPrint, printTimeZone);
} else {
- printMetaData(filesInPath, conf, rowIndexCols, printTimeZone, recover, backupPath);
+ boolean printColumnType = cli.hasOption("column-type");
+ printMetaData(filesInPath, conf, rowIndexCols, printTimeZone, recover, backupPath,
+ printColumnType);
}
}
}
@@ -268,11 +270,11 @@ public final class FileDump {
private static void printMetaData(List<String> files, Configuration conf,
List<Integer> rowIndexCols, boolean printTimeZone, final boolean recover,
- final String backupPath)
+ final String backupPath, final boolean printColumnType)
throws IOException {
List<String> corruptFiles = new ArrayList<>();
for (String filename : files) {
- printMetaDataImpl(filename, conf, rowIndexCols, printTimeZone, corruptFiles);
+ printMetaDataImpl(filename, conf, rowIndexCols, printTimeZone, corruptFiles, printColumnType);
System.out.println(SEPARATOR);
}
@@ -294,6 +296,15 @@ public final class FileDump {
}
}
+ static void printColumnsType(TypeDescription schema) {
+ int maximumId = schema.getMaximumId();
+ for (int c = schema.getId(); c < maximumId + 1; ++c) {
+ TypeDescription type = schema.findSubtype(c);
+ System.out.println(" Column " + type.getId() + ": field: " + type.getFullFieldName() +
+ " type: " + type.toString());
+ }
+ }
+
static void printTypeAnnotations(TypeDescription type, String prefix) {
List<String> attributes = type.getAttributeNames();
if (attributes.size() > 0) {
@@ -329,7 +340,7 @@ public final class FileDump {
private static void printMetaDataImpl(final String filename,
final Configuration conf, List<Integer> rowIndexCols, final boolean printTimeZone,
- final List<String> corruptFiles) throws IOException {
+ final List<String> corruptFiles, final boolean printColumnType) throws IOException {
Path file = new Path(filename);
Reader reader = getReader(file, conf, corruptFiles);
// if we can create reader then footer is not corrupt and file will readable
@@ -351,6 +362,10 @@ public final class FileDump {
? "Proleptic Gregorian"
: "Julian/Gregorian"));
System.out.println("Type: " + reader.getSchema().toString());
+ if (printColumnType) {
+ System.out.println("Columns type:");
+ printColumnsType(reader.getSchema());
+ }
printTypeAnnotations(reader.getSchema(), "root");
System.out.println("\nStripe Statistics:");
List<StripeStatistics> stripeStats = reader.getStripeStatistics();
@@ -835,6 +850,11 @@ public final class FileDump {
.desc("specify a backup path to store the corrupted files (default:
/tmp)")
.hasArg()
.build());
+
+ result.addOption(Option.builder()
+ .longOpt("column-type")
+ .desc("Print the column id, name and type of each column")
+ .build());
return result;
}
diff --git a/java/tools/src/test/org/apache/orc/tools/TestFileDump.java b/java/tools/src/test/org/apache/orc/tools/TestFileDump.java
index c265a7400..2699abf40 100644
--- a/java/tools/src/test/org/apache/orc/tools/TestFileDump.java
+++ b/java/tools/src/test/org/apache/orc/tools/TestFileDump.java
@@ -22,6 +22,7 @@ import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.DateColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector;
@@ -58,6 +59,7 @@ import java.nio.file.Files;
import java.nio.file.Paths;
import java.sql.Timestamp;
import java.text.SimpleDateFormat;
+import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
@@ -827,6 +829,74 @@ public class TestFileDump {
assertEquals("{\"x\":12.34}", lines[2]);
}
+ @Test
+ public void testDumpColumnType() throws Exception {
+ TypeDescription schema =
+     TypeDescription.fromString("struct<a:boolean,b:tinyint,c:smallint,d:int,e:bigint,"
+         + "f:float,g:double,h:string,i:date,j:timestamp,k:binary,l:decimal(20,5),m:varchar(5),"
+         + "n:char(5)>");
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .fileSystem(fs)
+ .setSchema(schema));
+
+ VectorizedRowBatch batch = schema.createRowBatch();
+ LongColumnVector a = (LongColumnVector) batch.cols[0];
+ LongColumnVector b = (LongColumnVector) batch.cols[1];
+ LongColumnVector c = (LongColumnVector) batch.cols[2];
+ LongColumnVector d = (LongColumnVector) batch.cols[3];
+ LongColumnVector e = (LongColumnVector) batch.cols[4];
+ DoubleColumnVector f = (DoubleColumnVector) batch.cols[5];
+ DoubleColumnVector g = (DoubleColumnVector) batch.cols[6];
+ BytesColumnVector h = (BytesColumnVector) batch.cols[7];
+ DateColumnVector i = (DateColumnVector) batch.cols[8];
+ TimestampColumnVector j = (TimestampColumnVector) batch.cols[9];
+ BytesColumnVector k = (BytesColumnVector) batch.cols[10];
+ DecimalColumnVector l = (DecimalColumnVector) batch.cols[11];
+ BytesColumnVector m = (BytesColumnVector) batch.cols[12];
+ BytesColumnVector n = (BytesColumnVector) batch.cols[13];
+
+ for (int o = 0; o < VectorizedRowBatch.DEFAULT_SIZE * 2; o++) {
+ int row = batch.size++;
+ a.vector[row] = row % 2;
+ b.vector[row] = row % 128;
+ c.vector[row] = row;
+ d.vector[row] = row;
+ e.vector[row] = row * 10000000L;
+ f.vector[row] = row * 1.0f;
+ g.vector[row] = row * 1.0d;
+ byte[] bytes = String.valueOf(row).getBytes(StandardCharsets.UTF_8);
+ h.setRef(row, bytes, 0, bytes.length);
+ i.vector[row] = row;
+ j.time[row] = row * 1000L;
+ j.nanos[row] = row;
+ k.setRef(row, bytes, 0, bytes.length);
+ l.vector[row] = new HiveDecimalWritable(row);
+ m.setRef(row, bytes, 0, bytes.length);
+ bytes = String.valueOf(10000 - row).getBytes(StandardCharsets.UTF_8);
+ n.setRef(row, bytes, 0, bytes.length);
+
+ if (batch.size == batch.getMaxSize()) {
+ writer.addRowBatch(batch);
+ batch.reset();
+ }
+ }
+ writer.close();
+ assertEquals(VectorizedRowBatch.DEFAULT_SIZE * 2, writer.getNumberOfRows());
+
+ PrintStream origOut = System.out;
+ String outputFilename = "orc-file-dump-column-type.out";
+ FileOutputStream myOut = new FileOutputStream(workDir + File.separator + outputFilename);
+
+ // replace stdout and run command
+ System.setOut(new PrintStream(myOut, false, StandardCharsets.UTF_8.toString()));
+ FileDump.main(new String[]{testFilePath.toString(), "--column-type"});
+ System.out.flush();
+ System.setOut(origOut);
+
+ checkOutput(outputFilename, workDir + File.separator + outputFilename);
+ }
+
private static boolean contentEquals(String filePath, String otherFilePath) throws IOException {
try (InputStream is = new BufferedInputStream(new FileInputStream(filePath));
InputStream otherIs = new BufferedInputStream(new FileInputStream(otherFilePath))) {
diff --git a/java/tools/src/test/resources/orc-file-dump-column-type.out b/java/tools/src/test/resources/orc-file-dump-column-type.out
new file mode 100644
index 000000000..73267e728
--- /dev/null
+++ b/java/tools/src/test/resources/orc-file-dump-column-type.out
@@ -0,0 +1,121 @@
+Structure for TestFileDump.testDump.orc
+File Version: 0.12 with ORC_14 by ORC Java 2.1.0-SNAPSHOT
+Rows: 2048
+Compression: ZSTD
+Compression size: 262144
+Calendar: Julian/Gregorian
+Type: struct<a:boolean,b:tinyint,c:smallint,d:int,e:bigint,f:float,g:double,h:string,i:date,j:timestamp,k:binary,l:decimal(20,5),m:varchar(5),n:char(5)>
+Columns type:
+ Column 0: field: 0 type: struct<a:boolean,b:tinyint,c:smallint,d:int,e:bigint,f:float,g:double,h:string,i:date,j:timestamp,k:binary,l:decimal(20,5),m:varchar(5),n:char(5)>
+ Column 1: field: a type: boolean
+ Column 2: field: b type: tinyint
+ Column 3: field: c type: smallint
+ Column 4: field: d type: int
+ Column 5: field: e type: bigint
+ Column 6: field: f type: float
+ Column 7: field: g type: double
+ Column 8: field: h type: string
+ Column 9: field: i type: date
+ Column 10: field: j type: timestamp
+ Column 11: field: k type: binary
+ Column 12: field: l type: decimal(20,5)
+ Column 13: field: m type: varchar(5)
+ Column 14: field: n type: char(5)
+
+Stripe Statistics:
+ Stripe 1:
+ Column 0: count: 2048 hasNull: false
+ Column 1: count: 2048 hasNull: false bytesOnDisk: 7 true: 1024
+ Column 2: count: 2048 hasNull: false bytesOnDisk: 152 min: 0 max: 127 sum: 130048
+ Column 3: count: 2048 hasNull: false bytesOnDisk: 21 min: 0 max: 1023 sum: 1047552
+ Column 4: count: 2048 hasNull: false bytesOnDisk: 21 min: 0 max: 1023 sum: 1047552
+ Column 5: count: 2048 hasNull: false bytesOnDisk: 35 min: 0 max: 10230000000 sum: 10475520000000
+ Column 6: count: 2048 hasNull: false bytesOnDisk: 2361 min: 0.0 max: 1023.0 sum: 1047552.0
+ Column 7: count: 2048 hasNull: false bytesOnDisk: 973 min: 0.0 max: 1023.0 sum: 1047552.0
+ Column 8: count: 2048 hasNull: false bytesOnDisk: 2988 min: 0 max: 999 sum: 5972
+ Column 9: count: 2048 hasNull: false bytesOnDisk: 21 min: Hybrid AD 1970-01-01 max: Hybrid AD 1972-10-20
+ Column 10: count: 2048 hasNull: false bytesOnDisk: 1626 min: 1969-12-31 16:00:00.0 max: 1969-12-31 16:17:03.000001023
+ Column 11: count: 2048 hasNull: false bytesOnDisk: 1404 sum: 5972
+ Column 12: count: 2048 hasNull: false bytesOnDisk: 1666 min: 0 max: 1023 sum: 1047552
+ Column 13: count: 2048 hasNull: false bytesOnDisk: 2988 min: 0 max: 999 sum: 5972
+ Column 14: count: 2048 hasNull: false bytesOnDisk: 1277 min: 10000 max: 9999 sum: 10240
+
+File Statistics:
+ Column 0: count: 2048 hasNull: false
+ Column 1: count: 2048 hasNull: false bytesOnDisk: 7 true: 1024
+ Column 2: count: 2048 hasNull: false bytesOnDisk: 152 min: 0 max: 127 sum: 130048
+ Column 3: count: 2048 hasNull: false bytesOnDisk: 21 min: 0 max: 1023 sum: 1047552
+ Column 4: count: 2048 hasNull: false bytesOnDisk: 21 min: 0 max: 1023 sum: 1047552
+ Column 5: count: 2048 hasNull: false bytesOnDisk: 35 min: 0 max: 10230000000 sum: 10475520000000
+ Column 6: count: 2048 hasNull: false bytesOnDisk: 2361 min: 0.0 max: 1023.0 sum: 1047552.0
+ Column 7: count: 2048 hasNull: false bytesOnDisk: 973 min: 0.0 max: 1023.0 sum: 1047552.0
+ Column 8: count: 2048 hasNull: false bytesOnDisk: 2988 min: 0 max: 999 sum: 5972
+ Column 9: count: 2048 hasNull: false bytesOnDisk: 21 min: Hybrid AD 1970-01-01 max: Hybrid AD 1972-10-20
+ Column 10: count: 2048 hasNull: false bytesOnDisk: 1626 min: 1969-12-31 16:00:00.0 max: 1969-12-31 16:17:03.000001023
+ Column 11: count: 2048 hasNull: false bytesOnDisk: 1404 sum: 5972
+ Column 12: count: 2048 hasNull: false bytesOnDisk: 1666 min: 0 max: 1023 sum: 1047552
+ Column 13: count: 2048 hasNull: false bytesOnDisk: 2988 min: 0 max: 999 sum: 5972
+ Column 14: count: 2048 hasNull: false bytesOnDisk: 1277 min: 10000 max: 9999 sum: 10240
+
+Stripes:
+ Stripe: offset: 3 data: 15540 rows: 2048 tail: 225 index: 464
+ Stream: column 0 section ROW_INDEX start: 3 length 12
+ Stream: column 1 section ROW_INDEX start: 15 length 24
+ Stream: column 2 section ROW_INDEX start: 39 length 28
+ Stream: column 3 section ROW_INDEX start: 67 length 28
+ Stream: column 4 section ROW_INDEX start: 95 length 28
+ Stream: column 5 section ROW_INDEX start: 123 length 35
+ Stream: column 6 section ROW_INDEX start: 158 length 45
+ Stream: column 7 section ROW_INDEX start: 203 length 45
+ Stream: column 8 section ROW_INDEX start: 248 length 30
+ Stream: column 9 section ROW_INDEX start: 278 length 24
+ Stream: column 10 section ROW_INDEX start: 302 length 35
+ Stream: column 11 section ROW_INDEX start: 337 length 24
+ Stream: column 12 section ROW_INDEX start: 361 length 39
+ Stream: column 13 section ROW_INDEX start: 400 length 30
+ Stream: column 14 section ROW_INDEX start: 430 length 37
+ Stream: column 1 section DATA start: 467 length 7
+ Stream: column 2 section DATA start: 474 length 152
+ Stream: column 3 section DATA start: 626 length 21
+ Stream: column 4 section DATA start: 647 length 21
+ Stream: column 5 section DATA start: 668 length 35
+ Stream: column 6 section DATA start: 703 length 2361
+ Stream: column 7 section DATA start: 3064 length 973
+ Stream: column 8 section DATA start: 4037 length 1575
+ Stream: column 8 section LENGTH start: 5612 length 47
+ Stream: column 8 section DICTIONARY_DATA start: 5659 length 1366
+ Stream: column 9 section DATA start: 7025 length 21
+ Stream: column 10 section DATA start: 7046 length 35
+ Stream: column 10 section SECONDARY start: 7081 length 1591
+ Stream: column 11 section DATA start: 8672 length 1368
+ Stream: column 11 section LENGTH start: 10040 length 36
+ Stream: column 12 section DATA start: 10076 length 1647
+ Stream: column 12 section SECONDARY start: 11723 length 19
+ Stream: column 13 section DATA start: 11742 length 1575
+ Stream: column 13 section LENGTH start: 13317 length 47
+ Stream: column 13 section DICTIONARY_DATA start: 13364 length 1366
+ Stream: column 14 section DATA start: 14730 length 753
+ Stream: column 14 section LENGTH start: 15483 length 11
+ Stream: column 14 section DICTIONARY_DATA start: 15494 length 513
+ Encoding column 0: DIRECT
+ Encoding column 1: DIRECT
+ Encoding column 2: DIRECT
+ Encoding column 3: DIRECT_V2
+ Encoding column 4: DIRECT_V2
+ Encoding column 5: DIRECT_V2
+ Encoding column 6: DIRECT
+ Encoding column 7: DIRECT
+ Encoding column 8: DICTIONARY_V2[1024]
+ Encoding column 9: DIRECT_V2
+ Encoding column 10: DIRECT_V2
+ Encoding column 11: DIRECT_V2
+ Encoding column 12: DIRECT_V2
+ Encoding column 13: DICTIONARY_V2[1024]
+ Encoding column 14: DICTIONARY_V2[1024]
+
+File length: 16919 bytes
+File raw data size: 1048404 bytes
+Padding length: 0 bytes
+Padding ratio: 0%
+________________________________________________________________________________________________________________________
+
diff --git a/site/_docs/java-tools.md b/site/_docs/java-tools.md
index f53720113..a3d546e00 100644
--- a/site/_docs/java-tools.md
+++ b/site/_docs/java-tools.md
@@ -142,6 +142,9 @@ equivalent to the Hive ORC File Dump command.
`--backup-path <path>`
: when used with --recover specifies the path where the recovered file is written (default: /tmp)
+`--column-type`
+ : Print the column id, name and type of each column
+
`-d,--data`
: Should the data be printed