This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/orc.git


The following commit(s) were added to refs/heads/main by this push:
     new bef54fc85 ORC-1571: Supports displaying raw data size in the `meta` command of orc-tools
bef54fc85 is described below

commit bef54fc85acb6ad29d102dbe91f25b36e728fa16
Author: sychen <[email protected]>
AuthorDate: Thu Jan 4 22:48:07 2024 -0800

    ORC-1571: Supports displaying raw data size in the `meta` command of orc-tools
    
    ### What changes were proposed in this pull request?
    Display the file's raw data size in the output of the `meta` command, in both the text and the JSON formats.
    
    ### Why are the changes needed?
    With this change, users can directly see both the compressed ORC file size and the uncompressed data size,
    similar to what `parquet-cli` prints:
    
    ```
    Row group 0:  count: 1000  210.95 B records  start: 4  total(compressed): 206.006 kB total(uncompressed):10.733 MB
    ```
    
    ### How was this patch tested?
    Unit tests (UT): the expected `meta` output files under `java/tools/src/test/resources` were updated.
    
    Closes #1726 from cxzl25/ORC-1571.
    
    Authored-by: sychen <[email protected]>
    Signed-off-by: Dongjoon Hyun <[email protected]>
---
 java/tools/src/java/org/apache/orc/tools/FileDump.java            | 8 +++++---
 java/tools/src/java/org/apache/orc/tools/JsonFileDump.java        | 2 ++
 java/tools/src/test/resources/orc-file-dump-bloomfilter.out       | 1 +
 java/tools/src/test/resources/orc-file-dump-bloomfilter2.out      | 1 +
 .../src/test/resources/orc-file-dump-dictionary-threshold.out     | 1 +
 java/tools/src/test/resources/orc-file-dump.json                  | 1 +
 java/tools/src/test/resources/orc-file-dump.out                   | 1 +
 java/tools/src/test/resources/orc-file-has-null.out               | 1 +
 8 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/java/tools/src/java/org/apache/orc/tools/FileDump.java b/java/tools/src/java/org/apache/orc/tools/FileDump.java
index 696dc3c7d..c23505310 100644
--- a/java/tools/src/java/org/apache/orc/tools/FileDump.java
+++ b/java/tools/src/java/org/apache/orc/tools/FileDump.java
@@ -79,13 +79,13 @@ public final class FileDump {
   }
 
   public static void main(Configuration conf, String[] args) throws Exception {
-    List<Integer> rowIndexCols = new ArrayList<Integer>(0);
+    List<Integer> rowIndexCols = new ArrayList<>(0);
     Options opts = createOptions();
     CommandLine cli = new DefaultParser().parse(opts, args);
 
     if (cli.hasOption('h')) {
       HelpFormatter formatter = new HelpFormatter();
-      formatter.printHelp("orcfiledump", opts);
+      formatter.printHelp("meta", opts);
       return;
     }
 
@@ -103,7 +103,7 @@ public final class FileDump {
         rowIndexCols = null; // All the columns
       } else {
         String[] colStrs = cli.getOptionValue("r").split(",");
-        rowIndexCols = new ArrayList<Integer>(colStrs.length);
+        rowIndexCols = new ArrayList<>(colStrs.length);
         for (String colStr : colStrs) {
           rowIndexCols.add(Integer.parseInt(colStr));
         }
@@ -437,10 +437,12 @@ public final class FileDump {
 
     FileSystem fs = file.getFileSystem(conf);
     long fileLen = fs.getFileStatus(file).getLen();
+    long rawDataSize = reader.getRawDataSize();
     long paddedBytes = getTotalPaddingSize(reader);
     double percentPadding = (fileLen == 0) ? 0.0d : 100.0d * paddedBytes / fileLen;
     DecimalFormat format = new DecimalFormat("##.##");
     System.out.println("\nFile length: " + fileLen + " bytes");
+    System.out.println("File raw data size: " + rawDataSize + " bytes");
     System.out.println("Padding length: " + paddedBytes + " bytes");
     System.out.println("Padding ratio: " + format.format(percentPadding) + "%");
     //print out any user metadata properties
diff --git a/java/tools/src/java/org/apache/orc/tools/JsonFileDump.java b/java/tools/src/java/org/apache/orc/tools/JsonFileDump.java
index 88c1742c8..53fc27498 100644
--- a/java/tools/src/java/org/apache/orc/tools/JsonFileDump.java
+++ b/java/tools/src/java/org/apache/orc/tools/JsonFileDump.java
@@ -207,10 +207,12 @@ public class JsonFileDump {
 
         FileSystem fs = path.getFileSystem(conf);
         long fileLen = fs.getContentSummary(path).getLength();
+        long rawDataSize = reader.getRawDataSize();
         long paddedBytes = FileDump.getTotalPaddingSize(reader);
         // empty ORC file is ~45 bytes. Assumption here is file length always >0
         double percentPadding = ((double) paddedBytes / (double) fileLen) * 100;
         writer.name("fileLength").value(fileLen);
+        writer.name("rawDataSize").value(rawDataSize);
         writer.name("paddingLength").value(paddedBytes);
         writer.name("paddingRatio").value(percentPadding);
         AcidStats acidStats = OrcAcidUtils.parseAcidStats(reader);
diff --git a/java/tools/src/test/resources/orc-file-dump-bloomfilter.out b/java/tools/src/test/resources/orc-file-dump-bloomfilter.out
index a2f3fb05c..e9090e37c 100644
--- a/java/tools/src/test/resources/orc-file-dump-bloomfilter.out
+++ b/java/tools/src/test/resources/orc-file-dump-bloomfilter.out
@@ -179,6 +179,7 @@ Stripes:
       Stripe level merge: numHashFunctions: 7 bitCount: 9600 popCount: 238 loadFactor: 0.0248 expectedFpp: 5.7562566E-12
 
 File length: 275025 bytes
+File raw data size: 2163000 bytes
 Padding length: 0 bytes
 Padding ratio: 0%
 
________________________________________________________________________________________________________________________
diff --git a/java/tools/src/test/resources/orc-file-dump-bloomfilter2.out b/java/tools/src/test/resources/orc-file-dump-bloomfilter2.out
index f70ce5f47..c2ea65962 100644
--- a/java/tools/src/test/resources/orc-file-dump-bloomfilter2.out
+++ b/java/tools/src/test/resources/orc-file-dump-bloomfilter2.out
@@ -189,6 +189,7 @@ Stripes:
       Stripe level merge: numHashFunctions: 7 bitCount: 9600 popCount: 4948 loadFactor: 0.5154 expectedFpp: 0.00966294
 
 File length: 332566 bytes
+File raw data size: 2163000 bytes
 Padding length: 0 bytes
 Padding ratio: 0%
 
________________________________________________________________________________________________________________________
diff --git a/java/tools/src/test/resources/orc-file-dump-dictionary-threshold.out b/java/tools/src/test/resources/orc-file-dump-dictionary-threshold.out
index 1e6e50e5a..ef70b77a5 100644
--- a/java/tools/src/test/resources/orc-file-dump-dictionary-threshold.out
+++ b/java/tools/src/test/resources/orc-file-dump-dictionary-threshold.out
@@ -185,6 +185,7 @@ Stripes:
       Entry 0: count: 1000 hasNull: false min: 
Darkness,-230-368-488-586-862-930-1686-2044-2636-2652-2872-3108-3162-3192-3404-3442-3508-3542-3550-3712-3980-4146-4204-4336-4390-4418-4424-4490-4512-4650-4768-4924-4950-5210-5524-5630-5678-5710-5758-5952-6238-6252-6300-6366-6668-6712-6926-6942-7100-7194-7802-8030-8452-8608-8640-8862-8868-9134-9234-9412-9602-9608-9642-9678-9740-9780-10426-10510-10514-10706-10814-10870-10942-11028-11244-11326-11462-11496-11656-11830-12022-12178-12418-12832-133
 [...]
 
 File length: 2217712 bytes
+File raw data size: 9009000 bytes
 Padding length: 0 bytes
 Padding ratio: 0%
 
________________________________________________________________________________________________________________________
diff --git a/java/tools/src/test/resources/orc-file-dump.json b/java/tools/src/test/resources/orc-file-dump.json
index 15a9c2495..d94c59bb6 100644
--- a/java/tools/src/test/resources/orc-file-dump.json
+++ b/java/tools/src/test/resources/orc-file-dump.json
@@ -1377,6 +1377,7 @@
     }
   ],
   "fileLength": 275003,
+  "rawDataSize": 2144730,
   "paddingLength": 0,
   "paddingRatio": 0.0,
   "status": "OK"
diff --git a/java/tools/src/test/resources/orc-file-dump.out b/java/tools/src/test/resources/orc-file-dump.out
index 6b9e5f928..d1defc4bc 100644
--- a/java/tools/src/test/resources/orc-file-dump.out
+++ b/java/tools/src/test/resources/orc-file-dump.out
@@ -190,6 +190,7 @@ Stripes:
       Entry 0: count: 1000 hasNull: false min: Darkness, max: worst sum: 3866 positions: 0,0,0
 
 File length: 271049 bytes
+File raw data size: 2163000 bytes
 Padding length: 0 bytes
 Padding ratio: 0%
 
diff --git a/java/tools/src/test/resources/orc-file-has-null.out b/java/tools/src/test/resources/orc-file-has-null.out
index da850d007..17d8ea180 100644
--- a/java/tools/src/test/resources/orc-file-has-null.out
+++ b/java/tools/src/test/resources/orc-file-has-null.out
@@ -107,6 +107,7 @@ Stripes:
       Entry 4: count: 0 hasNull: true positions: 0,6,110,0,0,0,0
 
 File length: 1844 bytes
+File raw data size: 770000 bytes
 Padding length: 0 bytes
 Padding ratio: 0%
 
________________________________________________________________________________________________________________________

Reply via email to