This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/orc.git
The following commit(s) were added to refs/heads/main by this push:
new bef54fc85 ORC-1571: Supports displaying raw data size in the `meta`
command of orc-tools
bef54fc85 is described below
commit bef54fc85acb6ad29d102dbe91f25b36e728fa16
Author: sychen <[email protected]>
AuthorDate: Thu Jan 4 22:48:07 2024 -0800
ORC-1571: Supports displaying raw data size in the `meta` command of
orc-tools
### What changes were proposed in this pull request?
Display the file's raw (uncompressed) data size in the output of the `meta` command, in both the text and JSON formats.
### Why are the changes needed?
With this change, users can directly compare the compressed ORC file size with the uncompressed (raw) data size.
This is similar to what `parquet-cli` reports, for example:
```
Row group 0: count: 1000 210.95 B records start: 4 total(compressed):
206.006 kB total(uncompressed):10.733 MB
```
### How was this patch tested?
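Unit tests: the expected-output resources of the existing file-dump tests (`orc-file-dump*.out`, `orc-file-dump.json`, `orc-file-has-null.out`) were updated to include the new raw data size line.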
Closes #1726 from cxzl25/ORC-1571.
Authored-by: sychen <[email protected]>
Signed-off-by: Dongjoon Hyun <[email protected]>
---
java/tools/src/java/org/apache/orc/tools/FileDump.java | 8 +++++---
java/tools/src/java/org/apache/orc/tools/JsonFileDump.java | 2 ++
java/tools/src/test/resources/orc-file-dump-bloomfilter.out | 1 +
java/tools/src/test/resources/orc-file-dump-bloomfilter2.out | 1 +
.../src/test/resources/orc-file-dump-dictionary-threshold.out | 1 +
java/tools/src/test/resources/orc-file-dump.json | 1 +
java/tools/src/test/resources/orc-file-dump.out | 1 +
java/tools/src/test/resources/orc-file-has-null.out | 1 +
8 files changed, 13 insertions(+), 3 deletions(-)
diff --git a/java/tools/src/java/org/apache/orc/tools/FileDump.java
b/java/tools/src/java/org/apache/orc/tools/FileDump.java
index 696dc3c7d..c23505310 100644
--- a/java/tools/src/java/org/apache/orc/tools/FileDump.java
+++ b/java/tools/src/java/org/apache/orc/tools/FileDump.java
@@ -79,13 +79,13 @@ public final class FileDump {
}
public static void main(Configuration conf, String[] args) throws Exception {
- List<Integer> rowIndexCols = new ArrayList<Integer>(0);
+ List<Integer> rowIndexCols = new ArrayList<>(0);
Options opts = createOptions();
CommandLine cli = new DefaultParser().parse(opts, args);
if (cli.hasOption('h')) {
HelpFormatter formatter = new HelpFormatter();
- formatter.printHelp("orcfiledump", opts);
+ formatter.printHelp("meta", opts);
return;
}
@@ -103,7 +103,7 @@ public final class FileDump {
rowIndexCols = null; // All the columns
} else {
String[] colStrs = cli.getOptionValue("r").split(",");
- rowIndexCols = new ArrayList<Integer>(colStrs.length);
+ rowIndexCols = new ArrayList<>(colStrs.length);
for (String colStr : colStrs) {
rowIndexCols.add(Integer.parseInt(colStr));
}
@@ -437,10 +437,12 @@ public final class FileDump {
FileSystem fs = file.getFileSystem(conf);
long fileLen = fs.getFileStatus(file).getLen();
+ long rawDataSize = reader.getRawDataSize();
long paddedBytes = getTotalPaddingSize(reader);
double percentPadding = (fileLen == 0) ? 0.0d : 100.0d * paddedBytes /
fileLen;
DecimalFormat format = new DecimalFormat("##.##");
System.out.println("\nFile length: " + fileLen + " bytes");
+ System.out.println("File raw data size: " + rawDataSize + " bytes");
System.out.println("Padding length: " + paddedBytes + " bytes");
System.out.println("Padding ratio: " + format.format(percentPadding) +
"%");
//print out any user metadata properties
diff --git a/java/tools/src/java/org/apache/orc/tools/JsonFileDump.java
b/java/tools/src/java/org/apache/orc/tools/JsonFileDump.java
index 88c1742c8..53fc27498 100644
--- a/java/tools/src/java/org/apache/orc/tools/JsonFileDump.java
+++ b/java/tools/src/java/org/apache/orc/tools/JsonFileDump.java
@@ -207,10 +207,12 @@ public class JsonFileDump {
FileSystem fs = path.getFileSystem(conf);
long fileLen = fs.getContentSummary(path).getLength();
+ long rawDataSize = reader.getRawDataSize();
long paddedBytes = FileDump.getTotalPaddingSize(reader);
// empty ORC file is ~45 bytes. Assumption here is file length always
>0
double percentPadding = ((double) paddedBytes / (double) fileLen) *
100;
writer.name("fileLength").value(fileLen);
+ writer.name("rawDataSize").value(rawDataSize);
writer.name("paddingLength").value(paddedBytes);
writer.name("paddingRatio").value(percentPadding);
AcidStats acidStats = OrcAcidUtils.parseAcidStats(reader);
diff --git a/java/tools/src/test/resources/orc-file-dump-bloomfilter.out
b/java/tools/src/test/resources/orc-file-dump-bloomfilter.out
index a2f3fb05c..e9090e37c 100644
--- a/java/tools/src/test/resources/orc-file-dump-bloomfilter.out
+++ b/java/tools/src/test/resources/orc-file-dump-bloomfilter.out
@@ -179,6 +179,7 @@ Stripes:
Stripe level merge: numHashFunctions: 7 bitCount: 9600 popCount: 238
loadFactor: 0.0248 expectedFpp: 5.7562566E-12
File length: 275025 bytes
+File raw data size: 2163000 bytes
Padding length: 0 bytes
Padding ratio: 0%
________________________________________________________________________________________________________________________
diff --git a/java/tools/src/test/resources/orc-file-dump-bloomfilter2.out
b/java/tools/src/test/resources/orc-file-dump-bloomfilter2.out
index f70ce5f47..c2ea65962 100644
--- a/java/tools/src/test/resources/orc-file-dump-bloomfilter2.out
+++ b/java/tools/src/test/resources/orc-file-dump-bloomfilter2.out
@@ -189,6 +189,7 @@ Stripes:
Stripe level merge: numHashFunctions: 7 bitCount: 9600 popCount: 4948
loadFactor: 0.5154 expectedFpp: 0.00966294
File length: 332566 bytes
+File raw data size: 2163000 bytes
Padding length: 0 bytes
Padding ratio: 0%
________________________________________________________________________________________________________________________
diff --git
a/java/tools/src/test/resources/orc-file-dump-dictionary-threshold.out
b/java/tools/src/test/resources/orc-file-dump-dictionary-threshold.out
index 1e6e50e5a..ef70b77a5 100644
--- a/java/tools/src/test/resources/orc-file-dump-dictionary-threshold.out
+++ b/java/tools/src/test/resources/orc-file-dump-dictionary-threshold.out
@@ -185,6 +185,7 @@ Stripes:
Entry 0: count: 1000 hasNull: false min:
Darkness,-230-368-488-586-862-930-1686-2044-2636-2652-2872-3108-3162-3192-3404-3442-3508-3542-3550-3712-3980-4146-4204-4336-4390-4418-4424-4490-4512-4650-4768-4924-4950-5210-5524-5630-5678-5710-5758-5952-6238-6252-6300-6366-6668-6712-6926-6942-7100-7194-7802-8030-8452-8608-8640-8862-8868-9134-9234-9412-9602-9608-9642-9678-9740-9780-10426-10510-10514-10706-10814-10870-10942-11028-11244-11326-11462-11496-11656-11830-12022-12178-12418-12832-133
[...]
File length: 2217712 bytes
+File raw data size: 9009000 bytes
Padding length: 0 bytes
Padding ratio: 0%
________________________________________________________________________________________________________________________
diff --git a/java/tools/src/test/resources/orc-file-dump.json
b/java/tools/src/test/resources/orc-file-dump.json
index 15a9c2495..d94c59bb6 100644
--- a/java/tools/src/test/resources/orc-file-dump.json
+++ b/java/tools/src/test/resources/orc-file-dump.json
@@ -1377,6 +1377,7 @@
}
],
"fileLength": 275003,
+ "rawDataSize": 2144730,
"paddingLength": 0,
"paddingRatio": 0.0,
"status": "OK"
diff --git a/java/tools/src/test/resources/orc-file-dump.out
b/java/tools/src/test/resources/orc-file-dump.out
index 6b9e5f928..d1defc4bc 100644
--- a/java/tools/src/test/resources/orc-file-dump.out
+++ b/java/tools/src/test/resources/orc-file-dump.out
@@ -190,6 +190,7 @@ Stripes:
Entry 0: count: 1000 hasNull: false min: Darkness, max: worst sum: 3866
positions: 0,0,0
File length: 271049 bytes
+File raw data size: 2163000 bytes
Padding length: 0 bytes
Padding ratio: 0%
diff --git a/java/tools/src/test/resources/orc-file-has-null.out
b/java/tools/src/test/resources/orc-file-has-null.out
index da850d007..17d8ea180 100644
--- a/java/tools/src/test/resources/orc-file-has-null.out
+++ b/java/tools/src/test/resources/orc-file-has-null.out
@@ -107,6 +107,7 @@ Stripes:
Entry 4: count: 0 hasNull: true positions: 0,6,110,0,0,0,0
File length: 1844 bytes
+File raw data size: 770000 bytes
Padding length: 0 bytes
Padding ratio: 0%
________________________________________________________________________________________________________________________