This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/orc.git
The following commit(s) were added to refs/heads/main by this push:
new 5d163d2f3 ORC-1489: Assign a writer id to CUDF
5d163d2f3 is described below
commit 5d163d2f3a64272a8f6e7e839df1e062df399f0b
Author: zhangyiqun <[email protected]>
AuthorDate: Tue Aug 29 20:33:30 2023 -0700
ORC-1489: Assign a writer id to CUDF
### What changes were proposed in this pull request?
This PR assigns a writer ID to the CUDF writer.
### Why are the changes needed?
This helps identify which writer produced a specific ORC file, and it also
lets readers apply writer-specific handling to files created by different writers.
### How was this patch tested?
Added unit tests.
Closes #1594 from guiyanakuang/ORC-1489.
Lead-authored-by: zhangyiqun <[email protected]>
Co-authored-by: Yiqun Zhang <[email protected]>
Signed-off-by: Dongjoon Hyun <[email protected]>
---
c++/include/orc/Common.hh | 1 +
c++/src/Common.cc | 2 ++
c++/src/Reader.cc | 2 +-
java/core/src/java/org/apache/orc/OrcFile.java | 4 ++++
java/core/src/java/org/apache/orc/OrcUtils.java | 3 +++
java/core/src/test/org/apache/orc/TestVectorOrcFile.java | 6 ++++++
proto/orc_proto.proto | 4 ++++
site/specification/ORCv1.md | 1 +
site/specification/ORCv2.md | 1 +
9 files changed, 23 insertions(+), 1 deletion(-)
diff --git a/c++/include/orc/Common.hh b/c++/include/orc/Common.hh
index beae9dd6f..9da67a3f1 100644
--- a/c++/include/orc/Common.hh
+++ b/c++/include/orc/Common.hh
@@ -70,6 +70,7 @@ namespace orc {
PRESTO_WRITER = 2,
SCRITCHLEY_GO = 3,
TRINO_WRITER = 4,
+ CUDF_WRITER = 5,
UNKNOWN_WRITER = INT32_MAX
};
diff --git a/c++/src/Common.cc b/c++/src/Common.cc
index e220e274d..cf2ff27ef 100644
--- a/c++/src/Common.cc
+++ b/c++/src/Common.cc
@@ -82,6 +82,8 @@ namespace orc {
return "Scritchley Go";
case TRINO_WRITER:
return "Trino";
+ case CUDF_WRITER:
+ return "CUDF";
default: {
std::ostringstream buffer;
buffer << "Unknown(" << id << ")";
diff --git a/c++/src/Reader.cc b/c++/src/Reader.cc
index b52675abb..386793f0b 100644
--- a/c++/src/Reader.cc
+++ b/c++/src/Reader.cc
@@ -622,7 +622,7 @@ namespace orc {
WriterId ReaderImpl::getWriterId() const {
if (footer->has_writer()) {
uint32_t id = footer->writer();
- if (id > WriterId::TRINO_WRITER) {
+ if (id > WriterId::CUDF_WRITER) {
return WriterId::UNKNOWN_WRITER;
} else {
return static_cast<WriterId>(id);
diff --git a/java/core/src/java/org/apache/orc/OrcFile.java b/java/core/src/java/org/apache/orc/OrcFile.java
index fc164a977..e41e79945 100644
--- a/java/core/src/java/org/apache/orc/OrcFile.java
+++ b/java/core/src/java/org/apache/orc/OrcFile.java
@@ -132,6 +132,7 @@ public class OrcFile {
PRESTO(2), // Presto writer
SCRITCHLEY_GO(3), // Go writer from https://github.com/scritchley/orc
TRINO(4), // Trino writer
+ CUDF(5), // CUDF writer
UNKNOWN(Integer.MAX_VALUE);
private final int id;
@@ -189,6 +190,9 @@ public class OrcFile {
// Trino Writer
TRINO_ORIGINAL(WriterImplementation.TRINO, 6),
+ // CUDF Writer
+ CUDF_ORIGINAL(WriterImplementation.CUDF, 6),
+
// Don't use any magic numbers here except for the below:
FUTURE(WriterImplementation.UNKNOWN, Integer.MAX_VALUE); // a version from
a future writer
diff --git a/java/core/src/java/org/apache/orc/OrcUtils.java b/java/core/src/java/org/apache/orc/OrcUtils.java
index c121537d3..7dde0bc0f 100644
--- a/java/core/src/java/org/apache/orc/OrcUtils.java
+++ b/java/core/src/java/org/apache/orc/OrcUtils.java
@@ -414,6 +414,9 @@ public class OrcUtils {
case 4:
base = "Trino";
break;
+ case 5:
+ base = "CUDF";
+ break;
default:
base = String.format("Unknown(%d)", writer);
break;
diff --git a/java/core/src/test/org/apache/orc/TestVectorOrcFile.java b/java/core/src/test/org/apache/orc/TestVectorOrcFile.java
index 9a1431c68..8eae7a7cd 100644
--- a/java/core/src/test/org/apache/orc/TestVectorOrcFile.java
+++ b/java/core/src/test/org/apache/orc/TestVectorOrcFile.java
@@ -3597,6 +3597,8 @@ public class TestVectorOrcFile {
OrcFile.WriterImplementation.from(2));
assertEquals(OrcFile.WriterImplementation.TRINO,
OrcFile.WriterImplementation.from(4));
+ assertEquals(OrcFile.WriterImplementation.CUDF,
+ OrcFile.WriterImplementation.from(5));
assertEquals(OrcFile.WriterImplementation.UNKNOWN,
OrcFile.WriterImplementation.from(99));
@@ -3615,6 +3617,8 @@ public class TestVectorOrcFile {
OrcFile.WriterVersion.from(OrcFile.WriterImplementation.PRESTO, 6));
assertEquals(OrcFile.WriterVersion.TRINO_ORIGINAL,
OrcFile.WriterVersion.from(OrcFile.WriterImplementation.TRINO, 6));
+ assertEquals(OrcFile.WriterVersion.CUDF_ORIGINAL,
+ OrcFile.WriterVersion.from(OrcFile.WriterImplementation.CUDF, 6));
assertEquals(OrcFile.WriterVersion.FUTURE,
OrcFile.WriterVersion.from(OrcFile.WriterImplementation.UNKNOWN, 0));
@@ -3633,6 +3637,8 @@ public class TestVectorOrcFile {
OrcFile.WriterVersion.PRESTO_ORIGINAL));
assertTrue(OrcFile.WriterVersion.HIVE_12055.includes(
OrcFile.WriterVersion.TRINO_ORIGINAL));
+ assertTrue(OrcFile.WriterVersion.HIVE_12055.includes(
+ OrcFile.WriterVersion.CUDF_ORIGINAL));
}
@ParameterizedTest
diff --git a/proto/orc_proto.proto b/proto/orc_proto.proto
index ff05657a5..45d7d2a05 100644
--- a/proto/orc_proto.proto
+++ b/proto/orc_proto.proto
@@ -367,6 +367,7 @@ message Footer {
// 2 = Presto
// 3 = Scritchley Go from https://github.com/scritchley/orc
// 4 = Trino
+ // 5 = CUDF
optional uint32 writer = 9;
// information about the encryption in this file
@@ -432,6 +433,9 @@ message PostScript {
// Version of the Trino writer:
// 6 = original
//
+ // Version of the CUDF writer:
+ // 6 = original
+ //
optional uint32 writerVersion = 6;
// the number of bytes in the encrypted stripe statistics
diff --git a/site/specification/ORCv1.md b/site/specification/ORCv1.md
index 28347642e..c9c9311aa 100644
--- a/site/specification/ORCv1.md
+++ b/site/specification/ORCv1.md
@@ -136,6 +136,7 @@ message Footer {
// 2 = Presto
// 3 = Scritchley Go from https://github.com/scritchley/orc
// 4 = Trino
+ // 5 = CUDF
optional uint32 writer = 9;
// information about the encryption in this file
optional Encryption encryption = 10;
diff --git a/site/specification/ORCv2.md b/site/specification/ORCv2.md
index 010de73c9..62d640786 100644
--- a/site/specification/ORCv2.md
+++ b/site/specification/ORCv2.md
@@ -156,6 +156,7 @@ message Footer {
// 2 = Presto
// 3 = Scritchley Go from https://github.com/scritchley/orc
// 4 = Trino
+ // 5 = CUDF
optional uint32 writer = 9;
// information about the encryption in this file
optional Encryption encryption = 10;