This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/orc.git
The following commit(s) were added to refs/heads/main by this push:
new dc2449b08 ORC-634: Fix the json output for double NaN and infinite
dc2449b08 is described below
commit dc2449b08ec05636ccad2753b6ecd9567e4e2989
Author: sychen <[email protected]>
AuthorDate: Wed Jan 31 09:10:16 2024 -0800
ORC-634: Fix the json output for double NaN and infinite
### What changes were proposed in this pull request?
The `meta` command of the ORC tools now supports outputting NaN and infinite
values of the double type.
### Why are the changes needed?
When double-typed data in an ORC file contains NaN or infinite values, dumping
the data does not work properly, and outputting the metadata as JSON also fails.
```java
java.lang.IllegalArgumentException: Numeric values must be finite, but was NaN
at com.google.gson.stream.JsonWriter.value(JsonWriter.java:505)
at org.apache.orc.tools.PrintData.printValue(PrintData.java:140)
at org.apache.orc.tools.PrintData.printRow(PrintData.java:192)
at org.apache.orc.tools.PrintData.printJsonData(PrintData.java:215)
at org.apache.orc.tools.PrintData.main(PrintData.java:288)
at org.apache.orc.tools.FileDump.main(FileDump.java:129)
at org.apache.orc.tools.FileDump.main(FileDump.java:144)
```
```java
Exception in thread "main" java.lang.IllegalStateException: Nesting problem.
at com.google.gson.stream.JsonWriter.beforeName(JsonWriter.java:648)
at com.google.gson.stream.JsonWriter.writeDeferredName(JsonWriter.java:408)
at com.google.gson.stream.JsonWriter.value(JsonWriter.java:424)
at org.apache.orc.tools.JsonFileDump.printJsonMetaData(JsonFileDump.java:229)
at org.apache.orc.tools.FileDump.main(FileDump.java:135)
at org.apache.orc.tools.Driver.main(Driver.java:124)
```
### How was this patch tested?
Added unit tests.
### Was this patch authored or co-authored using generative AI tooling?
No
Closes #1770 from cxzl25/ORC-634.
Authored-by: sychen <[email protected]>
Signed-off-by: Dongjoon Hyun <[email protected]>
---
.../java/org/apache/orc/tools/JsonFileDump.java | 1 +
.../src/java/org/apache/orc/tools/PrintData.java | 1 +
.../test/org/apache/orc/tools/TestFileDump.java | 36 ++++++++++++++++++++
.../org/apache/orc/tools/TestJsonFileDump.java | 39 ++++++++++++++++++++++
4 files changed, 77 insertions(+)
diff --git a/java/tools/src/java/org/apache/orc/tools/JsonFileDump.java
b/java/tools/src/java/org/apache/orc/tools/JsonFileDump.java
index 53fc27498..d6166ea91 100644
--- a/java/tools/src/java/org/apache/orc/tools/JsonFileDump.java
+++ b/java/tools/src/java/org/apache/orc/tools/JsonFileDump.java
@@ -66,6 +66,7 @@ public class JsonFileDump {
}
StringWriter stringWriter = new StringWriter();
JsonWriter writer = new JsonWriter(stringWriter);
+ writer.setLenient(true);
if (prettyPrint) {
writer.setIndent(" ");
}
diff --git a/java/tools/src/java/org/apache/orc/tools/PrintData.java
b/java/tools/src/java/org/apache/orc/tools/PrintData.java
index 11075dcba..37a720942 100644
--- a/java/tools/src/java/org/apache/orc/tools/PrintData.java
+++ b/java/tools/src/java/org/apache/orc/tools/PrintData.java
@@ -211,6 +211,7 @@ public class PrintData {
}
for (int r=0; r < batch.size; ++r) {
JsonWriter writer = new JsonWriter(out);
+ writer.setLenient(true);
printRow(writer, batch, schema, r);
out.write("\n");
out.flush();
diff --git a/java/tools/src/test/org/apache/orc/tools/TestFileDump.java
b/java/tools/src/test/org/apache/orc/tools/TestFileDump.java
index 5db444de0..c265a7400 100644
--- a/java/tools/src/test/org/apache/orc/tools/TestFileDump.java
+++ b/java/tools/src/test/org/apache/orc/tools/TestFileDump.java
@@ -791,6 +791,42 @@ public class TestFileDump {
}
}
+ @Test
+ public void testDoubleNaNAndInfinite() throws Exception {
+ TypeDescription schema = TypeDescription.fromString("struct<x:double>");
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .fileSystem(fs)
+ .setSchema(schema));
+ VectorizedRowBatch batch = schema.createRowBatch();
+ DoubleColumnVector x = (DoubleColumnVector) batch.cols[0];
+ int row = batch.size++;
+ x.vector[row] = Double.NaN;
+ row = batch.size++;
+ x.vector[row] = Double.POSITIVE_INFINITY;
+ row = batch.size++;
+ x.vector[row] = 12.34D;
+ if (batch.size != 0) {
+ writer.addRowBatch(batch);
+ }
+ writer.close();
+
+ assertEquals(3, writer.getNumberOfRows());
+
+ PrintStream origOut = System.out;
+ ByteArrayOutputStream myOut = new ByteArrayOutputStream();
+
+ // replace stdout and run command
+ System.setOut(new PrintStream(myOut, false, StandardCharsets.UTF_8));
+ FileDump.main(new String[]{testFilePath.toString(), "-d"});
+ System.out.flush();
+ System.setOut(origOut);
+ String[] lines = myOut.toString(StandardCharsets.UTF_8).split("\n");
+ assertEquals("{\"x\":NaN}", lines[0]);
+ assertEquals("{\"x\":Infinity}", lines[1]);
+ assertEquals("{\"x\":12.34}", lines[2]);
+ }
+
private static boolean contentEquals(String filePath, String otherFilePath)
throws IOException {
try (InputStream is = new BufferedInputStream(new
FileInputStream(filePath));
InputStream otherIs = new BufferedInputStream(new
FileInputStream(otherFilePath))) {
diff --git a/java/tools/src/test/org/apache/orc/tools/TestJsonFileDump.java
b/java/tools/src/test/org/apache/orc/tools/TestJsonFileDump.java
index 609129e90..f21358516 100644
--- a/java/tools/src/test/org/apache/orc/tools/TestJsonFileDump.java
+++ b/java/tools/src/test/org/apache/orc/tools/TestJsonFileDump.java
@@ -22,6 +22,7 @@ import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.orc.CompressionKind;
@@ -32,12 +33,16 @@ import org.apache.orc.Writer;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
+import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.PrintStream;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.Random;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
public class TestJsonFileDump {
public static String getFileFromClasspath(String name) {
@@ -127,4 +132,38 @@ public class TestJsonFileDump {
TestFileDump.checkOutput(outputFilename, workDir + File.separator +
outputFilename);
}
+
+ @Test
+ public void testDoubleNaNAndInfinite() throws Exception {
+ TypeDescription schema = TypeDescription.fromString("struct<x:double>");
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .fileSystem(fs)
+ .setSchema(schema));
+ VectorizedRowBatch batch = schema.createRowBatch();
+ DoubleColumnVector x = (DoubleColumnVector) batch.cols[0];
+ int row = batch.size++;
+ x.vector[row] = Double.NaN;
+ row = batch.size++;
+ x.vector[row] = Double.POSITIVE_INFINITY;
+ row = batch.size++;
+ x.vector[row] = 12.34D;
+ if (batch.size != 0) {
+ writer.addRowBatch(batch);
+ }
+ writer.close();
+
+ assertEquals(3, writer.getNumberOfRows());
+
+ PrintStream origOut = System.out;
+ ByteArrayOutputStream myOut = new ByteArrayOutputStream();
+
+ // replace stdout and run command
+ System.setOut(new PrintStream(myOut, false, StandardCharsets.UTF_8));
+ FileDump.main(new String[]{testFilePath.toString(), "-j"});
+ System.out.flush();
+ System.setOut(origOut);
+ String[] lines = myOut.toString(StandardCharsets.UTF_8).split("\n");
+
assertEquals("{\"fileName\":\"TestFileDump.testDump.orc\",\"fileVersion\":\"0.12\",\"writerVersion\":\"ORC_14\",\"softwareVersion\":\"ORC
Java
unknown\",\"numberOfRows\":3,\"compression\":\"ZSTD\",\"compressionBufferSize\":262144,\"schemaString\":\"struct<x:double>\",\"schema\":{\"columnId\":0,\"columnType\":\"STRUCT\",\"children\":{\"x\":{\"columnId\":1,\"columnType\":\"DOUBLE\"}}},\"calendar\":\"Julian/Gregorian\",\"stripeStatistics\":[{\"stripeNumber\":1,\"columnStatistics\":[{\"c
[...]
+ }
}