(orc) branch branch-1.9 updated: ORC-634: Fix the json output for double NaN and infinite

dongjoon Sat, 03 Feb 2024 12:02:16 -0800

This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch branch-1.9
in repository https://gitbox.apache.org/repos/asf/orc.git



The following commit(s) were added to refs/heads/branch-1.9 by this push:
     new b3cc073c3 ORC-634: Fix the json output for double NaN and infinite
b3cc073c3 is described below

commit b3cc073c3a545aa4535cb87a10d4fb1115801e0c
Author: sychen <[email protected]>
AuthorDate: Sat Feb 3 12:01:15 2024 -0800

    ORC-634: Fix the json output for double NaN and infinite
    
    ### What changes were proposed in this pull request?
    The meta command of the tools now supports outputting NaN and infinite
    values of the double type.
    
    ### Why are the changes needed?
    When double-typed data in an ORC file contains NaN or infinite values,
    dumping the data does not work properly, and outputting the meta as JSON
    also fails.
    
    ```java
    java.lang.IllegalArgumentException: Numeric values must be finite, but was NaN
            at com.google.gson.stream.JsonWriter.value(JsonWriter.java:505)
            at org.apache.orc.tools.PrintData.printValue(PrintData.java:140)
            at org.apache.orc.tools.PrintData.printRow(PrintData.java:192)
            at org.apache.orc.tools.PrintData.printJsonData(PrintData.java:215)
            at org.apache.orc.tools.PrintData.main(PrintData.java:288)
            at org.apache.orc.tools.FileDump.main(FileDump.java:129)
            at org.apache.orc.tools.FileDump.main(FileDump.java:144)
    ```
    
    ```java
    Exception in thread "main" java.lang.IllegalStateException: Nesting problem.
            at com.google.gson.stream.JsonWriter.beforeName(JsonWriter.java:648)
            at com.google.gson.stream.JsonWriter.writeDeferredName(JsonWriter.java:408)
            at com.google.gson.stream.JsonWriter.value(JsonWriter.java:424)
            at org.apache.orc.tools.JsonFileDump.printJsonMetaData(JsonFileDump.java:229)
            at org.apache.orc.tools.FileDump.main(FileDump.java:135)
            at org.apache.orc.tools.Driver.main(Driver.java:124)
    ```
    
    ### How was this patch tested?
    Added unit tests.
    
    ### Was this patch authored or co-authored using generative AI tooling?
    No
    
    Closes #1780 from cxzl25/branch-1.9_ORC-634.
    
    Authored-by: sychen <[email protected]>
    Signed-off-by: Dongjoon Hyun <[email protected]>
---
 .../java/org/apache/orc/tools/JsonFileDump.java    |  1 +
 .../src/java/org/apache/orc/tools/PrintData.java   |  1 +
 .../test/org/apache/orc/tools/TestFileDump.java    | 36 +++++++++++++++++++
 .../org/apache/orc/tools/TestJsonFileDump.java     | 40 +++++++++++++++++++---
 4 files changed, 74 insertions(+), 4 deletions(-)

diff --git a/java/tools/src/java/org/apache/orc/tools/JsonFileDump.java 
b/java/tools/src/java/org/apache/orc/tools/JsonFileDump.java
index 88c1742c8..ff33e4747 100644
--- a/java/tools/src/java/org/apache/orc/tools/JsonFileDump.java
+++ b/java/tools/src/java/org/apache/orc/tools/JsonFileDump.java
@@ -66,6 +66,7 @@ public class JsonFileDump {
     }
     StringWriter stringWriter = new StringWriter();
     JsonWriter writer = new JsonWriter(stringWriter);
+    writer.setLenient(true);
     if (prettyPrint) {
       writer.setIndent("  ");
     }
diff --git a/java/tools/src/java/org/apache/orc/tools/PrintData.java 
b/java/tools/src/java/org/apache/orc/tools/PrintData.java
index 11075dcba..37a720942 100644
--- a/java/tools/src/java/org/apache/orc/tools/PrintData.java
+++ b/java/tools/src/java/org/apache/orc/tools/PrintData.java
@@ -211,6 +211,7 @@ public class PrintData {
         }
         for (int r=0; r < batch.size; ++r) {
           JsonWriter writer = new JsonWriter(out);
+          writer.setLenient(true);
           printRow(writer, batch, schema, r);
           out.write("\n");
           out.flush();
diff --git a/java/tools/src/test/org/apache/orc/tools/TestFileDump.java 
b/java/tools/src/test/org/apache/orc/tools/TestFileDump.java
index ce916d27c..da859a68e 100644
--- a/java/tools/src/test/org/apache/orc/tools/TestFileDump.java
+++ b/java/tools/src/test/org/apache/orc/tools/TestFileDump.java
@@ -790,6 +790,42 @@ public class TestFileDump {
     }
   }
 
+  @Test
+  public void testDoubleNaNAndInfinite() throws Exception {
+    TypeDescription schema = TypeDescription.fromString("struct<x:double>");
+    Writer writer = OrcFile.createWriter(testFilePath,
+        OrcFile.writerOptions(conf)
+            .fileSystem(fs)
+            .setSchema(schema));
+    VectorizedRowBatch batch = schema.createRowBatch();
+    DoubleColumnVector x = (DoubleColumnVector) batch.cols[0];
+    int row = batch.size++;
+    x.vector[row] = Double.NaN;
+    row = batch.size++;
+    x.vector[row] = Double.POSITIVE_INFINITY;
+    row = batch.size++;
+    x.vector[row] = 12.34D;
+    if (batch.size != 0) {
+      writer.addRowBatch(batch);
+    }
+    writer.close();
+
+    assertEquals(3, writer.getNumberOfRows());
+
+    PrintStream origOut = System.out;
+    ByteArrayOutputStream myOut = new ByteArrayOutputStream();
+
+    // replace stdout and run command
+    System.setOut(new PrintStream(myOut, false, 
StandardCharsets.UTF_8.toString()));
+    FileDump.main(new String[]{testFilePath.toString(), "-d"});
+    System.out.flush();
+    System.setOut(origOut);
+    String[] lines = 
myOut.toString(StandardCharsets.UTF_8.toString()).split("\n");
+    assertEquals("{\"x\":NaN}", lines[0]);
+    assertEquals("{\"x\":Infinity}", lines[1]);
+    assertEquals("{\"x\":12.34}", lines[2]);
+  }
+
   private static boolean contentEquals(String filePath, String otherFilePath) 
throws IOException {
     try (InputStream is = new BufferedInputStream(new 
FileInputStream(filePath));
          InputStream otherIs = new BufferedInputStream(new 
FileInputStream(otherFilePath))) {
diff --git a/java/tools/src/test/org/apache/orc/tools/TestJsonFileDump.java 
b/java/tools/src/test/org/apache/orc/tools/TestJsonFileDump.java
index f226ae115..b5eebe606 100644
--- a/java/tools/src/test/org/apache/orc/tools/TestJsonFileDump.java
+++ b/java/tools/src/test/org/apache/orc/tools/TestJsonFileDump.java
@@ -22,6 +22,7 @@ import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
 import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
 import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
 import org.apache.orc.CompressionKind;
@@ -32,18 +33,15 @@ import org.apache.orc.Writer;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
 
-import java.io.BufferedReader;
+import java.io.ByteArrayOutputStream;
 import java.io.File;
 import java.io.FileOutputStream;
 import java.io.PrintStream;
 import java.net.URL;
 import java.nio.charset.StandardCharsets;
-import java.nio.file.Files;
-import java.nio.file.Paths;
 import java.util.Random;
 
 import static org.junit.jupiter.api.Assertions.assertEquals;
-import static org.junit.jupiter.api.Assertions.assertNull;
 
 public class TestJsonFileDump {
 
@@ -134,4 +132,38 @@ public class TestJsonFileDump {
 
     TestFileDump.checkOutput(outputFilename, workDir + File.separator + 
outputFilename);
   }
+
+  @Test
+  public void testDoubleNaNAndInfinite() throws Exception {
+    TypeDescription schema = TypeDescription.fromString("struct<x:double>");
+    Writer writer = OrcFile.createWriter(testFilePath,
+        OrcFile.writerOptions(conf)
+            .fileSystem(fs)
+            .setSchema(schema));
+    VectorizedRowBatch batch = schema.createRowBatch();
+    DoubleColumnVector x = (DoubleColumnVector) batch.cols[0];
+    int row = batch.size++;
+    x.vector[row] = Double.NaN;
+    row = batch.size++;
+    x.vector[row] = Double.POSITIVE_INFINITY;
+    row = batch.size++;
+    x.vector[row] = 12.34D;
+    if (batch.size != 0) {
+      writer.addRowBatch(batch);
+    }
+    writer.close();
+
+    assertEquals(3, writer.getNumberOfRows());
+
+    PrintStream origOut = System.out;
+    ByteArrayOutputStream myOut = new ByteArrayOutputStream();
+
+    // replace stdout and run command
+    System.setOut(new PrintStream(myOut, false, 
StandardCharsets.UTF_8.toString()));
+    FileDump.main(new String[]{testFilePath.toString(), "-j"});
+    System.out.flush();
+    System.setOut(origOut);
+    String[] lines = 
myOut.toString(StandardCharsets.UTF_8.toString()).split("\n");
+    
assertEquals("{\"fileName\":\"TestFileDump.testDump.orc\",\"fileVersion\":\"0.12\",\"writerVersion\":\"ORC_14\",\"softwareVersion\":\"ORC
 Java 
unknown\",\"numberOfRows\":3,\"compression\":\"ZLIB\",\"compressionBufferSize\":262144,\"schemaString\":\"struct<x:double>\",\"schema\":{\"columnId\":0,\"columnType\":\"STRUCT\",\"children\":{\"x\":{\"columnId\":1,\"columnType\":\"DOUBLE\"}}},\"calendar\":\"Julian/Gregorian\",\"stripeStatistics\":[{\"stripeNumber\":1,\"columnStatistics\":[{\"c
 [...]
+  }
 }

(orc) branch branch-1.9 updated: ORC-634: Fix the json output for double NaN and infinite

Reply via email to