This is an automated email from the ASF dual-hosted git repository.

maytasm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/druid.git


The following commit(s) were added to refs/heads/master by this push:
     new 55acf2e2ff4 Fix incorrect scale when reading decimal from parquet 
(#15715)
55acf2e2ff4 is described below

commit 55acf2e2ff49cb14eee353a474805746368a877d
Author: Maytas Monsereenusorn <[email protected]>
AuthorDate: Thu Jan 18 02:10:27 2024 -0800

    Fix incorrect scale when reading decimal from parquet (#15715)
    
    * Fix incorrect scale when reading decimal from parquet
    
    * add comments
    
    * fix test
---
 .../parquet/simple/ParquetGroupConverter.java      |   8 +-
 .../input/parquet/DecimalParquetInputTest.java     |  95 ++++++++++++++++++-
 .../input/parquet/DecimalParquetReaderTest.java    | 102 +++++++++++++++++++--
 3 files changed, 193 insertions(+), 12 deletions(-)

diff --git 
a/extensions-core/parquet-extensions/src/main/java/org/apache/druid/data/input/parquet/simple/ParquetGroupConverter.java
 
b/extensions-core/parquet-extensions/src/main/java/org/apache/druid/data/input/parquet/simple/ParquetGroupConverter.java
index 4571da3d724..8700359cf79 100644
--- 
a/extensions-core/parquet-extensions/src/main/java/org/apache/druid/data/input/parquet/simple/ParquetGroupConverter.java
+++ 
b/extensions-core/parquet-extensions/src/main/java/org/apache/druid/data/input/parquet/simple/ParquetGroupConverter.java
@@ -426,9 +426,13 @@ public class ParquetGroupConverter
             int scale = pt.asPrimitiveType().getDecimalMetadata().getScale();
             switch (pt.getPrimitiveTypeName()) {
               case INT32:
-                return new BigDecimal(g.getInteger(fieldIndex, index));
+                // The primitive returned from Group is an unscaledValue.
+                // We need to do unscaledValue * 10^(-scale) to convert back to decimal
+                return new BigDecimal(g.getInteger(fieldIndex, index)).movePointLeft(scale);
               case INT64:
-                return new BigDecimal(g.getLong(fieldIndex, index));
+                // The primitive returned from Group is an unscaledValue.
+                // We need to do unscaledValue * 10^(-scale) to convert back to decimal
+                return new BigDecimal(g.getLong(fieldIndex, index)).movePointLeft(scale);
               case FIXED_LEN_BYTE_ARRAY:
               case BINARY:
                 Binary value = g.getBinary(fieldIndex, index);
diff --git 
a/extensions-core/parquet-extensions/src/test/java/org/apache/druid/data/input/parquet/DecimalParquetInputTest.java
 
b/extensions-core/parquet-extensions/src/test/java/org/apache/druid/data/input/parquet/DecimalParquetInputTest.java
index 7f60fbf066f..37c99f53baf 100644
--- 
a/extensions-core/parquet-extensions/src/test/java/org/apache/druid/data/input/parquet/DecimalParquetInputTest.java
+++ 
b/extensions-core/parquet-extensions/src/test/java/org/apache/druid/data/input/parquet/DecimalParquetInputTest.java
@@ -62,6 +62,35 @@ public class DecimalParquetInputTest extends 
BaseParquetInputTest
         parserType,
         true
     );
+    /*
+    The raw data in the parquet file has the following columns:
+    ############ Column(fixed_len_dec) ############
+    name: fixed_len_dec
+    path: fixed_len_dec
+    max_definition_level: 1
+    max_repetition_level: 0
+    physical_type: FIXED_LEN_BYTE_ARRAY
+    logical_type: Decimal(precision=10, scale=2)
+    converted_type (legacy): DECIMAL
+
+    The raw data in the parquet file has the following rows:
+    0.0
+    1.0
+    2.0
+    3.0
+    4.0
+    5.0
+    6.0
+    7.0
+    8.0
+    9.0
+    0.0
+    1.0
+    2.0
+    3.0
+    4.0
+    5.0
+     */
     List<InputRow> rows = getAllRows(parserType, config);
     Assert.assertEquals("2018-09-01T00:00:00.000Z", 
rows.get(0).getTimestamp().toString());
     Assert.assertEquals("1.0", 
rows.get(0).getDimension("fixed_len_dec").get(0));
@@ -80,10 +109,39 @@ public class DecimalParquetInputTest extends 
BaseParquetInputTest
         parserType,
         true
     );
+    /*
+    The raw data in the parquet file has the following columns:
+    ############ Column(i32_dec) ############
+    name: i32_dec
+    path: i32_dec
+    max_definition_level: 1
+    max_repetition_level: 0
+    physical_type: INT32
+    logical_type: Decimal(precision=5, scale=2)
+    converted_type (legacy): DECIMAL
+
+    The raw data in the parquet file has the following rows:
+    0
+    1.00
+    2.00
+    3.00
+    4.00
+    5.00
+    6.00
+    7.00
+    8.00
+    9.00
+    0
+    1.00
+    2.00
+    3.00
+    4.00
+    5.00
+     */
     List<InputRow> rows = getAllRows(parserType, config);
     Assert.assertEquals("2018-09-01T00:00:00.000Z", 
rows.get(0).getTimestamp().toString());
-    Assert.assertEquals("100", rows.get(0).getDimension("i32_dec").get(0));
-    Assert.assertEquals(new BigDecimal(100), rows.get(0).getMetric("metric1"));
+    Assert.assertEquals("1.00", rows.get(0).getDimension("i32_dec").get(0));
+    Assert.assertEquals(BigDecimal.valueOf(100L, 2), 
rows.get(0).getMetric("metric1"));
   }
 
   @Test
@@ -98,9 +156,38 @@ public class DecimalParquetInputTest extends 
BaseParquetInputTest
         parserType,
         true
     );
+    /*
+    The raw data in the parquet file has the following columns:
+    ############ Column(i64_dec) ############
+    name: i64_dec
+    path: i64_dec
+    max_definition_level: 1
+    max_repetition_level: 0
+    physical_type: INT64
+    logical_type: Decimal(precision=10, scale=2)
+    converted_type (legacy): DECIMAL
+
+    The raw data in the parquet file has the following rows:
+    0
+    1.00
+    2.00
+    3.00
+    4.00
+    5.00
+    6.00
+    7.00
+    8.00
+    9.00
+    0
+    1.00
+    2.00
+    3.00
+    4.00
+    5.00
+     */
     List<InputRow> rows = getAllRows(parserType, config);
     Assert.assertEquals("2018-09-01T00:00:00.000Z", 
rows.get(0).getTimestamp().toString());
-    Assert.assertEquals("100", rows.get(0).getDimension("i64_dec").get(0));
-    Assert.assertEquals(new BigDecimal(100), rows.get(0).getMetric("metric1"));
+    Assert.assertEquals("1.00", rows.get(0).getDimension("i64_dec").get(0));
+    Assert.assertEquals(BigDecimal.valueOf(100L, 2), 
rows.get(0).getMetric("metric1"));
   }
 }
diff --git 
a/extensions-core/parquet-extensions/src/test/java/org/apache/druid/data/input/parquet/DecimalParquetReaderTest.java
 
b/extensions-core/parquet-extensions/src/test/java/org/apache/druid/data/input/parquet/DecimalParquetReaderTest.java
index faa80e6d73f..0d56e1e3652 100644
--- 
a/extensions-core/parquet-extensions/src/test/java/org/apache/druid/data/input/parquet/DecimalParquetReaderTest.java
+++ 
b/extensions-core/parquet-extensions/src/test/java/org/apache/druid/data/input/parquet/DecimalParquetReaderTest.java
@@ -63,6 +63,36 @@ public class DecimalParquetReaderTest extends 
BaseParquetReaderTest
         flattenSpec
     );
 
+    /*
+    The raw data in the parquet file has the following columns:
+    ############ Column(fixed_len_dec) ############
+    name: fixed_len_dec
+    path: fixed_len_dec
+    max_definition_level: 1
+    max_repetition_level: 0
+    physical_type: FIXED_LEN_BYTE_ARRAY
+    logical_type: Decimal(precision=10, scale=2)
+    converted_type (legacy): DECIMAL
+
+    The raw data in the parquet file has the following rows:
+    0.0
+    1.0
+    2.0
+    3.0
+    4.0
+    5.0
+    6.0
+    7.0
+    8.0
+    9.0
+    0.0
+    1.0
+    2.0
+    3.0
+    4.0
+    5.0
+     */
+
     List<InputRow> rows = readAllRows(reader);
     Assert.assertEquals("2018-09-01T00:00:00.000Z", 
rows.get(1).getTimestamp().toString());
     Assert.assertEquals("1.0", 
rows.get(1).getDimension("fixed_len_dec").get(0));
@@ -100,10 +130,40 @@ public class DecimalParquetReaderTest extends 
BaseParquetReaderTest
         flattenSpec
     );
 
+    /*
+    The raw data in the parquet file has the following columns:
+    ############ Column(i32_dec) ############
+    name: i32_dec
+    path: i32_dec
+    max_definition_level: 1
+    max_repetition_level: 0
+    physical_type: INT32
+    logical_type: Decimal(precision=5, scale=2)
+    converted_type (legacy): DECIMAL
+
+    The raw data in the parquet file has the following rows:
+    0
+    1.00
+    2.00
+    3.00
+    4.00
+    5.00
+    6.00
+    7.00
+    8.00
+    9.00
+    0
+    1.00
+    2.00
+    3.00
+    4.00
+    5.00
+     */
+
     List<InputRow> rows = readAllRows(reader);
     Assert.assertEquals("2018-09-01T00:00:00.000Z", 
rows.get(1).getTimestamp().toString());
-    Assert.assertEquals("100", rows.get(1).getDimension("i32_dec").get(0));
-    Assert.assertEquals(new BigDecimal(100), rows.get(1).getMetric("metric1"));
+    Assert.assertEquals("1.00", rows.get(1).getDimension("i32_dec").get(0));
+    Assert.assertEquals(BigDecimal.valueOf(100L, 2), 
rows.get(1).getMetric("metric1"));
 
     reader = createReader(
         file,
@@ -112,7 +172,7 @@ public class DecimalParquetReaderTest extends 
BaseParquetReaderTest
     );
     List<InputRowListPlusRawValues> sampled = sampleAllRows(reader);
     final String expectedJson = "{\n"
-                                + "  \"i32_dec\" : 100\n"
+                                + "  \"i32_dec\" : 1.00\n"
                                 + "}";
     Assert.assertEquals(expectedJson, 
DEFAULT_JSON_WRITER.writeValueAsString(sampled.get(1).getRawValues()));
   }
@@ -137,10 +197,40 @@ public class DecimalParquetReaderTest extends 
BaseParquetReaderTest
         flattenSpec
     );
 
+    /*
+    The raw data in the parquet file has the following columns:
+    ############ Column(i64_dec) ############
+    name: i64_dec
+    path: i64_dec
+    max_definition_level: 1
+    max_repetition_level: 0
+    physical_type: INT64
+    logical_type: Decimal(precision=10, scale=2)
+    converted_type (legacy): DECIMAL
+
+    The raw data in the parquet file has the following rows:
+    0
+    1.00
+    2.00
+    3.00
+    4.00
+    5.00
+    6.00
+    7.00
+    8.00
+    9.00
+    0
+    1.00
+    2.00
+    3.00
+    4.00
+    5.00
+     */
+
     List<InputRow> rows = readAllRows(reader);
     Assert.assertEquals("2018-09-01T00:00:00.000Z", 
rows.get(1).getTimestamp().toString());
-    Assert.assertEquals("100", rows.get(1).getDimension("i64_dec").get(0));
-    Assert.assertEquals(new BigDecimal(100), rows.get(1).getMetric("metric1"));
+    Assert.assertEquals("1.00", rows.get(1).getDimension("i64_dec").get(0));
+    Assert.assertEquals(BigDecimal.valueOf(100L, 2), 
rows.get(1).getMetric("metric1"));
 
     reader = createReader(
         file,
@@ -149,7 +239,7 @@ public class DecimalParquetReaderTest extends 
BaseParquetReaderTest
     );
     List<InputRowListPlusRawValues> sampled = sampleAllRows(reader);
     final String expectedJson = "{\n"
-                                + "  \"i64_dec\" : 100\n"
+                                + "  \"i64_dec\" : 1.00\n"
                                 + "}";
     Assert.assertEquals(expectedJson, 
DEFAULT_JSON_WRITER.writeValueAsString(sampled.get(1).getRawValues()));
   }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to