This is an automated email from the ASF dual-hosted git repository.
maytasm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/druid.git
The following commit(s) were added to refs/heads/master by this push:
new 55acf2e2ff4 Fix incorrect scale when reading decimal from parquet
(#15715)
55acf2e2ff4 is described below
commit 55acf2e2ff49cb14eee353a474805746368a877d
Author: Maytas Monsereenusorn <[email protected]>
AuthorDate: Thu Jan 18 02:10:27 2024 -0800
Fix incorrect scale when reading decimal from parquet (#15715)
* Fix incorrect scale when reading decimal from parquet
* add comments
* fix test
---
.../parquet/simple/ParquetGroupConverter.java | 8 +-
.../input/parquet/DecimalParquetInputTest.java | 95 ++++++++++++++++++-
.../input/parquet/DecimalParquetReaderTest.java | 102 +++++++++++++++++++--
3 files changed, 193 insertions(+), 12 deletions(-)
diff --git
a/extensions-core/parquet-extensions/src/main/java/org/apache/druid/data/input/parquet/simple/ParquetGroupConverter.java
b/extensions-core/parquet-extensions/src/main/java/org/apache/druid/data/input/parquet/simple/ParquetGroupConverter.java
index 4571da3d724..8700359cf79 100644
---
a/extensions-core/parquet-extensions/src/main/java/org/apache/druid/data/input/parquet/simple/ParquetGroupConverter.java
+++
b/extensions-core/parquet-extensions/src/main/java/org/apache/druid/data/input/parquet/simple/ParquetGroupConverter.java
@@ -426,9 +426,13 @@ public class ParquetGroupConverter
int scale = pt.asPrimitiveType().getDecimalMetadata().getScale();
switch (pt.getPrimitiveTypeName()) {
case INT32:
- return new BigDecimal(g.getInteger(fieldIndex, index));
+ // The primitive returned from Group is an unscaledValue.
+ // We need to do unscaledValue * 10^(-scale) to convert back to decimal
+ return new BigDecimal(g.getInteger(fieldIndex, index)).movePointLeft(scale);
case INT64:
- return new BigDecimal(g.getLong(fieldIndex, index));
+ // The primitive returned from Group is an unscaledValue.
+ // We need to do unscaledValue * 10^(-scale) to convert back to decimal
+ return new BigDecimal(g.getLong(fieldIndex, index)).movePointLeft(scale);
case FIXED_LEN_BYTE_ARRAY:
case BINARY:
Binary value = g.getBinary(fieldIndex, index);
diff --git
a/extensions-core/parquet-extensions/src/test/java/org/apache/druid/data/input/parquet/DecimalParquetInputTest.java
b/extensions-core/parquet-extensions/src/test/java/org/apache/druid/data/input/parquet/DecimalParquetInputTest.java
index 7f60fbf066f..37c99f53baf 100644
---
a/extensions-core/parquet-extensions/src/test/java/org/apache/druid/data/input/parquet/DecimalParquetInputTest.java
+++
b/extensions-core/parquet-extensions/src/test/java/org/apache/druid/data/input/parquet/DecimalParquetInputTest.java
@@ -62,6 +62,35 @@ public class DecimalParquetInputTest extends
BaseParquetInputTest
parserType,
true
);
+ /*
+ The raw data in the parquet file has the following columns:
+ ############ Column(fixed_len_dec) ############
+ name: fixed_len_dec
+ path: fixed_len_dec
+ max_definition_level: 1
+ max_repetition_level: 0
+ physical_type: FIXED_LEN_BYTE_ARRAY
+ logical_type: Decimal(precision=10, scale=2)
+ converted_type (legacy): DECIMAL
+
+ The raw data in the parquet file has the following rows:
+ 0.0
+ 1.0
+ 2.0
+ 3.0
+ 4.0
+ 5.0
+ 6.0
+ 7.0
+ 8.0
+ 9.0
+ 0.0
+ 1.0
+ 2.0
+ 3.0
+ 4.0
+ 5.0
+ */
List<InputRow> rows = getAllRows(parserType, config);
Assert.assertEquals("2018-09-01T00:00:00.000Z",
rows.get(0).getTimestamp().toString());
Assert.assertEquals("1.0",
rows.get(0).getDimension("fixed_len_dec").get(0));
@@ -80,10 +109,39 @@ public class DecimalParquetInputTest extends
BaseParquetInputTest
parserType,
true
);
+ /*
+ The raw data in the parquet file has the following columns:
+ ############ Column(i32_dec) ############
+ name: i32_dec
+ path: i32_dec
+ max_definition_level: 1
+ max_repetition_level: 0
+ physical_type: INT32
+ logical_type: Decimal(precision=5, scale=2)
+ converted_type (legacy): DECIMAL
+
+ The raw data in the parquet file has the following rows:
+ 0
+ 1.00
+ 2.00
+ 3.00
+ 4.00
+ 5.00
+ 6.00
+ 7.00
+ 8.00
+ 9.00
+ 0
+ 1.00
+ 2.00
+ 3.00
+ 4.00
+ 5.00
+ */
List<InputRow> rows = getAllRows(parserType, config);
Assert.assertEquals("2018-09-01T00:00:00.000Z",
rows.get(0).getTimestamp().toString());
- Assert.assertEquals("100", rows.get(0).getDimension("i32_dec").get(0));
- Assert.assertEquals(new BigDecimal(100), rows.get(0).getMetric("metric1"));
+ Assert.assertEquals("1.00", rows.get(0).getDimension("i32_dec").get(0));
+ Assert.assertEquals(BigDecimal.valueOf(100L, 2), rows.get(0).getMetric("metric1"));
}
@Test
@@ -98,9 +156,38 @@ public class DecimalParquetInputTest extends
BaseParquetInputTest
parserType,
true
);
+ /*
+ The raw data in the parquet file has the following columns:
+ ############ Column(i64_dec) ############
+ name: i64_dec
+ path: i64_dec
+ max_definition_level: 1
+ max_repetition_level: 0
+ physical_type: INT64
+ logical_type: Decimal(precision=10, scale=2)
+ converted_type (legacy): DECIMAL
+
+ The raw data in the parquet file has the following rows:
+ 0
+ 1.00
+ 2.00
+ 3.00
+ 4.00
+ 5.00
+ 6.00
+ 7.00
+ 8.00
+ 9.00
+ 0
+ 1.00
+ 2.00
+ 3.00
+ 4.00
+ 5.00
+ */
List<InputRow> rows = getAllRows(parserType, config);
Assert.assertEquals("2018-09-01T00:00:00.000Z",
rows.get(0).getTimestamp().toString());
- Assert.assertEquals("100", rows.get(0).getDimension("i64_dec").get(0));
- Assert.assertEquals(new BigDecimal(100), rows.get(0).getMetric("metric1"));
+ Assert.assertEquals("1.00", rows.get(0).getDimension("i64_dec").get(0));
+ Assert.assertEquals(BigDecimal.valueOf(100L, 2), rows.get(0).getMetric("metric1"));
}
}
diff --git
a/extensions-core/parquet-extensions/src/test/java/org/apache/druid/data/input/parquet/DecimalParquetReaderTest.java
b/extensions-core/parquet-extensions/src/test/java/org/apache/druid/data/input/parquet/DecimalParquetReaderTest.java
index faa80e6d73f..0d56e1e3652 100644
---
a/extensions-core/parquet-extensions/src/test/java/org/apache/druid/data/input/parquet/DecimalParquetReaderTest.java
+++
b/extensions-core/parquet-extensions/src/test/java/org/apache/druid/data/input/parquet/DecimalParquetReaderTest.java
@@ -63,6 +63,36 @@ public class DecimalParquetReaderTest extends
BaseParquetReaderTest
flattenSpec
);
+ /*
+ The raw data in the parquet file has the following columns:
+ ############ Column(fixed_len_dec) ############
+ name: fixed_len_dec
+ path: fixed_len_dec
+ max_definition_level: 1
+ max_repetition_level: 0
+ physical_type: FIXED_LEN_BYTE_ARRAY
+ logical_type: Decimal(precision=10, scale=2)
+ converted_type (legacy): DECIMAL
+
+ The raw data in the parquet file has the following rows:
+ 0.0
+ 1.0
+ 2.0
+ 3.0
+ 4.0
+ 5.0
+ 6.0
+ 7.0
+ 8.0
+ 9.0
+ 0.0
+ 1.0
+ 2.0
+ 3.0
+ 4.0
+ 5.0
+ */
+
List<InputRow> rows = readAllRows(reader);
Assert.assertEquals("2018-09-01T00:00:00.000Z",
rows.get(1).getTimestamp().toString());
Assert.assertEquals("1.0",
rows.get(1).getDimension("fixed_len_dec").get(0));
@@ -100,10 +130,40 @@ public class DecimalParquetReaderTest extends
BaseParquetReaderTest
flattenSpec
);
+ /*
+ The raw data in the parquet file has the following columns:
+ ############ Column(i32_dec) ############
+ name: i32_dec
+ path: i32_dec
+ max_definition_level: 1
+ max_repetition_level: 0
+ physical_type: INT32
+ logical_type: Decimal(precision=5, scale=2)
+ converted_type (legacy): DECIMAL
+
+ The raw data in the parquet file has the following rows:
+ 0
+ 1.00
+ 2.00
+ 3.00
+ 4.00
+ 5.00
+ 6.00
+ 7.00
+ 8.00
+ 9.00
+ 0
+ 1.00
+ 2.00
+ 3.00
+ 4.00
+ 5.00
+ */
+
List<InputRow> rows = readAllRows(reader);
Assert.assertEquals("2018-09-01T00:00:00.000Z",
rows.get(1).getTimestamp().toString());
- Assert.assertEquals("100", rows.get(1).getDimension("i32_dec").get(0));
- Assert.assertEquals(new BigDecimal(100), rows.get(1).getMetric("metric1"));
+ Assert.assertEquals("1.00", rows.get(1).getDimension("i32_dec").get(0));
+ Assert.assertEquals(BigDecimal.valueOf(100L, 2), rows.get(1).getMetric("metric1"));
reader = createReader(
file,
@@ -112,7 +172,7 @@ public class DecimalParquetReaderTest extends
BaseParquetReaderTest
);
List<InputRowListPlusRawValues> sampled = sampleAllRows(reader);
final String expectedJson = "{\n"
- + " \"i32_dec\" : 100\n"
+ + " \"i32_dec\" : 1.00\n"
+ "}";
Assert.assertEquals(expectedJson,
DEFAULT_JSON_WRITER.writeValueAsString(sampled.get(1).getRawValues()));
}
@@ -137,10 +197,40 @@ public class DecimalParquetReaderTest extends
BaseParquetReaderTest
flattenSpec
);
+ /*
+ The raw data in the parquet file has the following columns:
+ ############ Column(i64_dec) ############
+ name: i64_dec
+ path: i64_dec
+ max_definition_level: 1
+ max_repetition_level: 0
+ physical_type: INT64
+ logical_type: Decimal(precision=10, scale=2)
+ converted_type (legacy): DECIMAL
+
+ The raw data in the parquet file has the following rows:
+ 0
+ 1.00
+ 2.00
+ 3.00
+ 4.00
+ 5.00
+ 6.00
+ 7.00
+ 8.00
+ 9.00
+ 0
+ 1.00
+ 2.00
+ 3.00
+ 4.00
+ 5.00
+ */
+
List<InputRow> rows = readAllRows(reader);
Assert.assertEquals("2018-09-01T00:00:00.000Z",
rows.get(1).getTimestamp().toString());
- Assert.assertEquals("100", rows.get(1).getDimension("i64_dec").get(0));
- Assert.assertEquals(new BigDecimal(100), rows.get(1).getMetric("metric1"));
+ Assert.assertEquals("1.00", rows.get(1).getDimension("i64_dec").get(0));
+ Assert.assertEquals(BigDecimal.valueOf(100L, 2), rows.get(1).getMetric("metric1"));
reader = createReader(
file,
@@ -149,7 +239,7 @@ public class DecimalParquetReaderTest extends
BaseParquetReaderTest
);
List<InputRowListPlusRawValues> sampled = sampleAllRows(reader);
final String expectedJson = "{\n"
- + " \"i64_dec\" : 100\n"
+ + " \"i64_dec\" : 1.00\n"
+ "}";
Assert.assertEquals(expectedJson,
DEFAULT_JSON_WRITER.writeValueAsString(sampled.get(1).getRawValues()));
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]