This is an automated email from the ASF dual-hosted git repository.
soumyakantidas pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push:
new 5c1a99aa83a HIVE-26877: Parquet CTAS with JOIN on decimals with different precision/scale fail (#6274)
5c1a99aa83a is described below
commit 5c1a99aa83a1c93dadcdfba8360445f8fc376756
Author: Dayakar M <[email protected]>
AuthorDate: Sat Jan 24 11:53:54 2026 +0530
HIVE-26877: Parquet CTAS with JOIN on decimals with different precision/scale fail (#6274)
---
.../ql/io/parquet/write/DataWritableWriter.java | 34 +++-
.../parquet_join_dec_col_diff_pre_scale.q | 21 ++
.../llap/parquet_join_dec_col_diff_pre_scale.q.out | 224 +++++++++++++++++++++
3 files changed, 275 insertions(+), 4 deletions(-)
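
The fix resolves the failure by enforcing the Parquet schema's declared precision/scale on every decimal value before it is serialized. A minimal sketch of the helper the patch relies on, HiveDecimalUtils.enforcePrecisionScale, using the values from the regression test below (the class name EnforceSketch is illustrative, not part of the patch; assumes the Hive serde classes are on the classpath):

// Sketch only: demonstrates the enforcement helper the patch calls.
import org.apache.hadoop.hive.common.type.HiveDecimal;
import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.HiveDecimalUtils;

public class EnforceSketch {
  public static void main(String[] args) {
    HiveDecimal v = HiveDecimal.create("12345.6789101"); // fits decimal(12,7)
    // Widening keeps the value unchanged.
    System.out.println(HiveDecimalUtils.enforcePrecisionScale(v, new DecimalTypeInfo(17, 7))); // 12345.6789101
    // Narrowing the scale rounds the fractional digits.
    System.out.println(HiveDecimalUtils.enforcePrecisionScale(v, new DecimalTypeInfo(15, 5))); // 12345.67891
    // null signals the value cannot fit the declared type at all.
    System.out.println(HiveDecimalUtils.enforcePrecisionScale(v, new DecimalTypeInfo(5, 2)));  // null
  }
}
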
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/write/DataWritableWriter.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/write/DataWritableWriter.java
index 6915c08f58f..7d3e3fcb245 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/write/DataWritableWriter.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/write/DataWritableWriter.java
@@ -45,10 +45,12 @@
import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.TimestampObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.HiveDecimalUtils;
import org.apache.parquet.io.api.Binary;
import org.apache.parquet.io.api.RecordConsumer;
import org.apache.parquet.schema.GroupType;
import org.apache.parquet.schema.LogicalTypeAnnotation;
+import org.apache.parquet.schema.LogicalTypeAnnotation.DecimalLogicalTypeAnnotation;
import org.apache.parquet.schema.LogicalTypeAnnotation.ListLogicalTypeAnnotation;
import org.apache.parquet.schema.LogicalTypeAnnotation.MapLogicalTypeAnnotation;
import org.apache.parquet.schema.Type;
@@ -157,7 +159,8 @@ private DataWriter createWriter(ObjectInspector inspector, Type type) {
    case TIMESTAMP:
      return new TimestampDataWriter((TimestampObjectInspector)inspector);
    case DECIMAL:
-      return new DecimalDataWriter((HiveDecimalObjectInspector)inspector);
+      return new DecimalDataWriter((HiveDecimalObjectInspector) inspector,
+          getSchemaDecimalTypeInfo(type, (HiveDecimalObjectInspector) inspector));
    case DATE:
      return new DateDataWriter((DateObjectInspector)inspector);
    default:
@@ -180,6 +183,22 @@ private DataWriter createWriter(ObjectInspector inspector, Type type) {
    }
  }
+  /**
+   * Return the decimal type information defined by the Parquet schema. This ensures the writer
+   * uses the declared precision/scale.
+   * @param type Type that contains information about the type schema.
+   * @param inspector The object inspector used to get the value type.
+   * @return DecimalTypeInfo The decimal type info object with proper precision and scale.
+   */
+  private DecimalTypeInfo getSchemaDecimalTypeInfo(Type type, HiveDecimalObjectInspector inspector) {
+    LogicalTypeAnnotation logicalType = type.getLogicalTypeAnnotation();
+    if (logicalType instanceof DecimalLogicalTypeAnnotation decimal) {
+      return new DecimalTypeInfo(decimal.getPrecision(), decimal.getScale());
+    }
+    // Fallback to the inspector's type info if the schema does not carry the logical annotation.
+    return (DecimalTypeInfo) inspector.getTypeInfo();
+  }
+
  /**
   * Checks that an inspector matches the category indicated as a parameter.
   * @param inspector The object inspector to check
@@ -559,16 +578,23 @@ boolean isValidTimestamp(Object fieldValue) {
  private class DecimalDataWriter implements DataWriter {
    private HiveDecimalObjectInspector inspector;
+    private final DecimalTypeInfo schemaDecimalTypeInfo;
-    public DecimalDataWriter(HiveDecimalObjectInspector inspector) {
+    public DecimalDataWriter(HiveDecimalObjectInspector inspector, DecimalTypeInfo schemaDecimalTypeInfo) {
      this.inspector = inspector;
+      this.schemaDecimalTypeInfo = schemaDecimalTypeInfo;
    }
    @Override
    public void write(Object value) {
      HiveDecimal vDecimal = inspector.getPrimitiveJavaObject(value);
-      DecimalTypeInfo decTypeInfo = (DecimalTypeInfo)inspector.getTypeInfo();
-      recordConsumer.addBinary(decimalToBinary(vDecimal, decTypeInfo));
+      // Enforce the Parquet schema precision/scale before converting to binary to avoid size mismatches.
+      HiveDecimal enforcedDecimal = HiveDecimalUtils.enforcePrecisionScale(vDecimal, schemaDecimalTypeInfo);
+      if (enforcedDecimal == null) {
+        throw new RuntimeException(
+            "Decimal value " + vDecimal + " does not fit in declared type " + schemaDecimalTypeInfo.getQualifiedName());
+      }
+      recordConsumer.addBinary(decimalToBinary(enforcedDecimal, schemaDecimalTypeInfo));
    }
    private Binary decimalToBinary(final HiveDecimal hiveDecimal, final DecimalTypeInfo decimalTypeInfo) {
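
For context on why the enforced type matters: Hive writes these decimals as Parquet FIXED_LEN_BYTE_ARRAY values whose width is derived from the declared precision, so a value that still carries the inspector's wider type serializes to more bytes than the file schema allows. A sketch of the minimum-width rule (the helper below is illustrative, not code from this patch):

import java.math.BigInteger;

// Smallest two's-complement byte width that can hold 10^precision - 1,
// matching the sizing rule for Parquet fixed-length decimal storage.
public class DecimalWidthSketch {
  static int minBytesForPrecision(int precision) {
    BigInteger max = BigInteger.TEN.pow(precision).subtract(BigInteger.ONE);
    int bytes = 1;
    while (max.bitLength() > 8 * bytes - 1) {
      bytes++;
    }
    return bytes;
  }

  public static void main(String[] args) {
    System.out.println(minBytesForPrecision(12)); // 6 -> decimal(12,7) target column
    System.out.println(minBytesForPrecision(17)); // 8 -> decimal(17,7) join key
    // A value still typed decimal(17,7) by its inspector serializes to 8 bytes
    // and cannot land in a column the file schema declared as decimal(12,7).
  }
}
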
diff --git a/ql/src/test/queries/clientpositive/parquet_join_dec_col_diff_pre_scale.q b/ql/src/test/queries/clientpositive/parquet_join_dec_col_diff_pre_scale.q
new file mode 100644
index 00000000000..779bf19e2d6
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/parquet_join_dec_col_diff_pre_scale.q
@@ -0,0 +1,21 @@
+CREATE TABLE table_a (col_dec_a decimal(12,7));
+CREATE TABLE table_b (col_dec_b decimal(15,5));
+INSERT INTO table_a VALUES (12345.6789101);
+INSERT INTO table_b VALUES (1234567891.01112);
+
+set hive.default.fileformat=parquet;
+
+explain create table target as
+select table_a.col_dec_a target_col
+from table_a
+left outer join table_b on
+table_a.col_dec_a = table_b.col_dec_b;
+
+create table target as
+select table_a.col_dec_a target_col
+from table_a
+left outer join table_b on
+table_a.col_dec_a = table_b.col_dec_b;
+
+desc target;
+select * from target;
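
The join above compares decimal(12,7) against decimal(15,5), and the plan in the .q.out below widens both keys to decimal(17,7). A sketch of that widening, assuming Hive's usual common-decimal-type rule (keep the larger integer-digit count and the larger scale, capped at precision 38); this sketch is ours, not code from this patch:

public class CommonDecimalSketch {
  public static void main(String[] args) {
    int p1 = 12, s1 = 7; // table_a.col_dec_a decimal(12,7)
    int p2 = 15, s2 = 5; // table_b.col_dec_b decimal(15,5)
    int intDigits = Math.max(p1 - s1, p2 - s2);      // max(5, 10) = 10
    int scale = Math.max(s1, s2);                    // max(7, 5)  = 7
    int precision = Math.min(38, intDigits + scale); // 17
    System.out.println("decimal(" + precision + "," + scale + ")"); // decimal(17,7)
  }
}
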
diff --git a/ql/src/test/results/clientpositive/llap/parquet_join_dec_col_diff_pre_scale.q.out b/ql/src/test/results/clientpositive/llap/parquet_join_dec_col_diff_pre_scale.q.out
new file mode 100644
index 00000000000..e416c28fb43
--- /dev/null
+++ b/ql/src/test/results/clientpositive/llap/parquet_join_dec_col_diff_pre_scale.q.out
@@ -0,0 +1,224 @@
+PREHOOK: query: CREATE TABLE table_a (col_dec_a decimal(12,7))
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@table_a
+POSTHOOK: query: CREATE TABLE table_a (col_dec_a decimal(12,7))
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@table_a
+PREHOOK: query: CREATE TABLE table_b (col_dec_b decimal(15,5))
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@table_b
+POSTHOOK: query: CREATE TABLE table_b (col_dec_b decimal(15,5))
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@table_b
+PREHOOK: query: INSERT INTO table_a VALUES (12345.6789101)
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@table_a
+POSTHOOK: query: INSERT INTO table_a VALUES (12345.6789101)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@table_a
+POSTHOOK: Lineage: table_a.col_dec_a SCRIPT []
+PREHOOK: query: INSERT INTO table_b VALUES (1234567891.01112)
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@table_b
+POSTHOOK: query: INSERT INTO table_b VALUES (1234567891.01112)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@table_b
+POSTHOOK: Lineage: table_b.col_dec_b SCRIPT []
+PREHOOK: query: explain create table target as
+select table_a.col_dec_a target_col
+from table_a
+left outer join table_b on
+table_a.col_dec_a = table_b.col_dec_b
+PREHOOK: type: CREATETABLE_AS_SELECT
+PREHOOK: Input: default@table_a
+PREHOOK: Input: default@table_b
+PREHOOK: Output: database:default
+PREHOOK: Output: default@target
+POSTHOOK: query: explain create table target as
+select table_a.col_dec_a target_col
+from table_a
+left outer join table_b on
+table_a.col_dec_a = table_b.col_dec_b
+POSTHOOK: type: CREATETABLE_AS_SELECT
+POSTHOOK: Input: default@table_a
+POSTHOOK: Input: default@table_b
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@target
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1
+ Stage-4 depends on stages: Stage-0, Stage-2
+ Stage-3 depends on stages: Stage-4
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Tez
+#### A masked pattern was here ####
+ Edges:
+ Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 4 (SIMPLE_EDGE)
+ Reducer 3 <- Reducer 2 (CUSTOM_SIMPLE_EDGE)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
+ Map Operator Tree:
+ TableScan
+ alias: table_a
+ Statistics: Num rows: 1 Data size: 112 Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: col_dec_a (type: decimal(12,7))
+ outputColumnNames: _col0
+ Statistics: Num rows: 1 Data size: 112 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: _col0 (type: decimal(17,7))
+ null sort order: z
+ sort order: +
+ Map-reduce partition columns: _col0 (type: decimal(17,7))
+ Statistics: Num rows: 1 Data size: 112 Basic stats: COMPLETE Column stats: COMPLETE
+ Execution mode: vectorized, llap
+ LLAP IO: all inputs
+ Map 4
+ Map Operator Tree:
+ TableScan
+ alias: table_b
+ filterExpr: col_dec_b is not null (type: boolean)
+ Statistics: Num rows: 1 Data size: 112 Basic stats: COMPLETE Column stats: COMPLETE
+ Filter Operator
+ predicate: col_dec_b is not null (type: boolean)
+ Statistics: Num rows: 1 Data size: 112 Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: col_dec_b (type: decimal(15,5))
+ outputColumnNames: _col0
+ Statistics: Num rows: 1 Data size: 112 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: _col0 (type: decimal(17,7))
+ null sort order: z
+ sort order: +
+ Map-reduce partition columns: _col0 (type: decimal(17,7))
+ Statistics: Num rows: 1 Data size: 112 Basic stats: COMPLETE Column stats: COMPLETE
+ Execution mode: vectorized, llap
+ LLAP IO: all inputs
+ Reducer 2
+ Execution mode: llap
+ Reduce Operator Tree:
+ Merge Join Operator
+ condition map:
+ Left Outer Join 0 to 1
+ keys:
+ 0 _col0 (type: decimal(17,7))
+ 1 _col0 (type: decimal(17,7))
+ outputColumnNames: _col0
+ Statistics: Num rows: 1 Data size: 112 Basic stats: COMPLETE Column stats: COMPLETE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 1 Data size: 112 Basic stats: COMPLETE Column stats: COMPLETE
+ table:
+ input format: org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat
+ output format: org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat
+ serde: org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
+ name: default.target
+ Select Operator
+ expressions: _col0 (type: decimal(12,7))
+ outputColumnNames: col1
+ Statistics: Num rows: 1 Data size: 112 Basic stats: COMPLETE Column stats: COMPLETE
+ Group By Operator
+ aggregations: min(col1), max(col1), count(1), count(col1), compute_bit_vector_hll(col1)
+ minReductionHashAggr: 0.4
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2, _col3, _col4
+ Statistics: Num rows: 1 Data size: 384 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ null sort order:
+ sort order:
+ Statistics: Num rows: 1 Data size: 384 Basic stats: COMPLETE Column stats: COMPLETE
+ value expressions: _col0 (type: decimal(12,7)), _col1 (type: decimal(12,7)), _col2 (type: bigint), _col3 (type: bigint), _col4 (type: binary)
+ Reducer 3
+ Execution mode: vectorized, llap
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations: min(VALUE._col0), max(VALUE._col1), count(VALUE._col2), count(VALUE._col3), compute_bit_vector_hll(VALUE._col4)
+ mode: mergepartial
+ outputColumnNames: _col0, _col1, _col2, _col3, _col4
+ Statistics: Num rows: 1 Data size: 384 Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: 'DECIMAL' (type: string), _col0 (type: decimal(12,7)), _col1 (type: decimal(12,7)), (_col2 - _col3) (type: bigint), COALESCE(ndv_compute_bit_vector(_col4),0) (type: bigint), _col4 (type: binary)
+ outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5
+ Statistics: Num rows: 1 Data size: 475 Basic stats: COMPLETE Column stats: COMPLETE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 1 Data size: 475 Basic stats: COMPLETE Column stats: COMPLETE
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ Stage: Stage-2
+ Dependency Collection
+
+ Stage: Stage-4
+ Create Table
+ columns: target_col decimal(12,7)
+ name: default.target
+ input format: org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat
+ output format: org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat
+ serde name: org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
+
+ Stage: Stage-3
+ Stats Work
+ Basic Stats Work:
+ Column Stats Desc:
+ Columns: target_col
+ Column Types: decimal(12,7)
+ Table: default.target
+
+ Stage: Stage-0
+ Move Operator
+ files:
+ hdfs directory: true
+#### A masked pattern was here ####
+
+PREHOOK: query: create table target as
+select table_a.col_dec_a target_col
+from table_a
+left outer join table_b on
+table_a.col_dec_a = table_b.col_dec_b
+PREHOOK: type: CREATETABLE_AS_SELECT
+PREHOOK: Input: default@table_a
+PREHOOK: Input: default@table_b
+PREHOOK: Output: database:default
+PREHOOK: Output: default@target
+POSTHOOK: query: create table target as
+select table_a.col_dec_a target_col
+from table_a
+left outer join table_b on
+table_a.col_dec_a = table_b.col_dec_b
+POSTHOOK: type: CREATETABLE_AS_SELECT
+POSTHOOK: Input: default@table_a
+POSTHOOK: Input: default@table_b
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@target
+POSTHOOK: Lineage: target.target_col SIMPLE [(table_a)table_a.FieldSchema(name:col_dec_a, type:decimal(12,7), comment:null), ]
+PREHOOK: query: desc target
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@target
+POSTHOOK: query: desc target
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@target
+target_col decimal(12,7)
+PREHOOK: query: select * from target
+PREHOOK: type: QUERY
+PREHOOK: Input: default@target
+#### A masked pattern was here ####
+POSTHOOK: query: select * from target
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@target
+#### A masked pattern was here ####
+12345.6789101