This is an automated email from the ASF dual-hosted git repository.

soumyakantidas pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git


The following commit(s) were added to refs/heads/master by this push:
     new 5c1a99aa83a HIVE-26877: Parquet CTAS with JOIN on decimals with different precision/scale fails (#6274)
5c1a99aa83a is described below

commit 5c1a99aa83a1c93dadcdfba8360445f8fc376756
Author: Dayakar M <[email protected]>
AuthorDate: Sat Jan 24 11:53:54 2026 +0530

    HIVE-26877: Parquet CTAS with JOIN on decimals with different precision/scale fails (#6274)
---
 .../ql/io/parquet/write/DataWritableWriter.java    |  34 +++-
 .../parquet_join_dec_col_diff_pre_scale.q          |  21 ++
 .../llap/parquet_join_dec_col_diff_pre_scale.q.out | 224 +++++++++++++++++++++
 3 files changed, 275 insertions(+), 4 deletions(-)
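
Why the bug bites: after the JOIN below, the ObjectInspector describes the
widened join-key type decimal(17,7) while the Parquet schema of the CTAS
target declares decimal(12,7). Hive's Parquet writer stores such decimals as
FIXED_LEN_BYTE_ARRAY, whose length is fixed by the declared precision, so
sizing the binary from the inspector's type yields a length that disagrees
with the target field. A minimal sketch of the byte-width arithmetic
(bytesForPrecision is a hypothetical illustration, not Hive code):

    // Smallest byte count n such that 2^(8n - 1) > 10^precision, i.e. the
    // FIXED_LEN_BYTE_ARRAY length needed for a two's-complement unscaled
    // decimal of the given precision (sign bit included).
    static int bytesForPrecision(int precision) {
      double bits = precision * (Math.log(10) / Math.log(2)) + 1;
      return (int) Math.ceil(bits / 8);
    }
    // bytesForPrecision(12) == 6, bytesForPrecision(17) == 8: a binary sized
    // for decimal(17,7) cannot land in a field declared decimal(12,7), which
    // is the mismatch this patch guards against.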

diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/write/DataWritableWriter.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/write/DataWritableWriter.java
index 6915c08f58f..7d3e3fcb245 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/write/DataWritableWriter.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/write/DataWritableWriter.java
@@ -45,10 +45,12 @@
 import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;
 import org.apache.hadoop.hive.serde2.objectinspector.primitive.TimestampObjectInspector;
 import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.HiveDecimalUtils;
 import org.apache.parquet.io.api.Binary;
 import org.apache.parquet.io.api.RecordConsumer;
 import org.apache.parquet.schema.GroupType;
 import org.apache.parquet.schema.LogicalTypeAnnotation;
+import org.apache.parquet.schema.LogicalTypeAnnotation.DecimalLogicalTypeAnnotation;
 import org.apache.parquet.schema.LogicalTypeAnnotation.ListLogicalTypeAnnotation;
 import org.apache.parquet.schema.LogicalTypeAnnotation.MapLogicalTypeAnnotation;
 import org.apache.parquet.schema.Type;
@@ -157,7 +159,8 @@ private DataWriter createWriter(ObjectInspector inspector, Type type) {
         case TIMESTAMP:
           return new TimestampDataWriter((TimestampObjectInspector)inspector);
         case DECIMAL:
-          return new DecimalDataWriter((HiveDecimalObjectInspector)inspector);
+          return new DecimalDataWriter((HiveDecimalObjectInspector) inspector,
+              getSchemaDecimalTypeInfo(type, (HiveDecimalObjectInspector) inspector));
         case DATE:
           return new DateDataWriter((DateObjectInspector)inspector);
         default:
@@ -180,6 +183,22 @@ private DataWriter createWriter(ObjectInspector inspector, Type type) {
     }
   }
 
+  /**
+   * Returns the decimal type information defined by the Parquet schema, so the writer
+   * uses the declared precision/scale.
+   * @param type Type that contains information about the type schema.
+   * @param inspector The object inspector used to get the value type.
+   * @return The decimal type info with the schema-declared precision and scale.
+   */
+  private DecimalTypeInfo getSchemaDecimalTypeInfo(Type type, HiveDecimalObjectInspector inspector) {
+    LogicalTypeAnnotation logicalType = type.getLogicalTypeAnnotation();
+    if (logicalType instanceof DecimalLogicalTypeAnnotation decimal) {
+      return new DecimalTypeInfo(decimal.getPrecision(), decimal.getScale());
+    }
+    // Fall back to the inspector's type info if the schema does not carry the logical annotation.
+    return (DecimalTypeInfo) inspector.getTypeInfo();
+  }
+
   /**
    * Checks that an inspector matches the category indicated as a parameter.
    * @param inspector The object inspector to check
@@ -559,16 +578,23 @@ boolean isValidTimestamp(Object fieldValue) {
 
   private class DecimalDataWriter implements DataWriter {
     private HiveDecimalObjectInspector inspector;
+    private final DecimalTypeInfo schemaDecimalTypeInfo;
 
-    public DecimalDataWriter(HiveDecimalObjectInspector inspector) {
+    public DecimalDataWriter(HiveDecimalObjectInspector inspector, DecimalTypeInfo schemaDecimalTypeInfo) {
       this.inspector = inspector;
+      this.schemaDecimalTypeInfo = schemaDecimalTypeInfo;
     }
 
     @Override
     public void write(Object value) {
       HiveDecimal vDecimal = inspector.getPrimitiveJavaObject(value);
-      DecimalTypeInfo decTypeInfo = (DecimalTypeInfo)inspector.getTypeInfo();
-      recordConsumer.addBinary(decimalToBinary(vDecimal, decTypeInfo));
+      // Enforce the Parquet schema precision/scale before converting to binary to avoid size mismatches.
+      HiveDecimal enforcedDecimal = HiveDecimalUtils.enforcePrecisionScale(vDecimal, schemaDecimalTypeInfo);
+      if (enforcedDecimal == null) {
+        throw new RuntimeException(
+            "Decimal value " + vDecimal + " does not fit in declared type " + 
schemaDecimalTypeInfo.getQualifiedName());
+      }
+      recordConsumer.addBinary(decimalToBinary(enforcedDecimal, schemaDecimalTypeInfo));
     }
 
     private Binary decimalToBinary(final HiveDecimal hiveDecimal, final DecimalTypeInfo decimalTypeInfo) {
diff --git a/ql/src/test/queries/clientpositive/parquet_join_dec_col_diff_pre_scale.q b/ql/src/test/queries/clientpositive/parquet_join_dec_col_diff_pre_scale.q
new file mode 100644
index 00000000000..779bf19e2d6
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/parquet_join_dec_col_diff_pre_scale.q
@@ -0,0 +1,21 @@
+CREATE TABLE table_a (col_dec_a decimal(12,7));
+CREATE TABLE table_b (col_dec_b decimal(15,5));
+INSERT INTO table_a VALUES (12345.6789101);
+INSERT INTO table_b VALUES (1234567891.01112);
+
+set hive.default.fileformat=parquet;
+
+explain create table target as
+select table_a.col_dec_a target_col
+from table_a
+left outer join table_b on
+table_a.col_dec_a = table_b.col_dec_b;
+
+create table target as
+select table_a.col_dec_a target_col
+from table_a
+left outer join table_b on
+table_a.col_dec_a = table_b.col_dec_b;
+
+desc target;
+select * from target;
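
The test joins decimal(12,7) against decimal(15,5). Assuming Hive's usual
common-type rule for decimal comparisons (scale = max of scales, integer
digits = max of integer digits, precision capped at 38), the join keys are
widened as follows:

    scale     = max(7, 5)         = 7
    intDigits = max(12-7, 15-5)   = 10
    precision = intDigits + scale = 17    -> decimal(17,7)

The plan in the golden file below therefore compares the keys as
decimal(17,7) while the CTAS target keeps decimal(12,7), reproducing the
writer-vs-schema mismatch fixed above.
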
diff --git a/ql/src/test/results/clientpositive/llap/parquet_join_dec_col_diff_pre_scale.q.out b/ql/src/test/results/clientpositive/llap/parquet_join_dec_col_diff_pre_scale.q.out
new file mode 100644
index 00000000000..e416c28fb43
--- /dev/null
+++ b/ql/src/test/results/clientpositive/llap/parquet_join_dec_col_diff_pre_scale.q.out
@@ -0,0 +1,224 @@
+PREHOOK: query: CREATE TABLE table_a (col_dec_a decimal(12,7))
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@table_a
+POSTHOOK: query: CREATE TABLE table_a (col_dec_a decimal(12,7))
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@table_a
+PREHOOK: query: CREATE TABLE table_b (col_dec_b decimal(15,5))
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@table_b
+POSTHOOK: query: CREATE TABLE table_b (col_dec_b decimal(15,5))
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@table_b
+PREHOOK: query: INSERT INTO table_a VALUES (12345.6789101)
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@table_a
+POSTHOOK: query: INSERT INTO table_a VALUES (12345.6789101)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@table_a
+POSTHOOK: Lineage: table_a.col_dec_a SCRIPT []
+PREHOOK: query: INSERT INTO table_b VALUES (1234567891.01112)
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@table_b
+POSTHOOK: query: INSERT INTO table_b VALUES (1234567891.01112)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@table_b
+POSTHOOK: Lineage: table_b.col_dec_b SCRIPT []
+PREHOOK: query: explain create table target as
+select table_a.col_dec_a target_col
+from table_a
+left outer join table_b on
+table_a.col_dec_a = table_b.col_dec_b
+PREHOOK: type: CREATETABLE_AS_SELECT
+PREHOOK: Input: default@table_a
+PREHOOK: Input: default@table_b
+PREHOOK: Output: database:default
+PREHOOK: Output: default@target
+POSTHOOK: query: explain create table target as
+select table_a.col_dec_a target_col
+from table_a
+left outer join table_b on
+table_a.col_dec_a = table_b.col_dec_b
+POSTHOOK: type: CREATETABLE_AS_SELECT
+POSTHOOK: Input: default@table_a
+POSTHOOK: Input: default@table_b
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@target
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-2 depends on stages: Stage-1
+  Stage-4 depends on stages: Stage-0, Stage-2
+  Stage-3 depends on stages: Stage-4
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Tez
+#### A masked pattern was here ####
+      Edges:
+        Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 4 (SIMPLE_EDGE)
+        Reducer 3 <- Reducer 2 (CUSTOM_SIMPLE_EDGE)
+#### A masked pattern was here ####
+      Vertices:
+        Map 1 
+            Map Operator Tree:
+                TableScan
+                  alias: table_a
+                  Statistics: Num rows: 1 Data size: 112 Basic stats: COMPLETE Column stats: COMPLETE
+                  Select Operator
+                    expressions: col_dec_a (type: decimal(12,7))
+                    outputColumnNames: _col0
+                    Statistics: Num rows: 1 Data size: 112 Basic stats: COMPLETE Column stats: COMPLETE
+                    Reduce Output Operator
+                      key expressions: _col0 (type: decimal(17,7))
+                      null sort order: z
+                      sort order: +
+                      Map-reduce partition columns: _col0 (type: decimal(17,7))
+                      Statistics: Num rows: 1 Data size: 112 Basic stats: COMPLETE Column stats: COMPLETE
+            Execution mode: vectorized, llap
+            LLAP IO: all inputs
+        Map 4 
+            Map Operator Tree:
+                TableScan
+                  alias: table_b
+                  filterExpr: col_dec_b is not null (type: boolean)
+                  Statistics: Num rows: 1 Data size: 112 Basic stats: COMPLETE Column stats: COMPLETE
+                  Filter Operator
+                    predicate: col_dec_b is not null (type: boolean)
+                    Statistics: Num rows: 1 Data size: 112 Basic stats: COMPLETE Column stats: COMPLETE
+                    Select Operator
+                      expressions: col_dec_b (type: decimal(15,5))
+                      outputColumnNames: _col0
+                      Statistics: Num rows: 1 Data size: 112 Basic stats: COMPLETE Column stats: COMPLETE
+                      Reduce Output Operator
+                        key expressions: _col0 (type: decimal(17,7))
+                        null sort order: z
+                        sort order: +
+                        Map-reduce partition columns: _col0 (type: decimal(17,7))
+                        Statistics: Num rows: 1 Data size: 112 Basic stats: COMPLETE Column stats: COMPLETE
+            Execution mode: vectorized, llap
+            LLAP IO: all inputs
+        Reducer 2 
+            Execution mode: llap
+            Reduce Operator Tree:
+              Merge Join Operator
+                condition map:
+                     Left Outer Join 0 to 1
+                keys:
+                  0 _col0 (type: decimal(17,7))
+                  1 _col0 (type: decimal(17,7))
+                outputColumnNames: _col0
+                Statistics: Num rows: 1 Data size: 112 Basic stats: COMPLETE Column stats: COMPLETE
+                File Output Operator
+                  compressed: false
+                  Statistics: Num rows: 1 Data size: 112 Basic stats: COMPLETE Column stats: COMPLETE
+                  table:
+                      input format: org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat
+                      output format: org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat
+                      serde: org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
+                      name: default.target
+                Select Operator
+                  expressions: _col0 (type: decimal(12,7))
+                  outputColumnNames: col1
+                  Statistics: Num rows: 1 Data size: 112 Basic stats: COMPLETE Column stats: COMPLETE
+                  Group By Operator
+                    aggregations: min(col1), max(col1), count(1), count(col1), compute_bit_vector_hll(col1)
+                    minReductionHashAggr: 0.4
+                    mode: hash
+                    outputColumnNames: _col0, _col1, _col2, _col3, _col4
+                    Statistics: Num rows: 1 Data size: 384 Basic stats: COMPLETE Column stats: COMPLETE
+                    Reduce Output Operator
+                      null sort order: 
+                      sort order: 
+                      Statistics: Num rows: 1 Data size: 384 Basic stats: COMPLETE Column stats: COMPLETE
+                      value expressions: _col0 (type: decimal(12,7)), _col1 (type: decimal(12,7)), _col2 (type: bigint), _col3 (type: bigint), _col4 (type: binary)
+        Reducer 3 
+            Execution mode: vectorized, llap
+            Reduce Operator Tree:
+              Group By Operator
+                aggregations: min(VALUE._col0), max(VALUE._col1), count(VALUE._col2), count(VALUE._col3), compute_bit_vector_hll(VALUE._col4)
+                mode: mergepartial
+                outputColumnNames: _col0, _col1, _col2, _col3, _col4
+                Statistics: Num rows: 1 Data size: 384 Basic stats: COMPLETE Column stats: COMPLETE
+                Select Operator
+                  expressions: 'DECIMAL' (type: string), _col0 (type: decimal(12,7)), _col1 (type: decimal(12,7)), (_col2 - _col3) (type: bigint), COALESCE(ndv_compute_bit_vector(_col4),0) (type: bigint), _col4 (type: binary)
+                  outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5
+                  Statistics: Num rows: 1 Data size: 475 Basic stats: COMPLETE Column stats: COMPLETE
+                  File Output Operator
+                    compressed: false
+                    Statistics: Num rows: 1 Data size: 475 Basic stats: COMPLETE Column stats: COMPLETE
+                    table:
+                        input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                        output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                        serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-2
+    Dependency Collection
+
+  Stage: Stage-4
+    Create Table
+      columns: target_col decimal(12,7)
+      name: default.target
+      input format: org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat
+      output format: org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat
+      serde name: org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
+
+  Stage: Stage-3
+    Stats Work
+      Basic Stats Work:
+      Column Stats Desc:
+          Columns: target_col
+          Column Types: decimal(12,7)
+          Table: default.target
+
+  Stage: Stage-0
+    Move Operator
+      files:
+          hdfs directory: true
+#### A masked pattern was here ####
+
+PREHOOK: query: create table target as
+select table_a.col_dec_a target_col
+from table_a
+left outer join table_b on
+table_a.col_dec_a = table_b.col_dec_b
+PREHOOK: type: CREATETABLE_AS_SELECT
+PREHOOK: Input: default@table_a
+PREHOOK: Input: default@table_b
+PREHOOK: Output: database:default
+PREHOOK: Output: default@target
+POSTHOOK: query: create table target as
+select table_a.col_dec_a target_col
+from table_a
+left outer join table_b on
+table_a.col_dec_a = table_b.col_dec_b
+POSTHOOK: type: CREATETABLE_AS_SELECT
+POSTHOOK: Input: default@table_a
+POSTHOOK: Input: default@table_b
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@target
+POSTHOOK: Lineage: target.target_col SIMPLE [(table_a)table_a.FieldSchema(name:col_dec_a, type:decimal(12,7), comment:null), ]
+PREHOOK: query: desc target
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@target
+POSTHOOK: query: desc target
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@target
+target_col             decimal(12,7)                               
+PREHOOK: query: select * from target
+PREHOOK: type: QUERY
+PREHOOK: Input: default@target
+#### A masked pattern was here ####
+POSTHOOK: query: select * from target
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@target
+#### A masked pattern was here ####
+12345.6789101
