Simhadri Govindappa created HIVE-28087:
------------------------------------------

             Summary: Hive Iceberg: Insert into partitioned table fails if the data is not clustered
                 Key: HIVE-28087
                 URL: https://issues.apache.org/jira/browse/HIVE-28087
             Project: Hive
          Issue Type: Task
            Reporter: Simhadri Govindappa
            Assignee: Simhadri Govindappa
         Attachments: query-hive-377.csv

Inserting into a partitioned Iceberg table fails with the following error if 
the incoming data is not clustered by partition.


{noformat}
Caused by: java.lang.IllegalStateException: Incoming records violate the writer 
assumption that records are clustered by spec and by partition within each 
spec. Either cluster the incoming records or switch to fanout writers.
Encountered records that belong to already closed files:
partition 'ts_month=2027-03' in spec [
  1000: ts_month: month(2)
]
    at org.apache.iceberg.io.ClusteredWriter.write(ClusteredWriter.java:96)
    at org.apache.iceberg.io.ClusteredDataWriter.write(ClusteredDataWriter.java:31)
    at org.apache.iceberg.mr.hive.writer.HiveIcebergRecordWriter.write(HiveIcebergRecordWriter.java:53)
    at org.apache.hadoop.hive.ql.exec.FileSinkOperator.process(FileSinkOperator.java:1181)
    at org.apache.hadoop.hive.ql.exec.vector.VectorFileSinkOperator.process(VectorFileSinkOperator.java:111)
    at org.apache.hadoop.hive.ql.exec.Operator.vectorForward(Operator.java:919)
    at org.apache.hadoop.hive.ql.exec.vector.VectorSelectOperator.process(VectorSelectOperator.java:158)
    at org.apache.hadoop.hive.ql.exec.tez.ReduceRecordSource.processVectorGroup(ReduceRecordSource.java:502)
    ... 20 more
{noformat}


A simple repro, using the attached CSV file:
[^query-hive-377.csv]
{noformat}
-- Create the database used by this repro and switch to it.
CREATE DATABASE t3;

USE t3;

-- Comma-delimited text table that receives the rows of the attached CSV file.
CREATE TABLE vector1k (
    t   INT,
    si  INT,
    i   INT,
    b   BIGINT,
    f   FLOAT,
    d   DOUBLE,
    dc  DECIMAL(38,18),
    bo  BOOLEAN,
    s   STRING,
    s2  STRING,
    ts  TIMESTAMP,
    ts2 TIMESTAMP,
    dt  DATE
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';

LOAD DATA LOCAL INPATH "/query-hive-377.csv" OVERWRITE INTO TABLE vector1k;

SELECT * FROM vector1k;

-- Unpartitioned Iceberg table (ORC file format) with the same column list.
CREATE TABLE vectortab10k (
    t   INT,
    si  INT,
    i   INT,
    b   BIGINT,
    f   FLOAT,
    d   DOUBLE,
    dc  DECIMAL(38,18),
    bo  BOOLEAN,
    s   STRING,
    s2  STRING,
    ts  TIMESTAMP,
    ts2 TIMESTAMP,
    dt  DATE
)
STORED BY ICEBERG
STORED AS ORC;

INSERT INTO vectortab10k SELECT * FROM vector1k;

SELECT COUNT(*) FROM vectortab10k LIMIT 10;

-- Iceberg table partitioned by the month transform of ts.
CREATE TABLE partition_transform_4 (
    t  INT,
    ts TIMESTAMP
)
PARTITIONED BY SPEC (month(ts))
STORED BY ICEBERG;

-- Insert into the partitioned table: the step this report describes as failing.
INSERT INTO TABLE partition_transform_4
SELECT t, ts
FROM vectortab10k;
{noformat}



--
This message was sent by Atlassian Jira
(v8.20.10#820010)

Reply via email to