[ 
https://issues.apache.org/jira/browse/HIVE-18763?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Sergey Shelukhin updated HIVE-18763:
------------------------------------
    Description: 
When table and partition schema differ, non-vectorized MapOperator does row by 
row conversion from whatever is read to the table schema.
VectorMapOperator is less consistent... it does the conversion as part of 
populating VRBs in row/serde modes (used to read e.g. text row-by-row or 
natively, and make VRBs); see VectorDeserializeRow class convert... methods 
for an example. However, the native VRB mode relies on ORC ConvertTreeReader... 
stuff that lives in ORC, and so never converts anything inside VMO.

So, anything running in native VRB mode that is not the vanilla ORC reader will 
produce data with incorrect schema - there are two such cases right now, LLAP 
IO with ORC or text data, and Parquet. 
It's possible to extend ConvertTreeReader... stuff to LLAP IO ORC that already 
uses TreeReader-s for everything; LLAP IO text and Parquet however will have to 
invent their own conversion.
Therefore, I think the best fix for this is to treat all inputs in VMO the same 
and convert them by default, like the regular MapOperator; and make ORC special 
mode an exception that allows it to bypass the conversion. 
cc [~mmccline]

Test case - varchar column length should be limited after alter table but it 
isn't.
{noformat}
CREATE TABLE schema_evolution_data(insert_num int, boolean1 boolean, tinyint1 
tinyint, smallint1 smallint, int1 int, bigint1 bigint, decimal1 decimal(38,18), 
float1 float, double1 double, string1 varchar(50), string2 varchar(50), date1 
date, timestamp1 timestamp, boolean_str string, tinyint_str string, 
smallint_str string, int_str string, bigint_str string, decimal_str string, 
float_str string, double_str string, date_str string, timestamp_str string, 
filler string)
row format delimited fields terminated by '|' stored as textfile;
load data local inpath 
'../../data/files/schema_evolution/schema_evolution_data.txt' overwrite into 
table schema_evolution_data;


drop table if exists vsp;
create table vsp(vs varchar(50)) partitioned by(s varchar(50)) stored as 
textfile;
insert into table vsp partition(s='positive') select string1 from 
schema_evolution_data;
alter table vsp change column vs vs varchar(3);

drop table if exists vsp_orc;
create table vsp_orc(vs varchar(50)) partitioned by(s varchar(50)) stored as 
orc;
insert into table vsp_orc partition(s='positive') select string1 from 
schema_evolution_data;
alter table vsp_orc change column vs vs varchar(3);

drop table if exists vsp_parquet;
create table vsp_parquet(vs varchar(50)) partitioned by(s varchar(50)) stored 
as parquet;
insert into table vsp_parquet partition(s='positive') select string1 from 
schema_evolution_data;
alter table vsp_parquet change column vs vs varchar(3);

SET hive.llap.io.enabled=true;
-- BAD results from all queries; parquet affected regardless of IO.
select length(vs) from vsp; 
select length(vs) from vsp_orc;
select length(vs) from vsp_parquet;

SET hive.llap.io.enabled=false;
select length(vs) from vsp; -- ok
select length(vs) from vsp_orc; -- ok
select length(vs) from vsp_parquet; -- still bad
{noformat}

  was:
When table and partition schema differ, non-vectorized MapOperator does row by 
row conversion from whatever is read to the table schema.
VectorMapOperator is less consistent... it does the conversion as part of 
populating VRBs in row/serde modes (used to read e.g. text row-by-row or 
natively, and make VRBs); see  VectorDeserializeRow class convert... methods 
for an example. However, the native VRB mode relies on ORC ConvertTreeReader... 
stuff that lives in ORC and never converts anything.

So, anything running in native VRB mode that is not the vanilla ORC reader will 
produce data with incorrect schema - there are two such cases right now, LLAP 
IO with ORC or text data, and Parquet. 
It's possible to extend ConvertTreeReader... stuff to LLAP IO ORC that already 
uses TreeReader-s for everything; LLAP IO text and Parquet however will have to 
invent their own conversion.
Therefore, I think the best fix for this is to treat all inputs in VMO the same 
and convert them by default, like the regular MapOperator; and make ORC special 
mode an exception that allows it to bypass the conversion. 
cc [~mmccline]

Test case - varchar column length should be limited after alter table but it 
isn't.
{noformat}
CREATE TABLE schema_evolution_data(insert_num int, boolean1 boolean, tinyint1 
tinyint, smallint1 smallint, int1 int, bigint1 bigint, decimal1 decimal(38,18), 
float1 float, double1 double, string1 varchar(50), string2 varchar(50), date1 
date, timestamp1 timestamp, boolean_str string, tinyint_str string, 
smallint_str string, int_str string, bigint_str string, decimal_str string, 
float_str string, double_str string, date_str string, timestamp_str string, 
filler string)
row format delimited fields terminated by '|' stored as textfile;
load data local inpath 
'../../data/files/schema_evolution/schema_evolution_data.txt' overwrite into 
table schema_evolution_data;


drop table if exists vsp;
create table vsp(vs varchar(50)) partitioned by(s varchar(50)) stored as 
textfile;
insert into table vsp partition(s='positive') select string1 from 
schema_evolution_data;
alter table vsp change column vs vs varchar(3);

drop table if exists vsp_orc;
create table vsp_orc(vs varchar(50)) partitioned by(s varchar(50)) stored as 
orc;
insert into table vsp_orc partition(s='positive') select string1 from 
schema_evolution_data;
alter table vsp_orc change column vs vs varchar(3);

drop table if exists vsp_parquet;
create table vsp_parquet(vs varchar(50)) partitioned by(s varchar(50)) stored 
as parquet;
insert into table vsp_parquet partition(s='positive') select string1 from 
schema_evolution_data;
alter table vsp_parquet change column vs vs varchar(3);

SET hive.llap.io.enabled=true;
-- BAD results from all queries; parquet affected regardless of IO.
select length(vs) from vsp; 
select length(vs) from vsp_orc;
select length(vs) from vsp_parquet;

SET hive.llap.io.enabled=false;
select length(vs) from vsp; -- ok
select length(vs) from vsp_orc; -- ok
select length(vs) from vsp_parquet; -- still bad
{noformat}


> VectorMapOperator should take into account partition->table serde conversion 
> for all cases
> ------------------------------------------------------------------------------------------
>
>                 Key: HIVE-18763
>                 URL: https://issues.apache.org/jira/browse/HIVE-18763
>             Project: Hive
>          Issue Type: Bug
>            Reporter: Sergey Shelukhin
>            Priority: Major
>
> When table and partition schema differ, non-vectorized MapOperator does row 
> by row conversion from whatever is read to the table schema.
> VectorMapOperator is less consistent... it does the conversion as part of 
> populating VRBs in row/serde modes (used to read e.g. text row-by-row or 
> natively, and make VRBs); see VectorDeserializeRow class convert... methods 
> for an example. However, the native VRB mode relies on ORC 
> ConvertTreeReader... stuff that lives in ORC, and so never converts anything 
> inside VMO.
> So, anything running in native VRB mode that is not the vanilla ORC reader 
> will produce data with incorrect schema - there are two such cases right now, 
> LLAP IO with ORC or text data, and Parquet. 
> It's possible to extend ConvertTreeReader... stuff to LLAP IO ORC that 
> already uses TreeReader-s for everything; LLAP IO text and Parquet however 
> will have to invent their own conversion.
> Therefore, I think the best fix for this is to treat all inputs in VMO the 
> same and convert them by default, like the regular MapOperator; and make ORC 
> special mode an exception that allows it to bypass the conversion. 
> cc [~mmccline]
> Test case - varchar column length should be limited after alter table but it 
> isn't.
> {noformat}
> CREATE TABLE schema_evolution_data(insert_num int, boolean1 boolean, tinyint1 
> tinyint, smallint1 smallint, int1 int, bigint1 bigint, decimal1 
> decimal(38,18), float1 float, double1 double, string1 varchar(50), string2 
> varchar(50), date1 date, timestamp1 timestamp, boolean_str string, 
> tinyint_str string, smallint_str string, int_str string, bigint_str string, 
> decimal_str string, float_str string, double_str string, date_str string, 
> timestamp_str string, filler string)
> row format delimited fields terminated by '|' stored as textfile;
> load data local inpath 
> '../../data/files/schema_evolution/schema_evolution_data.txt' overwrite into 
> table schema_evolution_data;
> drop table if exists vsp;
> create table vsp(vs varchar(50)) partitioned by(s varchar(50)) stored as 
> textfile;
> insert into table vsp partition(s='positive') select string1 from 
> schema_evolution_data;
> alter table vsp change column vs vs varchar(3);
> drop table if exists vsp_orc;
> create table vsp_orc(vs varchar(50)) partitioned by(s varchar(50)) stored as 
> orc;
> insert into table vsp_orc partition(s='positive') select string1 from 
> schema_evolution_data;
> alter table vsp_orc change column vs vs varchar(3);
> drop table if exists vsp_parquet;
> create table vsp_parquet(vs varchar(50)) partitioned by(s varchar(50)) stored 
> as parquet;
> insert into table vsp_parquet partition(s='positive') select string1 from 
> schema_evolution_data;
> alter table vsp_parquet change column vs vs varchar(3);
> SET hive.llap.io.enabled=true;
> -- BAD results from all queries; parquet affected regardless of IO.
> select length(vs) from vsp; 
> select length(vs) from vsp_orc;
> select length(vs) from vsp_parquet;
> SET hive.llap.io.enabled=false;
> select length(vs) from vsp; -- ok
> select length(vs) from vsp_orc; -- ok
> select length(vs) from vsp_parquet; -- still bad
> {noformat}



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

Reply via email to