[
https://issues.apache.org/jira/browse/HIVE-14044?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
David Nies updated HIVE-14044:
------------------------------
Description:
When {{\n}} characters are contained in Avro files that are used as data bases
for an external table, the result of {{SELECT}} queries may be corrupt. I
encountered this error when querying hive both from {{beeline}} and from JDBC.
h3. Steps to reproduce (used files are attached to ticket)
# Create an {{.avro}} file that contains newline characters in a value of a map:
{code}
avro-tools fromjson --schema-file test.schema test.json > test.avro
{code}
# Copy {{.avro}} file to HDFS
{code}
hdfs dfs -copyFromLocal test.avro /some/location/
{code}
# Create an external table in beeline containing this {{.avro}}:
{code}
beeline> CREATE EXTERNAL TABLE broken_newline_map
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
STORED AS
INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
LOCATION '/user/dnies/hive-test/broken-newline/db'
TBLPROPERTIES ('avro.schema.literal'='
{
"type" : "record",
"name" : "myEntry",
"namespace" : "myNamespace",
"fields" : [ {
"name" : "foo",
"type" : "long"
}, {
"name" : "bar",
"type" : {
"type" : "map",
"values" : "string"
}
} ]
}
');
{code}
# Now, selecting may return corrupt results:
{code}
jdbc:hive2://my-server:10000/> select * from broken_newline_map;
+-------------------------+---------------------------------------------------+--+
| broken_newline_map.foo | broken_newline_map.bar |
+-------------------------+---------------------------------------------------+--+
| 1 | {"key2":"value2","key1":"value1\nafter newline"} |
| 2 | {"key2":"new value2","key1":"new value"} |
+-------------------------+---------------------------------------------------+--+
2 rows selected (1.661 seconds)
jdbc:hive2://my-server:10000/> select foo, map_keys(bar), map_values(bar) from
broken_newline_map;
+-------+------------------+-----------------------------+--+
| foo | _c1 | _c2 |
+-------+------------------+-----------------------------+--+
| 1 | ["key2","key1"] | ["value2","value1"] |
| NULL | NULL | NULL |
| 2 | ["key2","key1"] | ["new value2","new value"] |
+-------+------------------+-----------------------------+--+
3 rows selected (28.05 seconds)
{code}
Obviously, the last result set contains corrupt entries (line 2). I also
encountered this when doing this query with JDBC.
was:
When {{\n}} characters are contained in Avro files that are used as data bases
for an external table, the result of {{SELECT}} queries may be corrupt. I
encountered this error when querying hive both from {{beeline}} and from JDBC.
h3. Steps to reproduce (used files are attached to ticket)
# Create an {{.avro}} file that contains newline characters in a value of a map:
{code}
avro-tools fromjson --schema-file test.schema test.json > test.avro
{code}
# Copy {{.avro}} file to HDFS
{code}
hdfs dfs -copyFromLocal test.avro /some/location/
{code}
# Create an external table in beeline containing this {{.avro}}:
{code}
beeline> CREATE EXTERNAL TABLE broken_newline_map
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
STORED AS
INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
LOCATION '/user/dnies/hive-test/broken-newline/db'
TBLPROPERTIES ('avro.schema.literal'='
{
"type" : "record",
"name" : "myEntry",
"namespace" : "myNamespace",
"fields" : [ {
"name" : "foo",
"type" : "long"
}, {
"name" : "bar",
"type" : {
"type" : "map",
"values" : "string"
}
} ]
}
');
{code}
# Now, selecting may return corrupt results:
{code}
jdbc:my-server:10000/> select * from broken_newline_map;
+-------------------------+---------------------------------------------------+--+
| broken_newline_map.foo | broken_newline_map.bar |
+-------------------------+---------------------------------------------------+--+
| 1 | {"key2":"value2","key1":"value1\nafter newline"} |
| 2 | {"key2":"new value2","key1":"new value"} |
+-------------------------+---------------------------------------------------+--+
2 rows selected (1.661 seconds)
jdbc:hive2://my-server:10000/> select foo, map_keys(bar), map_values(bar) from
broken_newline_map;
+-------+------------------+-----------------------------+--+
| foo | _c1 | _c2 |
+-------+------------------+-----------------------------+--+
| 1 | ["key2","key1"] | ["value2","value1"] |
| NULL | NULL | NULL |
| 2 | ["key2","key1"] | ["new value2","new value"] |
+-------+------------------+-----------------------------+--+
3 rows selected (28.05 seconds)
{code}
Obviously, the last result set contains corrupt entries (line 2). I also
encountered this when doing this query with JDBC.
> Newlines in Avro maps cause external table to return corrupt values
> -------------------------------------------------------------------
>
> Key: HIVE-14044
> URL: https://issues.apache.org/jira/browse/HIVE-14044
> Project: Hive
> Issue Type: Bug
> Environment: Hive version: 1.1.0-cdh5.5.1 (bundled with cloudera
> 5.5.1)
> Reporter: David Nies
>
> When {{\n}} characters are contained in Avro files that are used as data
> bases for an external table, the result of {{SELECT}} queries may be corrupt.
> I encountered this error when querying hive both from {{beeline}} and from
> JDBC.
> h3. Steps to reproduce (used files are attached to ticket)
> # Create an {{.avro}} file that contains newline characters in a value of a
> map:
> {code}
> avro-tools fromjson --schema-file test.schema test.json > test.avro
> {code}
> # Copy {{.avro}} file to HDFS
> {code}
> hdfs dfs -copyFromLocal test.avro /some/location/
> {code}
> # Create an external table in beeline containing this {{.avro}}:
> {code}
> beeline> CREATE EXTERNAL TABLE broken_newline_map
> ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
> STORED AS
> INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
> OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
> LOCATION '/user/dnies/hive-test/broken-newline/db'
> TBLPROPERTIES ('avro.schema.literal'='
> {
> "type" : "record",
> "name" : "myEntry",
> "namespace" : "myNamespace",
> "fields" : [ {
> "name" : "foo",
> "type" : "long"
> }, {
> "name" : "bar",
> "type" : {
> "type" : "map",
> "values" : "string"
> }
> } ]
> }
> ');
> {code}
> # Now, selecting may return corrupt results:
> {code}
> jdbc:hive2://my-server:10000/> select * from broken_newline_map;
> +-------------------------+---------------------------------------------------+--+
> | broken_newline_map.foo | broken_newline_map.bar
> |
> +-------------------------+---------------------------------------------------+--+
> | 1 | {"key2":"value2","key1":"value1\nafter newline"}
> |
> | 2 | {"key2":"new value2","key1":"new value"}
> |
> +-------------------------+---------------------------------------------------+--+
> 2 rows selected (1.661 seconds)
> jdbc:hive2://my-server:10000/> select foo, map_keys(bar), map_values(bar)
> from broken_newline_map;
> +-------+------------------+-----------------------------+--+
> | foo | _c1 | _c2 |
> +-------+------------------+-----------------------------+--+
> | 1 | ["key2","key1"] | ["value2","value1"] |
> | NULL | NULL | NULL |
> | 2 | ["key2","key1"] | ["new value2","new value"] |
> +-------+------------------+-----------------------------+--+
> 3 rows selected (28.05 seconds)
> {code}
> Obviously, the last result set contains corrupt entries (line 2). I also
> encountered this when doing this query with JDBC.
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)