Hello, I have a problem to extract data from a column of type MAP<STRING, STRING>, impala returns me bad data. Please, tell me if there is a workaround to extract specific values from a map. This is version of impala I use: Server version: impalad version 2.3.0-cdh5.5.2 RELEASE (build cc1125f10419a7269366f7f950f57b24b07acd64)
Here is an example which doesn't work well:
-- this query returns correct result only for smallint_required_param_int,
CASE without CAST and GROUP_CONCAT return bad data
SELECT
e.user_id,
m.smallint_required_param_int,
m.smallint_required_param_str,
m.all_values_from_map,
e.date_id
FROM event e,
(SELECT
MAX(CASE WHEN key='smallint_required_param' THEN CAST(value AS SMALLINT)
END) AS smallint_required_param_int,
MAX(CASE WHEN key='smallint_required_param' THEN value END) AS
smallint_required_param_str,
GROUP_CONCAT(concat(key, ':', value), ',') as all_values_from_map
FROM e.event_map) m
WHERE e.event_id = 3;
Here is how to prepare data from attachment:
-- hive
CREATE TABLE event_unpartitioned
ROW FORMAT
SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
STORED AS
INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
TBLPROPERTIES ('avro.schema.literal'='{
"namespace": "testing.hive.avro.serde",
"name": "events",
"type": "record",
"fields": [
{"name": "user_id", "type": ["null", "long"], "default": null},
{"name": "event_id", "type": ["null", "int"] , "default": null},
{"name": "event_type_id", "type": ["null", "int"] , "default": null},
{"name": "ts_bigint", "type": ["null", "long"], "default": null},
{"name": "event_map", "type": ["null", {"type": "map", "values":
"string"}], "default": null},
{"name": "server_id", "type": ["null", "int"], "default": null},
{"name": "date_id", "type": ["null", "int"], "default": null}
]
}');
-- impala
CREATE TABLE event (
user_id BIGINT,
event_id INT,
event_type_id INT,
ts_bigint BIGINT,
event_map MAP<STRING, STRING>,
server_id INT
)
PARTITIONED BY (date_id INT)
STORED AS PARQUET;
-- bash
hdfs dfs -put events.avro /user/hive/warehouse/event_unpartitioned/
-- hive
set hive.exec.dynamic.partition.mode=nonstrict;
insert into event partition (date_id) select * from event_unpartitioned;
-- impala
refresh event;
select * from event e, e.event_map; -- this query returns correct results
Thanks
*Dejan Prokić* | Data Engineer | Nordeus
events.avro
Description: Binary data
