I am querying a Parquet file containing records for nested data:
SELECT household, minutes, lditem_array FROM
`dfs`.`root`.`./S/Beddo/spark/pig-xpdb` limit 1;
This returns the first record, which contains an array LDItem. For the first
record, LDItem is an array of 8 {} structures), like so:
1 046927725027 1135 {
"LDItem" : [ {
"upc" : "0001300000124",
"item_flags" : 0,
"units" : 1,
"retail" : 199,
"cost" : 130,
"discount_flags" : 0,
"discount_units" : 0,
"discount" : 0,
"return_units" : 0,
"return_discount" : 0
}, {
"upc" : "0003800084505",
"item_flags" : 0,
"units" : 1,
"retail" : 199,
"cost" : 121,
"discount_flags" : 1,
"discount_units" : 1,
"discount" : 49,
"return_units" : 0,
"return_discount" : 0
}, {
"upc" : "0078533175751",
"item_flags" : 0,
"units" : 1,
"retail" : 369,
"cost" : 233,
"discount_flags" : 0,
"discount_units" : 0,
"discount" : 0,
"return_units" : 0,
"return_discount" : 0
}, {
"upc" : "0003800084508",
"item_flags" : 0,
"units" : 1,
"retail" : 199,
"cost" : 120,
"discount_flags" : 1,
"discount_units" : 1,
"discount" : 49,
"return_units" : 0,
"return_discount" : 0
}, {
"upc" : "0001480000373",
"item_flags" : 0,
"units" : 1,
"retail" : 239,
"cost" : 180,
"discount_flags" : 0,
"discount_units" : 0,
"discount" : 0,
"return_units" : 0,
"return_discount" : 0
}, {
"upc" : "0004200015121",
"item_flags" : 0,
"units" : 1,
"retail" : 299,
"cost" : 185,
"discount_flags" : 1,
"discount_units" : 1,
"discount" : 30,
"return_units" : 0,
"return_discount" : 0
}, {
"upc" : "0003582608932",
"item_flags" : 0,
"units" : 1,
"retail" : 138,
"cost" : 79,
"discount_flags" : 0,
"discount_units" : 0,
"discount" : 0,
"return_units" : 0,
"return_discount" : 0
}, {
"upc" : "0003600043220",
"item_flags" : 0,
"units" : 1,
"retail" : 249,
"cost" : 168,
"discount_flags" : 1,
"discount_units" : 1,
"discount" : 49,
"return_units" : 0,
"return_discount" : 0
} ]
}
I would like to flatten this 1 record into 8 records, with columns for upc,
item_flags, units, retail, cost, ... The furthest I got was with this query:
SELECT household, minutes, FLATTEN(T.lditem_array.LDItem) FROM
`dfs`.`root`.`./S/Beddo/spark/pig-xpdb` T limit 8;
This flattened the first record, but did not yielded new columns upc,
item_flags, units, ... Instead, it gave me one record for each {} element,
which I've not been able to split further into columns:
1 046927725027 1135 {
"upc" : "0001300000124",
"item_flags" : 0,
"units" : 1,
"retail" : 199,
"cost" : 130,
"discount_flags" : 0,
"discount_units" : 0,
"discount" : 0,
"return_units" : 0,
"return_discount" : 0
}
2 046927725027 1135 {
"upc" : "0003800084505",
"item_flags" : 0,
"units" : 1,
"retail" : 199,
"cost" : 121,
"discount_flags" : 1,
"discount_units" : 1,
"discount" : 49,
"return_units" : 0,
"return_discount" : 0
}
3 046927725027 1135 {
"upc" : "0078533175751",
"item_flags" : 0,
"units" : 1,
"retail" : 369,
"cost" : 233,
"discount_flags" : 0,
"discount_units" : 0,
"discount" : 0,
"return_units" : 0,
"return_discount" : 0
}
4 046927725027 1135 {
"upc" : "0003800084508",
"item_flags" : 0,
"units" : 1,
"retail" : 199,
"cost" : 120,
"discount_flags" : 1,
"discount_units" : 1,
"discount" : 49,
"return_units" : 0,
"return_discount" : 0
}
5 046927725027 1135 {
"upc" : "0001480000373",
"item_flags" : 0,
"units" : 1,
"retail" : 239,
"cost" : 180,
"discount_flags" : 0,
"discount_units" : 0,
"discount" : 0,
"return_units" : 0,
"return_discount" : 0
}
6 046927725027 1135 {
"upc" : "0004200015121",
"item_flags" : 0,
"units" : 1,
"retail" : 299,
"cost" : 185,
"discount_flags" : 1,
"discount_units" : 1,
"discount" : 30,
"return_units" : 0,
"return_discount" : 0
}
7 046927725027 1135 {
"upc" : "0003582608932",
"item_flags" : 0,
"units" : 1,
"retail" : 138,
"cost" : 79,
"discount_flags" : 0,
"discount_units" : 0,
"discount" : 0,
"return_units" : 0,
"return_discount" : 0
}
8 046927725027 1135 {
"upc" : "0003600043220",
"item_flags" : 0,
"units" : 1,
"retail" : 249,
"cost" : 168,
"discount_flags" : 1,
"discount_units" : 1,
"discount" : 49,
"return_units" : 0,
"return_discount" : 0
}
How can I get a result set with 12 columns, i.e. split each {...} list into
columns? This is where I am stuck.
Thanks,
Mike Beddo