DRILL-1673: Fix for error with flatten function when used with nested lists.
Added a small reference implementation of flatten for generating baselines. Removed now-unneeded code in EmptyValuePopulator, as we are guaranteed to have the first offset initialized to 0. Project: http://git-wip-us.apache.org/repos/asf/drill/repo Commit: http://git-wip-us.apache.org/repos/asf/drill/commit/233faf26 Tree: http://git-wip-us.apache.org/repos/asf/drill/tree/233faf26 Diff: http://git-wip-us.apache.org/repos/asf/drill/diff/233faf26 Branch: refs/heads/master Commit: 233faf2674c9ea2a9e416d01091bbf3658406f69 Parents: e659f01 Author: Jason Altekruse <[email protected]> Authored: Wed Jun 17 14:12:05 2015 -0700 Committer: Jason Altekruse <[email protected]> Committed: Sun Jun 28 22:28:27 2015 -0700 ---------------------------------------------------------------------- .../vector/complex/EmptyValuePopulator.java | 2 +- .../exec/vector/complex/RepeatedListVector.java | 5 +- .../java/org/apache/drill/DrillTestWrapper.java | 2 +- .../exec/physical/impl/flatten/TestFlatten.java | 180 ++++++++++++++++++- .../complex_transaction_example_data.json | 100 +++++++++++ .../src/test/resources/store/json/1673.json | 1 + 6 files changed, 284 insertions(+), 6 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/drill/blob/233faf26/exec/java-exec/src/main/java/org/apache/drill/exec/vector/complex/EmptyValuePopulator.java ---------------------------------------------------------------------- diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/vector/complex/EmptyValuePopulator.java b/exec/java-exec/src/main/java/org/apache/drill/exec/vector/complex/EmptyValuePopulator.java index 8c61a60..fa1de66 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/vector/complex/EmptyValuePopulator.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/vector/complex/EmptyValuePopulator.java @@ -43,7 +43,7 @@ public class EmptyValuePopulator { final UInt4Vector.Accessor accessor = 
offsets.getAccessor(); final UInt4Vector.Mutator mutator = offsets.getMutator(); final int lastSet = Math.max(accessor.getValueCount() - 1, 0); - final int previousEnd = accessor.get(lastSet); + final int previousEnd = accessor.get(lastSet);//0 ? 0 : accessor.get(lastSet); for (int i = lastSet; i < lastIndex; i++) { mutator.setSafe(i + 1, previousEnd); } http://git-wip-us.apache.org/repos/asf/drill/blob/233faf26/exec/java-exec/src/main/java/org/apache/drill/exec/vector/complex/RepeatedListVector.java ---------------------------------------------------------------------- diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/vector/complex/RepeatedListVector.java b/exec/java-exec/src/main/java/org/apache/drill/exec/vector/complex/RepeatedListVector.java index f538399..85e4d1d 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/vector/complex/RepeatedListVector.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/vector/complex/RepeatedListVector.java @@ -153,7 +153,10 @@ public class RepeatedListVector extends AbstractContainerVector @Override public void splitAndTransfer(int startIndex, int length) { - throw new UnsupportedOperationException("Repeated list does not support split & transfer operation"); + target.allocateNew(); + for (int i = 0; i < length; i++) { + copyValueSafe(startIndex + i, i); + } } @Override http://git-wip-us.apache.org/repos/asf/drill/blob/233faf26/exec/java-exec/src/test/java/org/apache/drill/DrillTestWrapper.java ---------------------------------------------------------------------- diff --git a/exec/java-exec/src/test/java/org/apache/drill/DrillTestWrapper.java b/exec/java-exec/src/test/java/org/apache/drill/DrillTestWrapper.java index d4e7ed6..77d2a54 100644 --- a/exec/java-exec/src/test/java/org/apache/drill/DrillTestWrapper.java +++ b/exec/java-exec/src/test/java/org/apache/drill/DrillTestWrapper.java @@ -538,7 +538,7 @@ public class DrillTestWrapper { break; } if (!found) { - throw new Exception("Did not 
find expected record in result set: " + printRecord(expectedRecord)); + throw new Exception(String.format("After matching %d records, did not find expected record in result set: %s", counter, printRecord(expectedRecord))); } else { actualRecords.remove(i); counter++; http://git-wip-us.apache.org/repos/asf/drill/blob/233faf26/exec/java-exec/src/test/java/org/apache/drill/exec/physical/impl/flatten/TestFlatten.java ---------------------------------------------------------------------- diff --git a/exec/java-exec/src/test/java/org/apache/drill/exec/physical/impl/flatten/TestFlatten.java b/exec/java-exec/src/test/java/org/apache/drill/exec/physical/impl/flatten/TestFlatten.java index 6f5a303..5756e97 100644 --- a/exec/java-exec/src/test/java/org/apache/drill/exec/physical/impl/flatten/TestFlatten.java +++ b/exec/java-exec/src/test/java/org/apache/drill/exec/physical/impl/flatten/TestFlatten.java @@ -17,13 +17,22 @@ ******************************************************************************/ package org.apache.drill.exec.physical.impl.flatten; +import static org.apache.drill.TestBuilder.listOf; +import static org.apache.drill.TestBuilder.mapOf; import static org.junit.Assert.assertEquals; +import com.google.common.collect.Lists; import org.apache.drill.BaseTestQuery; +import org.apache.drill.TestBuilder; import org.apache.drill.common.util.FileUtils; -import org.apache.drill.exec.proto.UserBitShared; +import org.apache.drill.exec.fn.interp.TestConstantFolding; +import org.apache.drill.exec.util.JsonStringHashMap; import org.junit.Ignore; +import org.junit.Rule; import org.junit.Test; +import org.junit.rules.TemporaryFolder; + +import java.util.List; public class TestFlatten extends BaseTestQuery { @@ -36,6 +45,8 @@ public class TestFlatten extends BaseTestQuery { */ public static boolean RUN_ADVANCED_TESTS = false; + @Rule + public TemporaryFolder folder = new TemporaryFolder(); @Test public void testFlattenFailure() throws Exception { @@ -44,6 +55,141 @@ public 
class TestFlatten extends BaseTestQuery { } @Test + public void testFlatten_Drill2162_complex() throws Exception { + String path = folder.getRoot().toPath().toString(); + + String jsonRecords = BaseTestQuery.getFile("flatten/complex_transaction_example_data.json"); + int numCopies = 700; + new TestConstantFolding.SmallFileCreator(folder) + .setRecord(jsonRecords) + .createFiles(1, numCopies, "json"); + + List<JsonStringHashMap<String,Object>> data = Lists.newArrayList( + mapOf("uid", 1l, + "lst_lst_0", listOf(1l, 2l, 3l, 4l, 5l), + "lst_lst_1", listOf(2l, 3l, 4l, 5l, 6l), + "lst_lst", listOf( + listOf(1l, 2l, 3l, 4l, 5l), + listOf(2l, 3l, 4l, 5l, 6l)) + ), + mapOf("uid", 2l, + "lst_lst_0", listOf(1l, 2l, 3l, 4l, 5l), + "lst_lst_1", listOf(2l, 3l, 4l, 5l, 6l), + "lst_lst", listOf( + listOf(1l, 2l, 3l, 4l, 5l), + listOf(2l, 3l, 4l, 5l, 6l)) + ) + ); + + List<JsonStringHashMap<String, Object>> result = flatten(flatten(flatten(data, "lst_lst_1"), "lst_lst_0"), "lst_lst"); + + TestBuilder builder = testBuilder() + .sqlQuery("select uid, flatten(d.lst_lst[1]) lst1, flatten(d.lst_lst[0]) lst0, flatten(d.lst_lst) lst from " + + "dfs.`" + path + "/bigfile/bigfile.json` d") + .unOrdered() + .baselineColumns("uid", "lst1", "lst0", "lst"); + for (int i = 0; i < numCopies; i++) { + for (JsonStringHashMap<String, Object> record : result) { + builder.baselineValues(record.get("uid"), record.get("lst_lst_1"), record.get("lst_lst_0"), record.get("lst_lst")); + } + } + builder.go(); + }; + + @Test + public void testFlattenReferenceImpl() throws Exception { + List<JsonStringHashMap<String,Object>> data = Lists.newArrayList( + mapOf("a",1, + "b",2, + "list_col", listOf(10,9), + "nested_list_col",listOf( + listOf(100,99), + listOf(1000,999) + ))); + List<JsonStringHashMap<String, Object>> result = flatten(flatten(flatten(data, "list_col"), "nested_list_col"), "nested_list_col"); + List<JsonStringHashMap<String, Object>> expectedResult = Lists.newArrayList( + mapOf("nested_list_col", 
100, "list_col", 10,"a", 1, "b",2), + mapOf("nested_list_col", 99, "list_col", 10,"a", 1, "b",2), + mapOf("nested_list_col", 1000, "list_col", 10,"a", 1, "b",2), + mapOf("nested_list_col", 999, "list_col", 10,"a", 1, "b",2), + mapOf("nested_list_col", 100, "list_col", 9, "a", 1, "b",2), + mapOf("nested_list_col", 99, "list_col", 9, "a", 1, "b",2), + mapOf("nested_list_col", 1000, "list_col", 9, "a", 1, "b",2), + mapOf("nested_list_col", 999, "list_col", 9, "a", 1, "b",2) + ); + int i = 0; + for (JsonStringHashMap record : result) { + assertEquals(record, expectedResult.get(i)); + i++; + } + } + + private List<JsonStringHashMap<String, Object>> flatten( + List<JsonStringHashMap<String,Object>> incomingRecords, + String colToFlatten) { + return flatten(incomingRecords, colToFlatten, colToFlatten); + } + + private List<JsonStringHashMap<String, Object>> flatten( + List<JsonStringHashMap<String,Object>> incomingRecords, + String colToFlatten, + String flattenedDataColName) { + List<JsonStringHashMap<String,Object>> output = Lists.newArrayList(); + for (JsonStringHashMap<String, Object> incomingRecord : incomingRecords) { + List dataToFlatten = (List) incomingRecord.get(colToFlatten); + for (int i = 0; i < dataToFlatten.size(); i++) { + final JsonStringHashMap newRecord = new JsonStringHashMap(); + newRecord.put(flattenedDataColName, dataToFlatten.get(i)); + for (String s : incomingRecord.keySet()) { + if (s.equals(colToFlatten)) { + continue; + } + newRecord.put(s, incomingRecord.get(s)); + } + output.add(newRecord); + } + } + return output; + } + + @Test + public void testFlatten_Drill2162_simple() throws Exception { + String path = folder.getRoot().toPath().toString(); + + List<Long> inputList = Lists.newArrayList(); + String jsonRecord = "{ \"int_list\" : ["; + final int listSize = 30; + for (int i = 1; i < listSize; i++ ) { + jsonRecord += i + ", "; + inputList.add((long) i); + } + jsonRecord += listSize + "] }"; + inputList.add((long) listSize); + int numRecords = 
3000; + new TestConstantFolding.SmallFileCreator(folder) + .setRecord(jsonRecord) + .createFiles(1, numRecords, "json"); + + List<JsonStringHashMap<String,Object>> data = Lists.newArrayList( + mapOf("int_list", inputList) + ); + + List<JsonStringHashMap<String, Object>> result = flatten(data, "int_list"); + + TestBuilder builder = testBuilder() + .sqlQuery("select flatten(int_list) as int_list from dfs.`" + path + "/bigfile/bigfile.json`") + .unOrdered() + .baselineColumns("int_list"); + + for (int i = 0; i < numRecords; i++) { + for (JsonStringHashMap<String, Object> record : result) { + builder.baselineValues(record.get("int_list")); + } + } + builder.go(); + }; + + @Test public void drill1671() throws Exception{ int rowCount = testSql("select * from (select count(*) as cnt from (select id, flatten(evnts1), flatten(evnts2), flatten(evnts3), flatten(evnts4), flatten(evnts5), flatten(evnts6), flatten(evnts7), flatten(evnts8), flatten(evnts9), flatten(evnts10), flatten(evnts11) from cp.`/flatten/many-arrays-50.json`)x )y where cnt = 2048"); assertEquals(rowCount, 1); @@ -55,6 +201,36 @@ public class TestFlatten extends BaseTestQuery { test("select * from cp.`/flatten/empty-rm.json`"); } + @Test // repeated list within a repeated map + public void drill1673() throws Exception { + String path = folder.getRoot().toPath().toString(); + + String jsonRecords = BaseTestQuery.getFile("store/json/1673.json"); + int numCopies = 25000; + new TestConstantFolding.SmallFileCreator(folder) + .setRecord(jsonRecords) + .createFiles(1, numCopies, "json"); + + TestBuilder builder = testBuilder() + .sqlQuery("select t.fixed_column as fixed_column, " + + "flatten(t.list_column) as list_col " + + "from dfs.`" + path + "/bigfile/bigfile.json` as t") + .baselineColumns("fixed_column", "list_col") + .unOrdered(); + Object map1 = mapOf("id1", "1", + "name", "zhu", + "num", listOf(listOf(1l, 2l, 3l))); + Object map2 = mapOf("id1", "2", + "name", "hao", + "num", listOf(listOf(4l, 5l, 6l))); + 
for (int i = 0; i < numCopies; i++) { + builder.baselineValues("abc", map1); + builder.baselineValues("abc", map2); + } + + builder.go(); + } + @Test public void drill1653() throws Exception{ int rowCount = testSql("select * from (select sum(t.flat.`value`) as sm from (select id, flatten(kvgen(m)) as flat from cp.`/flatten/missing-map.json`)t) where sm = 10 "); @@ -88,8 +264,6 @@ public class TestFlatten extends BaseTestQuery { "on transaction_info.max_event_time = event_info.event.event_time;"); } - - @Test public void testKVGenFlatten1() throws Exception { // works - TODO and verify results http://git-wip-us.apache.org/repos/asf/drill/blob/233faf26/exec/java-exec/src/test/resources/flatten/complex_transaction_example_data.json ---------------------------------------------------------------------- diff --git a/exec/java-exec/src/test/resources/flatten/complex_transaction_example_data.json b/exec/java-exec/src/test/resources/flatten/complex_transaction_example_data.json new file mode 100644 index 0000000..039ce18 --- /dev/null +++ b/exec/java-exec/src/test/resources/flatten/complex_transaction_example_data.json @@ -0,0 +1,100 @@ +{ + "uid" : 1, + "uid_str" : "01", + "type" : "web", + "max_trans_amount" : 1000, + "events" : [ + { "evnt_id":"e1", "campaign_id":"c1", "event_name":"e1_name", "event_time":1000000, "type" : "cmpgn1"}, + { "evnt_id":"e2", "campaign_id":"c1", "event_name":"e2_name", "event_time":2000000, "type" : "cmpgn4"}, + { "evnt_id":"e3", "campaign_id":"c1", "event_name":"e3_name", "event_time":3000000, "type" : "cmpgn1"}, + { "evnt_id":"e4", "campaign_id":"c1", "event_name":"e4_name", "event_time":4000000, "type" : "cmpgn1"}, + { "evnt_id":"e5", "campaign_id":"c2", "event_name":"e5_name", "event_time":5000000, "type" : "cmpgn3"}, + { "evnt_id":"e6", "campaign_id":"c1", "event_name":"e6_name", "event_time":6000000, "type" : "cmpgn9"}, + { "evnt_id":"e7", "campaign_id":"c1", "event_name":"e7_name", "event_time":7000000, "type" : "cmpgn3"}, + { 
"evnt_id":"e8", "campaign_id":"c2", "event_name":"e8_name", "event_time":8000000, "type" : "cmpgn2"}, + { "evnt_id":"e9", "campaign_id":"c2", "event_name":"e9_name", "event_time":9000000, "type" : "cmpgn4"} + ], + "transactions" : [ + { "trans_id":"t1", "amount":100, "trans_time":7777777, "type":"sports"}, + { "trans_id":"t2", "amount":1000, "trans_time":8888888, "type":"groceries"} + ], + "map":{"rm": [ + {"mapid":"m1","mapvalue":{"col1":1,"col2":[0,1,2,3,4,5]},"rptd": [{ "a": "foo"},{"b":"boo"}]}, + {"mapid":"m2","mapvalue":{"col1":0,"col2":[]},"rptd": [{ "a": "bar"},{"c":1},{"d":4.5}]} + ]}, + "sub": [{"z1":"hello"},{"z2":1}], + "features": [ + { + "type": "Feature", + "properties": { + "mag": 6.9, + "time": 1405954481000, + "updated": 1405983436259 + }, + "geometry": { + "type": "Point", + "coordinates": "100,90" + }, + "id": "usb000ruzk", + "location": { + "zip": "95134", + "street": "zanker", + "bldgs": { + "bldg1" : "HQ1", + "bldg2" : "HQ2" + } + } + } + ], + "lst_lst" : [[1,2,3,4,5],[2,3,4,5,6]] +} +{ + "uid" : 2, + "uid_str" : "02", + "type" : "store", + "max_trans_amount" : 2001, + "events" : [ + { "evnt_id":"e1", "campaign_id":"c1", "event_name":"e1_name", "event_time":1000000, "type" : "cmpgn9"}, + { "evnt_id":"e2", "campaign_id":"c1", "event_name":"e2_name", "event_time":2000000, "type" : "cmpgn4"}, + { "evnt_id":"e3", "campaign_id":"c1", "event_name":"e3_name", "event_time":3000000, "type" : "cmpgn1"}, + { "evnt_id":"e4", "campaign_id":"c1", "event_name":"e4_name", "event_time":4000000, "type" : "cmpgn1"}, + { "evnt_id":"e5", "campaign_id":"c2", "event_name":"e5_name", "event_time":5000000, "type" : "cmpgn2"}, + { "evnt_id":"e6", "campaign_id":"c1", "event_name":"e6_name", "event_time":6000000, "type" : "cmpgn9"}, + { "evnt_id":"e7", "campaign_id":"c1", "event_name":"e7_name", "event_time":7000000, "type" : "cmpgn3"}, + { "evnt_id":"e8", "campaign_id":"c2", "event_name":"e8_name", "event_time":8000000, "type" : "cmpgn2"}, + { "evnt_id":"e9", 
"campaign_id":"c2", "event_name":"e9_name", "event_time":9000000, "type" : "cmpgn4"} + ], + "transactions" : [ + { "trans_id":"t1", "amount":100, "trans_time":7777777, "type":"sports"}, + { "trans_id":"t2", "amount":1000, "trans_time":8888888, "type":"groceries"} + ], + "map":{"rm": [ + {"mapid":"m0","mapvalue":{"col1":1,"col2":[0,1,2,3,4,5]},"rptd": [{ "a": "foo1"},{"b":"boo"}]}, + {"mapid":"m1","mapvalue":{"col1":0,"col2":[]},"rptd": [{ "a": "bar"},{"c":-1},{"d":4.5}]} + ]}, + "sub": [{"z1":"hello"},{"z2":10}], + "features": [ + { + "type": "cmpgn9", + "properties": { + "mag": 6.9, + "time": 1405954481000, + "updated": 1405983436259 + }, + "geometry": { + "type": "Point", + "coordinates": "100,90" + }, + "id": "usb000ruzk", + "location": { + "zip": "95134", + "street": "zanker", + "bldgs": { + "bldg1" : "HQ1", + "bldg2" : "HQ2" + } + } + } + ], + "lst_lst" : [[1,2,3,4,5],[2,3,4,5,6]] +} http://git-wip-us.apache.org/repos/asf/drill/blob/233faf26/exec/java-exec/src/test/resources/store/json/1673.json ---------------------------------------------------------------------- diff --git a/exec/java-exec/src/test/resources/store/json/1673.json b/exec/java-exec/src/test/resources/store/json/1673.json new file mode 100644 index 0000000..b887d40 --- /dev/null +++ b/exec/java-exec/src/test/resources/store/json/1673.json @@ -0,0 +1 @@ +{"fixed_column":"abc", "list_column":[{"id1":"1","name":"zhu", "num": [[1,2,3]]}, {"id1":"2","name":"hao", "num": [[4,5,6]]} ]} \ No newline at end of file
