Repository: kylin Updated Branches: refs/heads/master 4fcaa8ac3 -> 3b1850eae
KYLIN-1851 Add unit test of unsorted data and modify TrieDictionaryForest a little Signed-off-by: Li Yang <[email protected]> Project: http://git-wip-us.apache.org/repos/asf/kylin/repo Commit: http://git-wip-us.apache.org/repos/asf/kylin/commit/3b1850ea Tree: http://git-wip-us.apache.org/repos/asf/kylin/tree/3b1850ea Diff: http://git-wip-us.apache.org/repos/asf/kylin/diff/3b1850ea Branch: refs/heads/master Commit: 3b1850eae6ff9ec2ea5cd2d03fb1fa4e05aaafed Parents: 4fcaa8a Author: xiefan46 <[email protected]> Authored: Thu Nov 17 10:22:28 2016 +0800 Committer: Li Yang <[email protected]> Committed: Thu Nov 17 11:26:45 2016 +0800 ---------------------------------------------------------------------- .../apache/kylin/dict/TrieDictionaryForest.java | 59 +++++++++++++------- .../kylin/dict/TrieDictionaryForestTest.java | 43 ++++++++++++++ 2 files changed, 82 insertions(+), 20 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/kylin/blob/3b1850ea/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionaryForest.java ---------------------------------------------------------------------- diff --git a/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionaryForest.java b/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionaryForest.java index 9a26c4c..eb33a5a 100755 --- a/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionaryForest.java +++ b/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionaryForest.java @@ -53,16 +53,19 @@ public class TrieDictionaryForest<T> extends Dictionary<T> { private int baseId; + private ArrayList<ByteArray> maxValue; + public TrieDictionaryForest() { // default constructor for Writable interface } public TrieDictionaryForest(ArrayList<TrieDictionary<T>> trees, ArrayList<ByteArray> valueDivide, // - ArrayList<Integer> accuOffset, BytesConverter<T> bytesConverter, int baseId) { + ArrayList<Integer> accuOffset, BytesConverter<T> bytesConverter, int baseId) { this.trees = trees; this.valueDivide = valueDivide; this.accuOffset = accuOffset; this.bytesConvert = bytesConverter; this.baseId = baseId; + initMaxValue(); } @Override @@ -116,27 +119,30 @@ public class TrieDictionaryForest<T> extends Dictionary<T> { // id = tree_inner_offset + accumulate_offset + baseId protected int _getIdFromValueBytesImpl(byte[] value, int offset, int len, int roundingFlag) throws IllegalArgumentException { - ByteArray search = new ByteArray(value, offset, len); - int index = findIndexByValue(search); - if (index < 0) { + int index; + if (trees.size() == 1) { + index = 0; + } else { + ByteArray search = new ByteArray(value, offset, len); + index = findIndexByValue(search); + if (index < 0) { + if (roundingFlag > 0) { + return getMinId(); //searching value smaller than the smallest value in dict + } else { + throw new IllegalArgumentException("Value '" + Bytes.toString(value, offset, len) + "' (" + Bytes.toStringBinary(value, offset, len) + ") not exists!"); + } + } + if (roundingFlag > 0) { - return getMinId(); //searching value smaller than the smallest value in dict - } else { - throw new IllegalArgumentException("Value '" + Bytes.toString(value, offset, len) + "' (" + Bytes.toStringBinary(value, offset, len) + ") not exists!"); + ByteArray maxValueOfTree = maxValue.get(index); + if (search.compareTo(maxValueOfTree) > 0) + index++; + if (index >= trees.size()) + throw new IllegalArgumentException("Value '" + Bytes.toString(value, offset, len) + "' (" + Bytes.toStringBinary(value, offset, len) + ") not exists!"); } } - int id; - if (roundingFlag > 0) { - T curTreeMax = trees.get(index).getValueFromId(trees.get(index).getMaxId()); - byte[] b1 = bytesConvert.convertToBytes(curTreeMax); - ByteArray ba1 = new ByteArray(b1, 0, b1.length); - if (search.compareTo(ba1) > 0) - index++; - if (index >= trees.size()) - throw new IllegalArgumentException("Value '" + Bytes.toString(value, offset, len) + "' (" + Bytes.toStringBinary(value, offset, len) + ") not exists!"); - } TrieDictionary<T> tree = trees.get(index); - id = tree.getIdFromValueBytes(value, offset, len, roundingFlag); + int id = tree.getIdFromValueBytes(value, offset, len, roundingFlag); id = id + accuOffset.get(index); id += baseId; return id; @@ -154,7 +160,7 @@ public class TrieDictionaryForest<T> extends Dictionary<T> { @Override protected int getValueBytesFromIdImpl(int id, byte[] returnValue, int offset) throws IllegalArgumentException { - int index = findIndexById(id); + int index = (trees.size() == 1) ? 0 : findIndexById(id); int treeInnerOffset = getTreeInnerOffset(id, index); TrieDictionary<T> tree = trees.get(index); int size = tree.getValueBytesFromIdImpl(treeInnerOffset, returnValue, offset); @@ -163,7 +169,7 @@ public class TrieDictionaryForest<T> extends Dictionary<T> { @Override protected byte[] getValueBytesFromIdImpl(int id) throws IllegalArgumentException { - int index = findIndexById(id); + int index = (trees.size() == 1) ? 0 : findIndexById(id); int treeInnerOffset = getTreeInnerOffset(id, index); TrieDictionary<T> tree = trees.get(index); byte[] result = tree.getValueBytesFromId(treeInnerOffset); @@ -264,6 +270,7 @@ public class TrieDictionaryForest<T> extends Dictionary<T> { dict.readFields(in); trees.add(dict); } + initMaxValue(); } catch (Exception e) { if (e instanceof RuntimeException) throw (RuntimeException) e; @@ -363,6 +370,18 @@ public class TrieDictionaryForest<T> extends Dictionary<T> { } } + private void initMaxValue() throws IllegalStateException { + this.maxValue = new ArrayList<>(); + if (this.trees == null || trees.isEmpty()) + throw new IllegalStateException("Trees not init yet. Could not init max value of each tree"); + for (int i = 0; i < trees.size(); i++) { + T curTreeMax = trees.get(i).getValueFromId(trees.get(i).getMaxId()); + byte[] b1 = bytesConvert.convertToBytes(curTreeMax); + ByteArray ba1 = new ByteArray(b1, 0, b1.length); + this.maxValue.add(ba1); + } + } + public static void main(String[] args) { ArrayList<String> list = new ArrayList<>(); list.add("ä¸"); http://git-wip-us.apache.org/repos/asf/kylin/blob/3b1850ea/core-dictionary/src/test/java/org/apache/kylin/dict/TrieDictionaryForestTest.java ---------------------------------------------------------------------- diff --git a/core-dictionary/src/test/java/org/apache/kylin/dict/TrieDictionaryForestTest.java b/core-dictionary/src/test/java/org/apache/kylin/dict/TrieDictionaryForestTest.java index 3def7e0..07511d1 100755 --- a/core-dictionary/src/test/java/org/apache/kylin/dict/TrieDictionaryForestTest.java +++ b/core-dictionary/src/test/java/org/apache/kylin/dict/TrieDictionaryForestTest.java @@ -38,6 +38,7 @@ import static org.junit.Assert.assertEquals; public class TrieDictionaryForestTest { + @Test public void testBasicFound() { ArrayList<String> strs = new ArrayList<String>(); @@ -432,6 +433,48 @@ public class TrieDictionaryForestTest { } } + @Test + public void testUnsortedData(){ + ArrayList<String> strs = new ArrayList<>(); + Iterator<String> it = new RandomStrings(10000).iterator(); + int totalSize = 0; + final StringBytesConverter converter = new StringBytesConverter(); + while (it.hasNext()) { + String str = it.next(); + byte[] data = converter.convertToBytes(str); + if (data != null) { + totalSize += data.length; + } + strs.add(str); + } + Collections.shuffle(strs); + int baseId = 20; + int maxTreeSize = totalSize / 10; + System.out.println("data size:" + totalSize / 1024 + "KB max tree size:" + maxTreeSize / 1024 + "KB"); + //test maintain one trie + TrieDictionaryForestBuilder<String> builder = new TrieDictionaryForestBuilder<String>(converter); + builder.setMaxTrieTreeSize(maxTreeSize); + for(String str : strs){ + builder.addValue(str); + } + TrieDictionaryForest<String> dict = builder.build(); + assertEquals(1,dict.getTrees().size()); + //test throws Exception + Collections.sort(strs); + strs.add("f"); + strs.add("a"); + builder = new TrieDictionaryForestBuilder<String>(converter); + builder.setMaxTrieTreeSize(maxTreeSize); + try { + for (String str : strs) + builder.addValue(str); + dict = builder.build(); + fail("Input data no sorted and builder have multi trees. Should throw IllegalStateException"); + }catch (IllegalStateException e){ + //correct + } + } + /* can not pass cases like 1.7695564055819624E-4 */
