Repository: kylin Updated Branches: refs/heads/sparkcubing-rebase a9c8b61bd -> afd1ac223 (forced update)
KYLIN-2322 fix TictionaryDictionaryForest cache bug and add CacheDictionary interface Signed-off-by: Li Yang <[email protected]> Project: http://git-wip-us.apache.org/repos/asf/kylin/repo Commit: http://git-wip-us.apache.org/repos/asf/kylin/commit/1d6a36bf Tree: http://git-wip-us.apache.org/repos/asf/kylin/tree/1d6a36bf Diff: http://git-wip-us.apache.org/repos/asf/kylin/diff/1d6a36bf Branch: refs/heads/sparkcubing-rebase Commit: 1d6a36bf66085e0bed79876379b21677ec55788f Parents: d7971d9 Author: xiefan46 <[email protected]> Authored: Wed Dec 28 18:10:07 2016 +0800 Committer: Li Yang <[email protected]> Committed: Tue Jan 3 15:43:24 2017 +0800 ---------------------------------------------------------------------- .../apache/kylin/common/KylinConfigBase.java | 2 +- .../apache/kylin/common/util/Dictionary.java | 2 + .../apache/kylin/dict/AppendTrieDictionary.java | 46 +---- .../org/apache/kylin/dict/CacheDictionary.java | 107 +++++++++++ .../apache/kylin/dict/DateStrDictionary.java | 1 + .../org/apache/kylin/dict/NumberDictionary.java | 26 +-- .../apache/kylin/dict/TimeStrDictionary.java | 1 + .../org/apache/kylin/dict/TrieDictionary.java | 137 +------------- .../apache/kylin/dict/TrieDictionaryForest.java | 45 +---- .../kylin/dict/TrieDictionaryForestBuilder.java | 1 - .../MultipleDictionaryValueEnumeratorTest.java | 1 + .../dict/TrieDictionaryForestBenchmark.java | 180 +++++++++++++++++++ .../kylin/dict/TrieDictionaryForestTest.java | 35 ++-- 13 files changed, 325 insertions(+), 259 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/kylin/blob/1d6a36bf/core-common/src/main/java/org/apache/kylin/common/KylinConfigBase.java ---------------------------------------------------------------------- diff --git a/core-common/src/main/java/org/apache/kylin/common/KylinConfigBase.java b/core-common/src/main/java/org/apache/kylin/common/KylinConfigBase.java index d73b694..bb8880b 100644 --- a/core-common/src/main/java/org/apache/kylin/common/KylinConfigBase.java +++ b/core-common/src/main/java/org/apache/kylin/common/KylinConfigBase.java @@ -217,7 +217,7 @@ abstract public class KylinConfigBase implements Serializable { // ============================================================================ public boolean isUseForestTrieDictionary() { - return Boolean.parseBoolean(getOptional("kylin.dictionary.use-forest-trie", "false")); + return Boolean.parseBoolean(getOptional("kylin.dictionary.use-forest-trie", "true")); } public int getTrieDictionaryForestMaxTrieSizeMB() { http://git-wip-us.apache.org/repos/asf/kylin/blob/1d6a36bf/core-common/src/main/java/org/apache/kylin/common/util/Dictionary.java ---------------------------------------------------------------------- diff --git a/core-common/src/main/java/org/apache/kylin/common/util/Dictionary.java b/core-common/src/main/java/org/apache/kylin/common/util/Dictionary.java index 1e172bc..03996a7 100644 --- a/core-common/src/main/java/org/apache/kylin/common/util/Dictionary.java +++ b/core-common/src/main/java/org/apache/kylin/common/util/Dictionary.java @@ -254,4 +254,6 @@ abstract public class Dictionary<T> implements Serializable { */ public abstract void readFields(DataInput in) throws IOException; + + } http://git-wip-us.apache.org/repos/asf/kylin/blob/1d6a36bf/core-dictionary/src/main/java/org/apache/kylin/dict/AppendTrieDictionary.java ---------------------------------------------------------------------- diff --git a/core-dictionary/src/main/java/org/apache/kylin/dict/AppendTrieDictionary.java b/core-dictionary/src/main/java/org/apache/kylin/dict/AppendTrieDictionary.java index 32bfde6..503c29e 100644 --- a/core-dictionary/src/main/java/org/apache/kylin/dict/AppendTrieDictionary.java +++ b/core-dictionary/src/main/java/org/apache/kylin/dict/AppendTrieDictionary.java @@ -27,10 +27,8 @@ import java.io.DataOutputStream; import java.io.IOException; import java.io.PrintStream; import java.io.UnsupportedEncodingException; -import java.lang.ref.SoftReference; import java.util.ArrayList; import java.util.Arrays; -import java.util.HashMap; import java.util.HashSet; import java.util.IdentityHashMap; import java.util.LinkedList; @@ -76,7 +74,7 @@ import org.slf4j.LoggerFactory; * @author sunyerui */ @SuppressWarnings({ "rawtypes", "unchecked", "serial" }) -public class AppendTrieDictionary<T> extends Dictionary<T> { +public class AppendTrieDictionary<T> extends CacheDictionary<T> { public static final byte[] HEAD_MAGIC = new byte[] { 0x41, 0x70, 0x70, 0x65, 0x63, 0x64, 0x54, 0x72, 0x69, 0x65, 0x44, 0x69, 0x63, 0x74 }; // "AppendTrieDict" public static final int HEAD_SIZE_I = HEAD_MAGIC.length; @@ -87,22 +85,16 @@ public class AppendTrieDictionary<T> extends Dictionary<T> { private static final Logger logger = LoggerFactory.getLogger(AppendTrieDictionary.class); transient private String baseDir; - transient private int baseId; transient private int maxId; transient private int maxValueLength; transient private int nValues; - transient private BytesConverter<T> bytesConverter; volatile private TreeMap<DictSliceKey, DictSlice> dictSliceMap; - transient private boolean enableValueCache = true; - transient private SoftReference<HashMap> valueToIdCache; // Constructor both for build and deserialize public AppendTrieDictionary() { - if (enableValueCache) { - valueToIdCache = new SoftReference<>(new HashMap()); - } + enableCache(); } public void initParams(String baseDir, int baseId, int maxId, int maxValueLength, int nValues, BytesConverter bytesConverter) throws IOException { @@ -111,7 +103,7 @@ public class AppendTrieDictionary<T> extends Dictionary<T> { this.maxId = maxId; this.maxValueLength = maxValueLength; this.nValues = nValues; - this.bytesConverter = bytesConverter; + this.bytesConvert = bytesConverter; } public void initDictSliceMap(CachedTreeMap dictMap) throws IOException { @@ -893,7 +885,7 @@ public class AppendTrieDictionary<T> extends Dictionary<T> { } else { logger.info("GlobalDict {} exist, append value", resourcePath); builder = new Builder<>(resourcePath, dictToUse, dictToUse.baseDir, dictToUse.maxId, dictToUse.maxValueLength, - dictToUse.nValues, dictToUse.bytesConverter, dictToUse.writeDictMap()); + dictToUse.nValues, dictToUse.bytesConvert, dictToUse.writeDictMap()); } return builder; @@ -1156,31 +1148,6 @@ public class AppendTrieDictionary<T> extends Dictionary<T> { return maxValueLength; } - @Override - final protected int getIdFromValueImpl(T value, int roundingFlag) { - if (enableValueCache && roundingFlag == 0) { - HashMap cache = valueToIdCache.get(); // SoftReference to skip cache gracefully when short of memory - if (cache != null) { - Integer id = null; - id = (Integer) cache.get(value); - if (id != null) - return id.intValue(); - - byte[] valueBytes = bytesConverter.convertToBytes(value); - id = getIdFromValueBytes(valueBytes, 0, valueBytes.length, roundingFlag); - - cache.put(value, id); - return id; - } - } - byte[] valueBytes = bytesConverter.convertToBytes(value); - return getIdFromValueBytes(valueBytes, 0, valueBytes.length, roundingFlag); - } - - @Override - final protected T getValueFromIdImpl(int id) { - throw new UnsupportedOperationException("AppendTrieDictionary can't retrive value from id"); - } @Override protected byte[] getValueBytesFromIdImpl(int id) { @@ -1198,7 +1165,7 @@ public class AppendTrieDictionary<T> extends Dictionary<T> { indexOut.writeInt(maxId); indexOut.writeInt(maxValueLength); indexOut.writeInt(nValues); - indexOut.writeUTF(bytesConverter.getClass().getName()); + indexOut.writeUTF(bytesConvert.getClass().getName()); dictSliceMap.write(indexOut); dictSliceMap.commit(keepAppend); } @@ -1208,7 +1175,7 @@ public class AppendTrieDictionary<T> extends Dictionary<T> { public AppendTrieDictionary copyToAnotherMeta(KylinConfig srcConfig, KylinConfig dstConfig) throws IOException { Configuration conf = new Configuration(); AppendTrieDictionary newDict = new AppendTrieDictionary(); - newDict.initParams(baseDir.replaceFirst(srcConfig.getHdfsWorkingDirectory(), dstConfig.getHdfsWorkingDirectory()), baseId, maxId, maxValueLength, nValues, bytesConverter); + newDict.initParams(baseDir.replaceFirst(srcConfig.getHdfsWorkingDirectory(), dstConfig.getHdfsWorkingDirectory()), baseId, maxId, maxValueLength, nValues, bytesConvert); newDict.initDictSliceMap((CachedTreeMap)dictSliceMap); logger.info("Copy AppendDict from {} to {}", this.baseDir, newDict.baseDir); Path srcPath = new Path(this.baseDir); @@ -1256,6 +1223,7 @@ public class AppendTrieDictionary<T> extends Dictionary<T> { } } + @Override public void dump(PrintStream out) { out.println("Total " + nValues + " values, " + (dictSliceMap == null ? 0 : dictSliceMap.size()) + " slice"); http://git-wip-us.apache.org/repos/asf/kylin/blob/1d6a36bf/core-dictionary/src/main/java/org/apache/kylin/dict/CacheDictionary.java ---------------------------------------------------------------------- diff --git a/core-dictionary/src/main/java/org/apache/kylin/dict/CacheDictionary.java b/core-dictionary/src/main/java/org/apache/kylin/dict/CacheDictionary.java new file mode 100644 index 0000000..575358e --- /dev/null +++ b/core-dictionary/src/main/java/org/apache/kylin/dict/CacheDictionary.java @@ -0,0 +1,107 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. +*/ + +package org.apache.kylin.dict; + +import org.apache.kylin.common.util.Dictionary; + +import java.lang.ref.SoftReference; +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; + +/** + * Created by xiefan on 16-12-30. + */ +abstract public class CacheDictionary<T> extends Dictionary<T> { + private static final long serialVersionUID = 1L; + + transient protected boolean enableValueCache = true; + + transient private SoftReference<Map> valueToIdCache; + + transient private SoftReference<Object[]> idToValueCache; + + transient protected int baseId; + + transient protected BytesConverter<T> bytesConvert; + + public CacheDictionary() { + + } + + //value --> id + @Override + final protected int getIdFromValueImpl(T value, int roundingFlag) { + if (enableValueCache && roundingFlag == 0) { + Map cache = valueToIdCache.get(); // SoftReference to skip cache gracefully when short of memory + if (cache != null) { + Integer id = null; + id = (Integer) cache.get(value); + if (id != null) + return id.intValue(); + byte[] valueBytes = bytesConvert.convertToBytes(value); + id = getIdFromValueBytes(valueBytes, 0, valueBytes.length, roundingFlag); + cache.put(value, id); + return id; + } + } + byte[] valueBytes = bytesConvert.convertToBytes(value); + return getIdFromValueBytes(valueBytes, 0, valueBytes.length, roundingFlag); + } + + //id --> value + @Override + final protected T getValueFromIdImpl(int id) { + if (enableValueCache) { + Object[] cache = idToValueCache.get(); + if (cache != null) { + int seq = calcSeqNoFromId(id); + if (cache[seq] != null) + return (T) cache[seq]; + byte[] valueBytes = getValueBytesFromIdImpl(id); + T value = bytesConvert.convertFromBytes(valueBytes, 0, valueBytes.length); + cache[seq] = value; + return value; + } + } + byte[] valueBytes = getValueBytesFromIdImpl(id); + return bytesConvert.convertFromBytes(valueBytes, 0, valueBytes.length); + } + + final protected int calcSeqNoFromId(int id) { + int seq = id - baseId; + if (seq < 0 || seq >= getSize()) { + throw new IllegalArgumentException("Not a valid ID: " + id); + } + return seq; + } + + final public void enableCache() { + this.enableValueCache = true; + if (this.valueToIdCache == null) + this.valueToIdCache = new SoftReference<Map>(new ConcurrentHashMap()); + if (this.idToValueCache == null) + this.idToValueCache = new SoftReference<Object[]>(new Object[getSize()]); + } + + final public void disableCache() { + this.enableValueCache = false; + this.valueToIdCache = null; + this.idToValueCache = null; + } +} http://git-wip-us.apache.org/repos/asf/kylin/blob/1d6a36bf/core-dictionary/src/main/java/org/apache/kylin/dict/DateStrDictionary.java ---------------------------------------------------------------------- diff --git a/core-dictionary/src/main/java/org/apache/kylin/dict/DateStrDictionary.java b/core-dictionary/src/main/java/org/apache/kylin/dict/DateStrDictionary.java index ee8534f..29bbee2 100644 --- a/core-dictionary/src/main/java/org/apache/kylin/dict/DateStrDictionary.java +++ b/core-dictionary/src/main/java/org/apache/kylin/dict/DateStrDictionary.java @@ -156,6 +156,7 @@ public class DateStrDictionary extends Dictionary<String> { init(pattern, baseId); } + @Override public int hashCode() { return 31 * baseId + pattern.hashCode(); http://git-wip-us.apache.org/repos/asf/kylin/blob/1d6a36bf/core-dictionary/src/main/java/org/apache/kylin/dict/NumberDictionary.java ---------------------------------------------------------------------- diff --git a/core-dictionary/src/main/java/org/apache/kylin/dict/NumberDictionary.java b/core-dictionary/src/main/java/org/apache/kylin/dict/NumberDictionary.java index 12efbd3..f1b1b3d 100644 --- a/core-dictionary/src/main/java/org/apache/kylin/dict/NumberDictionary.java +++ b/core-dictionary/src/main/java/org/apache/kylin/dict/NumberDictionary.java @@ -194,30 +194,6 @@ public class NumberDictionary<T> extends TrieDictionary<T> { return codec.decodeNumber(returnValue, offset); } - @Override - public void enableIdToValueBytesCache() { - enableIdToValueBytesCache(new EnableIdToValueBytesCacheVisitor() { - NumberBytesCodec codec = getCodec(); - byte[] tmp = new byte[getSizeOfValue()]; - - @Override - public byte[] getBuffer() { - return codec.buf; - } - - @Override - public byte[] makeValueBytes(byte[] buf, int length) { - // the given buf is the codec buf, which we returned in getBuffer() - codec.bufOffset = 0; - codec.bufLen = length; - int numLen = codec.decodeNumber(tmp, 0); - - byte[] result = new byte[numLen]; - System.arraycopy(tmp, 0, result, 0, numLen); - return result; - } - }); - } public static void main(String[] args) throws Exception { NumberDictionaryBuilder<String> b = new NumberDictionaryBuilder<String>(new StringBytesConverter()); @@ -227,7 +203,7 @@ public class NumberDictionary<T> extends TrieDictionary<T> { b.addValue("7"); TrieDictionary<String> dict = b.build(0); - dict.enableIdToValueBytesCache(); + //dict.enableIdToValueBytesCache(); for (int i = 0; i <= dict.getMaxId(); i++) { System.out.println(Bytes.toString(dict.getValueBytesFromId(i))); } http://git-wip-us.apache.org/repos/asf/kylin/blob/1d6a36bf/core-dictionary/src/main/java/org/apache/kylin/dict/TimeStrDictionary.java ---------------------------------------------------------------------- diff --git a/core-dictionary/src/main/java/org/apache/kylin/dict/TimeStrDictionary.java b/core-dictionary/src/main/java/org/apache/kylin/dict/TimeStrDictionary.java index fc3db5f..eabc9f1 100644 --- a/core-dictionary/src/main/java/org/apache/kylin/dict/TimeStrDictionary.java +++ b/core-dictionary/src/main/java/org/apache/kylin/dict/TimeStrDictionary.java @@ -147,4 +147,5 @@ public class TimeStrDictionary extends Dictionary<String> { @Override public void readFields(DataInput in) throws IOException { } + } http://git-wip-us.apache.org/repos/asf/kylin/blob/1d6a36bf/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionary.java ---------------------------------------------------------------------- diff --git a/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionary.java b/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionary.java index c099de0..957207e 100644 --- a/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionary.java +++ b/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionary.java @@ -27,10 +27,8 @@ import java.io.IOException; import java.io.ObjectInputStream; import java.io.ObjectOutputStream; import java.io.PrintStream; -import java.lang.ref.SoftReference; import java.util.Arrays; -import java.util.Map; -import java.util.concurrent.ConcurrentHashMap; + import org.apache.kylin.common.util.Bytes; import org.apache.kylin.common.util.BytesUtil; @@ -55,7 +53,7 @@ import com.google.common.base.Preconditions; * @author yangli9 */ @SuppressWarnings({ "rawtypes", "unchecked" }) -public class TrieDictionary<T> extends Dictionary<T> { +public class TrieDictionary<T> extends CacheDictionary<T> { private static final long serialVersionUID = 1L; public static final byte[] MAGIC = new byte[] { 0x54, 0x72, 0x69, 0x65, 0x44, 0x69, 0x63, 0x74 }; // "TrieDict" @@ -74,21 +72,13 @@ public class TrieDictionary<T> extends Dictionary<T> { transient private int bodyLen; transient private int sizeChildOffset; transient private int sizeNoValuesBeneath; - transient private int baseId; transient private int maxValueLength; - transient private BytesConverter<T> bytesConvert; transient private int nValues; transient private int sizeOfId; transient private long childOffsetMask; transient private int firstByteOffset; - transient private boolean enableValueCache = true; - transient private SoftReference<Map> valueToIdCache; - transient private SoftReference<Object[]> idToValueCache; - - transient private boolean enableIdToValueBytesCache = false; - transient private byte[][] idToValueBytesCache; public TrieDictionary() { // default constructor for Writable interface } @@ -120,16 +110,13 @@ public class TrieDictionary<T> extends Dictionary<T> { this.sizeOfId = BytesUtil.sizeForValue(baseId + nValues + 1); // note baseId could raise 1 byte in ID space, +1 to reserve all 0xFF for NULL case this.childOffsetMask = ~((long) (BIT_IS_LAST_CHILD | BIT_IS_END_OF_VALUE) << ((sizeChildOffset - 1) * 8)); this.firstByteOffset = sizeChildOffset + sizeNoValuesBeneath + 1; // the offset from begin of node to its first value byte + enableCache(); } catch (Exception e) { if (e instanceof RuntimeException) throw (RuntimeException) e; else throw new RuntimeException(e); } - if (enableValueCache) { - valueToIdCache = new SoftReference<Map>(new ConcurrentHashMap()); - idToValueCache = new SoftReference<Object[]>(new Object[nValues]); - } } @Override @@ -152,26 +139,6 @@ public class TrieDictionary<T> extends Dictionary<T> { return maxValueLength; } - @Override - final protected int getIdFromValueImpl(T value, int roundingFlag) { - if (enableValueCache && roundingFlag == 0) { - Map cache = valueToIdCache.get(); // SoftReference to skip cache gracefully when short of memory - if (cache != null) { - Integer id = null; - id = (Integer) cache.get(value); - if (id != null) - return id.intValue(); - - byte[] valueBytes = bytesConvert.convertToBytes(value); - id = getIdFromValueBytes(valueBytes, 0, valueBytes.length, roundingFlag); - - cache.put(value, id); - return id; - } - } - byte[] valueBytes = bytesConvert.convertToBytes(value); - return getIdFromValueBytes(valueBytes, 0, valueBytes.length, roundingFlag); - } @Override protected int getIdFromValueBytesImpl(byte[] value, int offset, int len, int roundingFlag) { @@ -267,35 +234,9 @@ public class TrieDictionary<T> extends Dictionary<T> { return k; } - @Override - final protected T getValueFromIdImpl(int id) { - if (enableValueCache) { - Object[] cache = idToValueCache.get(); // SoftReference to skip cache gracefully when short of memory - if (cache != null) { - int seq = calcSeqNoFromId(id); - if (cache[seq] != null) - return (T) cache[seq]; - - byte[] value = new byte[getSizeOfValue()]; - int length = getValueBytesFromId(id, value, 0); - T result = bytesConvert.convertFromBytes(value, 0, length); - - cache[seq] = result; - return result; - } - } - byte[] value = new byte[getSizeOfValue()]; - int length = getValueBytesFromId(id, value, 0); - return bytesConvert.convertFromBytes(value, 0, length); - } @Override protected byte[] getValueBytesFromIdImpl(int id) { - if (enableIdToValueBytesCache) { - int seq = calcSeqNoFromId(id); - return idToValueBytesCache[seq]; - } - byte[] buf = new byte[maxValueLength]; int len = getValueBytesFromIdImpl(id, buf, 0); @@ -363,68 +304,6 @@ public class TrieDictionary<T> extends Dictionary<T> { } } - public void enableIdToValueBytesCache() { - enableIdToValueBytesCache(new EnableIdToValueBytesCacheVisitor() { - @Override - public byte[] getBuffer() { - return new byte[getSizeOfValue()]; - } - - @Override - public byte[] makeValueBytes(byte[] buf, int length) { - byte[] valueBytes = new byte[length]; - System.arraycopy(buf, 0, valueBytes, 0, length); - return valueBytes; - } - }); - } - - interface EnableIdToValueBytesCacheVisitor { - byte[] getBuffer(); - - byte[] makeValueBytes(byte[] buf, int length); - } - - protected void enableIdToValueBytesCache(EnableIdToValueBytesCacheVisitor visitor) { - enableIdToValueBytesCache = true; - idToValueBytesCache = new byte[nValues][]; - enableIdToValueBytesCache_recursion(headSize, 0, visitor.getBuffer(), 0, visitor); - } - - private void enableIdToValueBytesCache_recursion(int n, int seq, byte[] buf, int tail, EnableIdToValueBytesCacheVisitor visitor) { - // write current node value - int p = n + firstByteOffset; - int len = BytesUtil.readUnsigned(trieBytes, p - 1, 1); - System.arraycopy(trieBytes, p, buf, tail, len); - tail += len; - - // if the value is ended - boolean isEndOfValue = checkFlag(n, BIT_IS_END_OF_VALUE); - if (isEndOfValue) { - idToValueBytesCache[seq] = visitor.makeValueBytes(buf, tail); - seq++; - } - - // find a child to continue - int c = getChildOffset(n); - if (c == headSize) // has no children - return; - - // process each child - while (true) { - enableIdToValueBytesCache_recursion(c, seq, buf, tail, visitor); - - int nValuesBeneath = BytesUtil.readUnsigned(trieBytes, c + sizeChildOffset, sizeNoValuesBeneath); - seq += nValuesBeneath; - - // go next child - if (checkFlag(c, BIT_IS_LAST_CHILD)) - break; // no more child? we are done - p = c + firstByteOffset; - c = p + BytesUtil.readUnsigned(trieBytes, p - 1, 1); - } - } - private boolean checkFlag(int offset, int bit) { return (trieBytes[offset] & bit) > 0; } @@ -436,14 +315,6 @@ public class TrieDictionary<T> extends Dictionary<T> { return baseId + seq; } - private int calcSeqNoFromId(int id) { - int seq = id - baseId; - if (seq < 0 || seq >= nValues) { - throw new IllegalArgumentException("Not a valid ID: " + id); - } - return seq; - } - @Override public void write(DataOutput out) throws IOException { out.write(trieBytes); @@ -552,7 +423,7 @@ public class TrieDictionary<T> extends Dictionary<T> { Preconditions.checkArgument(dict2.contains(dict)); Preconditions.checkArgument(dict.equals(dict2)); - dict2.enableIdToValueBytesCache(); + //dict2.enableIdToValueBytesCache(); for (int i = 0; i <= dict.getMaxId(); i++) { System.out.println(Bytes.toString(dict.getValueBytesFromId(i))); } http://git-wip-us.apache.org/repos/asf/kylin/blob/1d6a36bf/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionaryForest.java ---------------------------------------------------------------------- diff --git a/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionaryForest.java b/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionaryForest.java index e746348..c655854 100755 --- a/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionaryForest.java +++ b/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionaryForest.java @@ -27,6 +27,7 @@ import java.util.ArrayList; import java.util.Collections; import java.util.List; + import org.apache.kylin.common.util.ByteArray; import org.apache.kylin.common.util.Bytes; import org.apache.kylin.common.util.BytesUtil; @@ -40,18 +41,14 @@ import org.apache.kylin.common.util.Dictionary; * <p> * Created by xiefan on 16-10-26. */ -public class TrieDictionaryForest<T> extends Dictionary<T> { +public class TrieDictionaryForest<T> extends CacheDictionary<T> { private static final long serialVersionUID = 1L; private ArrayList<TrieDictionary<T>> trees; private ArrayList<ByteArray> valueDivide; - private ArrayList<Integer> accuOffset; //find tree - - private BytesConverter<T> bytesConvert; - - private int baseId; + private ArrayList<Integer> accuOffset; private ArrayList<ByteArray> maxValue; @@ -66,6 +63,7 @@ public class TrieDictionaryForest<T> extends Dictionary<T> { this.bytesConvert = bytesConverter; this.baseId = baseId; initMaxValue(); + enableCache(); } @Override @@ -102,12 +100,6 @@ public class TrieDictionaryForest<T> extends Dictionary<T> { return maxValue; } - // value --> id - @Override - protected int getIdFromValueImpl(T value, int roundingFlag) throws IllegalArgumentException { - byte[] valueBytes = bytesConvert.convertToBytes(value); - return getIdFromValueBytesImpl(valueBytes, 0, valueBytes.length, roundingFlag); - } @Override protected int getIdFromValueBytesImpl(byte[] value, int offset, int len, int roundingFlag) throws IllegalArgumentException { @@ -148,15 +140,7 @@ public class TrieDictionaryForest<T> extends Dictionary<T> { return id; } - @Override - protected T getValueFromIdImpl(int id) throws IllegalArgumentException { - byte[] data = getValueBytesFromIdImpl(id); - if (data != null) { - return bytesConvert.convertFromBytes(data, 0, data.length); - } else { - return null; - } - } + @Override protected int getValueBytesFromIdImpl(int id, byte[] returnValue, int offset) throws IllegalArgumentException { @@ -271,6 +255,7 @@ public class TrieDictionaryForest<T> extends Dictionary<T> { trees.add(dict); } initMaxValue(); + enableCache(); } catch (Exception e) { if (e instanceof RuntimeException) throw (RuntimeException) e; @@ -383,22 +368,4 @@ public class TrieDictionaryForest<T> extends Dictionary<T> { } } - public static void main(String[] args) { - ArrayList<String> list = new ArrayList<>(); - list.add("ä¸"); - list.add("äº"); - list.add("ä¸"); - list.add(""); - list.add("part"); - list.add("par"); - list.add("partition"); - list.add("party"); - list.add("parties"); - list.add("paint"); - Collections.sort(list); - for (String str : list) { - System.out.println("found value:" + str + " index:" + lowerBound(str, list)); - } - } - } http://git-wip-us.apache.org/repos/asf/kylin/blob/1d6a36bf/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionaryForestBuilder.java ---------------------------------------------------------------------- diff --git a/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionaryForestBuilder.java b/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionaryForestBuilder.java index 4ee30f0..af2e302 100755 --- a/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionaryForestBuilder.java +++ b/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionaryForestBuilder.java @@ -111,7 +111,6 @@ public class TrieDictionaryForestBuilder<T> { reset(); } TrieDictionaryForest<T> forest = new TrieDictionaryForest<T>(this.trees, this.valueDivide, this.accuOffset, this.bytesConverter, baseId); - // if input values are not in ascending order and tree num>1,TrieDictionaryForest can not work correctly. if (forest.getTrees().size() > 1 && !isOrdered) { throw new IllegalStateException("Invalid input data. Unordered data can not be split into multi trees"); http://git-wip-us.apache.org/repos/asf/kylin/blob/1d6a36bf/core-dictionary/src/test/java/org/apache/kylin/dict/MultipleDictionaryValueEnumeratorTest.java ---------------------------------------------------------------------- diff --git a/core-dictionary/src/test/java/org/apache/kylin/dict/MultipleDictionaryValueEnumeratorTest.java b/core-dictionary/src/test/java/org/apache/kylin/dict/MultipleDictionaryValueEnumeratorTest.java index ad166c2..73e0935 100644 --- a/core-dictionary/src/test/java/org/apache/kylin/dict/MultipleDictionaryValueEnumeratorTest.java +++ b/core-dictionary/src/test/java/org/apache/kylin/dict/MultipleDictionaryValueEnumeratorTest.java @@ -158,6 +158,7 @@ public class MultipleDictionaryValueEnumeratorTest { @Override public void readFields(DataInput in) throws IOException {} + @Override public boolean contains(Dictionary another) { return false; http://git-wip-us.apache.org/repos/asf/kylin/blob/1d6a36bf/core-dictionary/src/test/java/org/apache/kylin/dict/TrieDictionaryForestBenchmark.java ---------------------------------------------------------------------- diff --git a/core-dictionary/src/test/java/org/apache/kylin/dict/TrieDictionaryForestBenchmark.java b/core-dictionary/src/test/java/org/apache/kylin/dict/TrieDictionaryForestBenchmark.java new file mode 100644 index 0000000..0b4c0e3 --- /dev/null +++ b/core-dictionary/src/test/java/org/apache/kylin/dict/TrieDictionaryForestBenchmark.java @@ -0,0 +1,180 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. +*/ + +package org.apache.kylin.dict; + +import org.apache.kylin.common.util.Dictionary; +import org.junit.Before; +import org.junit.Ignore; +import org.junit.Test; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Random; +import java.util.UUID; + +/** + * Created by xiefan on 16-12-28. + */ +@Ignore +public class TrieDictionaryForestBenchmark { + + private static final Random rand = new Random(System.currentTimeMillis()); + + private CacheDictionary<String> oldDict; + + private CacheDictionary<String> newDict; + + private ArrayList<String> rawData; + + private int cardnality = 100; + + private int testTimes = 100000; + + @Before + public void before() { + int dataSize = 100 * 10000; + TrieDictionaryBuilder<String> b1 = new TrieDictionaryBuilder<>(new StringBytesConverter()); + TrieDictionaryForestBuilder<String> b2 = new TrieDictionaryForestBuilder<String>(new StringBytesConverter(), 0, 5); + this.rawData = genStringDataSet(dataSize); + for (String str : this.rawData) { + b1.addValue(str); + b2.addValue(str); + } + this.oldDict = b1.build(0); + this.newDict = b2.build(); + System.out.println("new dict split tree size : " + ((TrieDictionaryForest<String>) newDict).getTrees().size()); + } + + @Test + public void testAll() { + benchmarkWithoutCache(); + benchmarkWithCache(); + } + + @Test + public void benchmarkWithoutCache() { + oldDict.disableCache(); + newDict.disableCache(); + runBenchmark("benchmarkWithoutCache"); + } + + @Test + public void benchmarkWithCache() { + oldDict.enableCache(); + newDict.enableCache(); + runBenchmark("benchmarkWithCache"); + } + + private void runBenchmark(String testName) { + long oldTime = runQueryValue(oldDict, cardnality, testTimes); + long oldTime2 = runQueryId(rawData, oldDict, cardnality, testTimes); + long oldTime3 = runQueryValueBytes(oldDict, cardnality, testTimes); + long oldTime4 = runQueryValueBytes2(oldDict, cardnality, testTimes); + long oldTime5 = runQueryIdByValueBytes(rawData, oldDict, cardnality, testTimes); + long newTime = runQueryValue(newDict, cardnality, testTimes); + long newTime2 = runQueryId(rawData, newDict, cardnality, testTimes); + long newTime3 = runQueryValueBytes(newDict, cardnality, testTimes); + long newTime4 = runQueryValueBytes2(newDict, cardnality, testTimes); + long newTime5 = runQueryIdByValueBytes(rawData, newDict, cardnality, testTimes); + System.out.println(testName); + System.out.println("old dict value --> id : " + oldTime2); + System.out.println("new dict value --> id :" + newTime2); + System.out.println("old dict value bytes --> id : " + oldTime5); + System.out.println("new dict value bytes--> id :" + newTime5); + System.out.println("old dict id --> value : " + oldTime); + System.out.println("new dict id --> value : " + newTime); + System.out.println("old dict id --> value bytes : " + oldTime3); + System.out.println("new dict id --> value bytes : " + newTime3); + System.out.println("old dict id --> value bytes (method 2): " + oldTime4); + System.out.println("new dict id --> value bytes (method 2): " + newTime4); + } + + //id -- value + private long runQueryValue(Dictionary<String> dict, int cardnality, int testTimes) { + long startTime = System.currentTimeMillis(); + int step = 1; + for (int i = 0; i < testTimes; i++) { + for (int j = 0; j < cardnality; j++) { + step |= dict.getValueFromId(j).length(); + } + } + return System.currentTimeMillis() - startTime; + } + + private long runQueryValueBytes(Dictionary<String> dict, int cardnality, int testTimes) { + long startTime = System.currentTimeMillis(); + int step = 1; + for (int i = 0; i < testTimes; i++) { + for (int j = 0; j < cardnality; j++) { + step |= dict.getValueBytesFromId(j).length; + } + } + return System.currentTimeMillis() - startTime; + } + + private long runQueryValueBytes2(Dictionary<String> dict, int cardnality, int testTimes) { + long startTime = System.currentTimeMillis(); + int step = 1; + byte[] returnValue = new byte[2048]; + for (int i = 0; i < testTimes; i++) { + for (int j = 0; j < cardnality; j++) { + int size = dict.getValueBytesFromId(j, returnValue, 0); + step |= size; + } + } + return System.currentTimeMillis() - startTime; + } + + private long runQueryId(ArrayList<String> rawData, Dictionary<String> dict, int cardnality, int testTimes) { + long startTime = System.currentTimeMillis(); + int step = 1; + for (int i = 0; i < testTimes; i++) { + for (int j = 0; j < cardnality; j++) { + step |= dict.getIdFromValue(rawData.get(j)); + } + } + return System.currentTimeMillis() - startTime; + } + + private long runQueryIdByValueBytes(ArrayList<String> rawData, Dictionary<String> dict, int cardnality, int testTimes) { + List<byte[]> testBytes = new ArrayList<>(); + StringBytesConverter converter = new StringBytesConverter(); + for (int i = 0; i < cardnality; i++) { + testBytes.add(converter.convertToBytes(rawData.get(i))); + } + long startTime = System.currentTimeMillis(); + int step = 1; + for (int i = 0; i < testTimes; i++) { + for (int j = 0; j < cardnality; j++) { + step |= dict.getIdFromValueBytes(testBytes.get(j), 0, testBytes.get(j).length); + } + } + return System.currentTimeMillis() - startTime; + } + + private ArrayList<String> genStringDataSet(int totalSize) { + ArrayList<String> data = new ArrayList<>(); + for (int i = 0; i < totalSize; i++) { + data.add(UUID.randomUUID().toString()); + } + Collections.sort(data); + return data; + } +} http://git-wip-us.apache.org/repos/asf/kylin/blob/1d6a36bf/core-dictionary/src/test/java/org/apache/kylin/dict/TrieDictionaryForestTest.java ---------------------------------------------------------------------- diff --git a/core-dictionary/src/test/java/org/apache/kylin/dict/TrieDictionaryForestTest.java b/core-dictionary/src/test/java/org/apache/kylin/dict/TrieDictionaryForestTest.java index ee092c9..68cf301 100755 --- a/core-dictionary/src/test/java/org/apache/kylin/dict/TrieDictionaryForestTest.java +++ b/core-dictionary/src/test/java/org/apache/kylin/dict/TrieDictionaryForestTest.java @@ -58,7 +58,7 @@ public class TrieDictionaryForestTest { TrieDictionaryForest<String> dict = builder.build(); assertSameBehaviorAsTrie(dict, strs, 0); } - + @Test public void testBasicFound() { ArrayList<String> strs = new ArrayList<String>(); @@ -106,7 +106,7 @@ public class TrieDictionaryForestTest { assertEquals(expectId, dict.getIdFromValue(s)); expectId++; } - + assertSameBehaviorAsTrie(dict, strs, baseId); } @@ -128,7 +128,7 @@ public class TrieDictionaryForestTest { assertEquals(255, id); id = dict.getIdFromValue(null, -1); assertEquals(255, id); - + assertSameBehaviorAsTrie(dict, strs, 0); } @@ -263,7 +263,6 @@ public class TrieDictionaryForestTest { } @Test - @Ignore public void categoryNamesTest() throws Exception { InputStream is = new FileInputStream("src/test/resources/dict/dw_category_grouping_names.dat"); ArrayList<String> str = loadStrings(is); @@ -284,19 +283,19 @@ public class TrieDictionaryForestTest { } @Test - public void emptyDictTest() throws Exception{ + public void emptyDictTest() throws Exception { TrieDictionaryForestBuilder<String> b = new TrieDictionaryForestBuilder<String>(new StringBytesConverter()); TrieDictionaryForest<String> dict = b.build(); - try{ + try { int id = dict.getIdFromValue("123", 0); fail("id should not exist"); - }catch (IllegalArgumentException e){ + } catch (IllegalArgumentException e) { //right } - try{ + try { String value = dict.getValueFromIdImpl(123); fail("value should not exist"); - }catch (IllegalArgumentException e){ + } catch (IllegalArgumentException e) { //right } } @@ -732,12 +731,6 @@ public class TrieDictionaryForestTest { System.out.println("compare build time. Old trie : " + oldDictTotalBuildTime / 1000.0 + "s.New trie : " + newDictTotalBuildTime / 1000.0 + "s"); } - @Test - public void queryTimeBenchmarkTest() throws Exception { - int count = (int) (Integer.MAX_VALUE * 0.8 / 640); - benchmarkStringDictionary(new RandomStrings(count)); - } - private void evaluateDataSize(ArrayList<String> list) { long size = 0; for (String str : list) @@ -867,7 +860,6 @@ public class TrieDictionaryForestTest { int baseId = new Random().nextInt(100); TrieDictionaryForestBuilder<String> b = newDictBuilder(str, baseId, 2); TrieDictionaryForest<String> dict = b.build(); - //dict.dump(System.out); TreeSet<String> set = new TreeSet<String>(); for (String s : str) { set.add(s); @@ -881,7 +873,7 @@ public class TrieDictionaryForestTest { int id = baseId; for (; it.hasNext(); id++) { String value = it.next(); - // System.out.println("checking " + id + " <==> " + value); + //System.out.println("checking " + id + " <==> " + value); assertEquals(id, dict.getIdFromValue(value)); assertEquals(value, dict.getValueFromId(id)); @@ -892,7 +884,7 @@ public class TrieDictionaryForestTest { for (String s : notFound) { try { int nullId = dict.getIdFromValue(s); - System.out.println("null value id:" + nullId); + //System.out.println("null value id:" + nullId); fail("For not found value '" + s + "', IllegalArgumentException is expected"); } catch (IllegalArgumentException e) { // good @@ -938,8 +930,9 @@ public class TrieDictionaryForestTest { public static TrieDictionaryForestBuilder<String> newDictBuilder(Iterable<String> strs, int baseId, int treeSize) { TrieDictionaryForestBuilder<String> b = new TrieDictionaryForestBuilder<String>(new StringBytesConverter(), baseId); b.setMaxTrieTreeSize(treeSize); - for (String s : strs) + for (String s : strs) { b.addValue(s); + } return b; } @@ -950,7 +943,7 @@ public class TrieDictionaryForestTest { b.addValue(strs.next()); return b; } - + private static class RandomStrings implements Iterable<String> { final private int size; @@ -1039,7 +1032,7 @@ public class TrieDictionaryForestTest { trieBuilder.addValue(s); } TrieDictionary<String> trie = trieBuilder.build(baseId); - + assertEquals(trie.getMaxId(), dict.getMaxId()); assertEquals(trie.getMinId(), dict.getMinId()); assertEquals(trie.getSize(), dict.getSize());
