Repository: sentry Updated Branches: refs/heads/master b5fadbb1e -> d9d5b4e49
SENTRY-1827: Minimize TPathsDump thrift message used in HDFS sync (Misha Dmitriev, reviewed by Alex Kolbasov) Project: http://git-wip-us.apache.org/repos/asf/sentry/repo Commit: http://git-wip-us.apache.org/repos/asf/sentry/commit/d9d5b4e4 Tree: http://git-wip-us.apache.org/repos/asf/sentry/tree/d9d5b4e4 Diff: http://git-wip-us.apache.org/repos/asf/sentry/diff/d9d5b4e4 Branch: refs/heads/master Commit: d9d5b4e49e18f8f6a6b20833196f990bd2acbf37 Parents: b5fadbb Author: Alexander Kolbasov <[email protected]> Authored: Mon Jul 10 22:42:10 2017 +0200 Committer: Alexander Kolbasov <[email protected]> Committed: Mon Jul 10 22:42:10 2017 +0200 ---------------------------------------------------------------------- .../sentry/hdfs/service/thrift/TPathsDump.java | 193 +++++++++++++++++-- .../apache/sentry/hdfs/AuthzPathsDumper.java | 16 +- .../org/apache/sentry/hdfs/HMSPathsDumper.java | 178 +++++++++++++++-- .../sentry/hdfs/UpdateableAuthzPaths.java | 7 +- .../main/resources/sentry_hdfs_service.thrift | 1 + .../sentry/hdfs/TestHMSPathsFullDump.java | 63 ++++-- .../sentry/hdfs/TestUpdateableAuthzPaths.java | 4 +- 7 files changed, 410 insertions(+), 52 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/sentry/blob/d9d5b4e4/sentry-hdfs/sentry-hdfs-common/src/gen/thrift/gen-javabean/org/apache/sentry/hdfs/service/thrift/TPathsDump.java ---------------------------------------------------------------------- diff --git a/sentry-hdfs/sentry-hdfs-common/src/gen/thrift/gen-javabean/org/apache/sentry/hdfs/service/thrift/TPathsDump.java b/sentry-hdfs/sentry-hdfs-common/src/gen/thrift/gen-javabean/org/apache/sentry/hdfs/service/thrift/TPathsDump.java index c1b42a0..df5b7b1 100644 --- a/sentry-hdfs/sentry-hdfs-common/src/gen/thrift/gen-javabean/org/apache/sentry/hdfs/service/thrift/TPathsDump.java +++ b/sentry-hdfs/sentry-hdfs-common/src/gen/thrift/gen-javabean/org/apache/sentry/hdfs/service/thrift/TPathsDump.java @@ -34,12 +34,13 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; @SuppressWarnings({"cast", "rawtypes", "serial", "unchecked"}) -@Generated(value = "Autogenerated by Thrift Compiler (0.9.3)", date = "2016-05-05") +@Generated(value = "Autogenerated by Thrift Compiler (0.9.3)", date = "2017-06-30") public class TPathsDump implements org.apache.thrift.TBase<TPathsDump, TPathsDump._Fields>, java.io.Serializable, Cloneable, Comparable<TPathsDump> { private static final org.apache.thrift.protocol.TStruct STRUCT_DESC = new org.apache.thrift.protocol.TStruct("TPathsDump"); private static final org.apache.thrift.protocol.TField ROOT_ID_FIELD_DESC = new org.apache.thrift.protocol.TField("rootId", org.apache.thrift.protocol.TType.I32, (short)1); private static final org.apache.thrift.protocol.TField NODE_MAP_FIELD_DESC = new org.apache.thrift.protocol.TField("nodeMap", org.apache.thrift.protocol.TType.MAP, (short)2); + private static final org.apache.thrift.protocol.TField DUP_STRING_VALUES_FIELD_DESC = new org.apache.thrift.protocol.TField("dupStringValues", org.apache.thrift.protocol.TType.LIST, (short)3); private static final Map<Class<? extends IScheme>, SchemeFactory> schemes = new HashMap<Class<? extends IScheme>, SchemeFactory>(); static { @@ -49,11 +50,13 @@ public class TPathsDump implements org.apache.thrift.TBase<TPathsDump, TPathsDum private int rootId; // required private Map<Integer,TPathEntry> nodeMap; // required + private List<String> dupStringValues; // optional /** The set of fields this struct contains, along with convenience methods for finding and manipulating them. */ public enum _Fields implements org.apache.thrift.TFieldIdEnum { ROOT_ID((short)1, "rootId"), - NODE_MAP((short)2, "nodeMap"); + NODE_MAP((short)2, "nodeMap"), + DUP_STRING_VALUES((short)3, "dupStringValues"); private static final Map<String, _Fields> byName = new HashMap<String, _Fields>(); @@ -72,6 +75,8 @@ public class TPathsDump implements org.apache.thrift.TBase<TPathsDump, TPathsDum return ROOT_ID; case 2: // NODE_MAP return NODE_MAP; + case 3: // DUP_STRING_VALUES + return DUP_STRING_VALUES; default: return null; } @@ -114,6 +119,7 @@ public class TPathsDump implements org.apache.thrift.TBase<TPathsDump, TPathsDum // isset id assignments private static final int __ROOTID_ISSET_ID = 0; private byte __isset_bitfield = 0; + private static final _Fields optionals[] = {_Fields.DUP_STRING_VALUES}; public static final Map<_Fields, org.apache.thrift.meta_data.FieldMetaData> metaDataMap; static { Map<_Fields, org.apache.thrift.meta_data.FieldMetaData> tmpMap = new EnumMap<_Fields, org.apache.thrift.meta_data.FieldMetaData>(_Fields.class); @@ -123,6 +129,9 @@ public class TPathsDump implements org.apache.thrift.TBase<TPathsDump, TPathsDum new org.apache.thrift.meta_data.MapMetaData(org.apache.thrift.protocol.TType.MAP, new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.I32), new org.apache.thrift.meta_data.StructMetaData(org.apache.thrift.protocol.TType.STRUCT, TPathEntry.class)))); + tmpMap.put(_Fields.DUP_STRING_VALUES, new org.apache.thrift.meta_data.FieldMetaData("dupStringValues", org.apache.thrift.TFieldRequirementType.OPTIONAL, + new org.apache.thrift.meta_data.ListMetaData(org.apache.thrift.protocol.TType.LIST, + new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.STRING)))); metaDataMap = Collections.unmodifiableMap(tmpMap); org.apache.thrift.meta_data.FieldMetaData.addStructMetaDataMap(TPathsDump.class, metaDataMap); } @@ -161,6 +170,10 @@ public class TPathsDump implements org.apache.thrift.TBase<TPathsDump, TPathsDum } this.nodeMap = __this__nodeMap; } + if (other.isSetDupStringValues()) { + List<String> __this__dupStringValues = new ArrayList<String>(other.dupStringValues); + this.dupStringValues = __this__dupStringValues; + } } public TPathsDump deepCopy() { @@ -172,6 +185,7 @@ public class TPathsDump implements org.apache.thrift.TBase<TPathsDump, TPathsDum setRootIdIsSet(false); this.rootId = 0; this.nodeMap = null; + this.dupStringValues = null; } public int getRootId() { @@ -230,6 +244,44 @@ public class TPathsDump implements org.apache.thrift.TBase<TPathsDump, TPathsDum } } + public int getDupStringValuesSize() { + return (this.dupStringValues == null) ? 0 : this.dupStringValues.size(); + } + + public java.util.Iterator<String> getDupStringValuesIterator() { + return (this.dupStringValues == null) ? null : this.dupStringValues.iterator(); + } + + public void addToDupStringValues(String elem) { + if (this.dupStringValues == null) { + this.dupStringValues = new ArrayList<String>(); + } + this.dupStringValues.add(elem); + } + + public List<String> getDupStringValues() { + return this.dupStringValues; + } + + public void setDupStringValues(List<String> dupStringValues) { + this.dupStringValues = dupStringValues; + } + + public void unsetDupStringValues() { + this.dupStringValues = null; + } + + /** Returns true if field dupStringValues is set (has been assigned a value) and false otherwise */ + public boolean isSetDupStringValues() { + return this.dupStringValues != null; + } + + public void setDupStringValuesIsSet(boolean value) { + if (!value) { + this.dupStringValues = null; + } + } + public void setFieldValue(_Fields field, Object value) { switch (field) { case ROOT_ID: @@ -248,6 +300,14 @@ public class TPathsDump implements org.apache.thrift.TBase<TPathsDump, TPathsDum } break; + case DUP_STRING_VALUES: + if (value == null) { + unsetDupStringValues(); + } else { + setDupStringValues((List<String>)value); + } + break; + } } @@ -259,6 +319,9 @@ public class TPathsDump implements org.apache.thrift.TBase<TPathsDump, TPathsDum case NODE_MAP: return getNodeMap(); + case DUP_STRING_VALUES: + return getDupStringValues(); + } throw new IllegalStateException(); } @@ -274,6 +337,8 @@ public class TPathsDump implements org.apache.thrift.TBase<TPathsDump, TPathsDum return isSetRootId(); case NODE_MAP: return isSetNodeMap(); + case DUP_STRING_VALUES: + return isSetDupStringValues(); } throw new IllegalStateException(); } @@ -309,6 +374,15 @@ public class TPathsDump implements org.apache.thrift.TBase<TPathsDump, TPathsDum return false; } + boolean this_present_dupStringValues = true && this.isSetDupStringValues(); + boolean that_present_dupStringValues = true && that.isSetDupStringValues(); + if (this_present_dupStringValues || that_present_dupStringValues) { + if (!(this_present_dupStringValues && that_present_dupStringValues)) + return false; + if (!this.dupStringValues.equals(that.dupStringValues)) + return false; + } + return true; } @@ -326,6 +400,11 @@ public class TPathsDump implements org.apache.thrift.TBase<TPathsDump, TPathsDum if (present_nodeMap) list.add(nodeMap); + boolean present_dupStringValues = true && (isSetDupStringValues()); + list.add(present_dupStringValues); + if (present_dupStringValues) + list.add(dupStringValues); + return list.hashCode(); } @@ -357,6 +436,16 @@ public class TPathsDump implements org.apache.thrift.TBase<TPathsDump, TPathsDum return lastComparison; } } + lastComparison = Boolean.valueOf(isSetDupStringValues()).compareTo(other.isSetDupStringValues()); + if (lastComparison != 0) { + return lastComparison; + } + if (isSetDupStringValues()) { + lastComparison = org.apache.thrift.TBaseHelper.compareTo(this.dupStringValues, other.dupStringValues); + if (lastComparison != 0) { + return lastComparison; + } + } return 0; } @@ -388,6 +477,16 @@ public class TPathsDump implements org.apache.thrift.TBase<TPathsDump, TPathsDum sb.append(this.nodeMap); } first = false; + if (isSetDupStringValues()) { + if (!first) sb.append(", "); + sb.append("dupStringValues:"); + if (this.dupStringValues == null) { + sb.append("null"); + } else { + sb.append(this.dupStringValues); + } + first = false; + } sb.append(")"); return sb.toString(); } @@ -470,6 +569,24 @@ public class TPathsDump implements org.apache.thrift.TBase<TPathsDump, TPathsDum org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type); } break; + case 3: // DUP_STRING_VALUES + if (schemeField.type == org.apache.thrift.protocol.TType.LIST) { + { + org.apache.thrift.protocol.TList _list52 = iprot.readListBegin(); + struct.dupStringValues = new ArrayList<String>(_list52.size); + String _elem53; + for (int _i54 = 0; _i54 < _list52.size; ++_i54) + { + _elem53 = iprot.readString(); + struct.dupStringValues.add(_elem53); + } + iprot.readListEnd(); + } + struct.setDupStringValuesIsSet(true); + } else { + org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type); + } + break; default: org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type); } @@ -490,15 +607,29 @@ public class TPathsDump implements org.apache.thrift.TBase<TPathsDump, TPathsDum oprot.writeFieldBegin(NODE_MAP_FIELD_DESC); { oprot.writeMapBegin(new org.apache.thrift.protocol.TMap(org.apache.thrift.protocol.TType.I32, org.apache.thrift.protocol.TType.STRUCT, struct.nodeMap.size())); - for (Map.Entry<Integer, TPathEntry> _iter52 : struct.nodeMap.entrySet()) + for (Map.Entry<Integer, TPathEntry> _iter55 : struct.nodeMap.entrySet()) { - oprot.writeI32(_iter52.getKey()); - _iter52.getValue().write(oprot); + oprot.writeI32(_iter55.getKey()); + _iter55.getValue().write(oprot); } oprot.writeMapEnd(); } oprot.writeFieldEnd(); } + if (struct.dupStringValues != null) { + if (struct.isSetDupStringValues()) { + oprot.writeFieldBegin(DUP_STRING_VALUES_FIELD_DESC); + { + oprot.writeListBegin(new org.apache.thrift.protocol.TList(org.apache.thrift.protocol.TType.STRING, struct.dupStringValues.size())); + for (String _iter56 : struct.dupStringValues) + { + oprot.writeString(_iter56); + } + oprot.writeListEnd(); + } + oprot.writeFieldEnd(); + } + } oprot.writeFieldStop(); oprot.writeStructEnd(); } @@ -519,10 +650,24 @@ public class TPathsDump implements org.apache.thrift.TBase<TPathsDump, TPathsDum oprot.writeI32(struct.rootId); { oprot.writeI32(struct.nodeMap.size()); - for (Map.Entry<Integer, TPathEntry> _iter53 : struct.nodeMap.entrySet()) + for (Map.Entry<Integer, TPathEntry> _iter57 : struct.nodeMap.entrySet()) + { + oprot.writeI32(_iter57.getKey()); + _iter57.getValue().write(oprot); + } + } + BitSet optionals = new BitSet(); + if (struct.isSetDupStringValues()) { + optionals.set(0); + } + oprot.writeBitSet(optionals, 1); + if (struct.isSetDupStringValues()) { { - oprot.writeI32(_iter53.getKey()); - _iter53.getValue().write(oprot); + oprot.writeI32(struct.dupStringValues.size()); + for (String _iter58 : struct.dupStringValues) + { + oprot.writeString(_iter58); + } } } } @@ -533,19 +678,33 @@ public class TPathsDump implements org.apache.thrift.TBase<TPathsDump, TPathsDum struct.rootId = iprot.readI32(); struct.setRootIdIsSet(true); { - org.apache.thrift.protocol.TMap _map54 = new org.apache.thrift.protocol.TMap(org.apache.thrift.protocol.TType.I32, org.apache.thrift.protocol.TType.STRUCT, iprot.readI32()); - struct.nodeMap = new HashMap<Integer,TPathEntry>(2*_map54.size); - int _key55; - TPathEntry _val56; - for (int _i57 = 0; _i57 < _map54.size; ++_i57) + org.apache.thrift.protocol.TMap _map59 = new org.apache.thrift.protocol.TMap(org.apache.thrift.protocol.TType.I32, org.apache.thrift.protocol.TType.STRUCT, iprot.readI32()); + struct.nodeMap = new HashMap<Integer,TPathEntry>(2*_map59.size); + int _key60; + TPathEntry _val61; + for (int _i62 = 0; _i62 < _map59.size; ++_i62) { - _key55 = iprot.readI32(); - _val56 = new TPathEntry(); - _val56.read(iprot); - struct.nodeMap.put(_key55, _val56); + _key60 = iprot.readI32(); + _val61 = new TPathEntry(); + _val61.read(iprot); + struct.nodeMap.put(_key60, _val61); } } struct.setNodeMapIsSet(true); + BitSet incoming = iprot.readBitSet(1); + if (incoming.get(0)) { + { + org.apache.thrift.protocol.TList _list63 = new org.apache.thrift.protocol.TList(org.apache.thrift.protocol.TType.STRING, iprot.readI32()); + struct.dupStringValues = new ArrayList<String>(_list63.size); + String _elem64; + for (int _i65 = 0; _i65 < _list63.size; ++_i65) + { + _elem64 = iprot.readString(); + struct.dupStringValues.add(_elem64); + } + } + struct.setDupStringValuesIsSet(true); + } } } http://git-wip-us.apache.org/repos/asf/sentry/blob/d9d5b4e4/sentry-hdfs/sentry-hdfs-common/src/main/java/org/apache/sentry/hdfs/AuthzPathsDumper.java ---------------------------------------------------------------------- diff --git a/sentry-hdfs/sentry-hdfs-common/src/main/java/org/apache/sentry/hdfs/AuthzPathsDumper.java b/sentry-hdfs/sentry-hdfs-common/src/main/java/org/apache/sentry/hdfs/AuthzPathsDumper.java index 0950957..eb47046 100644 --- a/sentry-hdfs/sentry-hdfs-common/src/main/java/org/apache/sentry/hdfs/AuthzPathsDumper.java +++ b/sentry-hdfs/sentry-hdfs-common/src/main/java/org/apache/sentry/hdfs/AuthzPathsDumper.java @@ -15,14 +15,28 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.sentry.hdfs; import org.apache.sentry.hdfs.service.thrift.TPathsDump; public interface AuthzPathsDumper<K extends AuthzPaths> { - TPathsDump createPathsDump(); + /** + * Creates a TPathsDump thrift object from the data in memory (K instance). + * + * @param minimizeSize if true, the code will make an effort to minimize the + * size of the serialized message by, for example, + * performing customized interning of duplicate strings. + * So far this is optional since, in particular, messages + * created with minimizeSize == false are compatible with + * the older TPathsDump messages. + */ + TPathsDump createPathsDump(boolean minimizeSize); + /** + * Creates data in memory (an instance of K) from TPathsDump thrift object. + */ K initializeFromDump(TPathsDump pathsDump); } http://git-wip-us.apache.org/repos/asf/sentry/blob/d9d5b4e4/sentry-hdfs/sentry-hdfs-common/src/main/java/org/apache/sentry/hdfs/HMSPathsDumper.java ---------------------------------------------------------------------- diff --git a/sentry-hdfs/sentry-hdfs-common/src/main/java/org/apache/sentry/hdfs/HMSPathsDumper.java b/sentry-hdfs/sentry-hdfs-common/src/main/java/org/apache/sentry/hdfs/HMSPathsDumper.java index 480a29d..a233b74 100644 --- a/sentry-hdfs/sentry-hdfs-common/src/main/java/org/apache/sentry/hdfs/HMSPathsDumper.java +++ b/sentry-hdfs/sentry-hdfs-common/src/main/java/org/apache/sentry/hdfs/HMSPathsDumper.java @@ -17,9 +17,11 @@ */ package org.apache.sentry.hdfs; +import java.util.Arrays; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; +import java.util.List; import java.util.Map; import java.util.Set; import java.util.concurrent.atomic.AtomicInteger; @@ -47,32 +49,46 @@ public class HMSPathsDumper implements AuthzPathsDumper<HMSPaths> { } @Override - public TPathsDump createPathsDump() { + public TPathsDump createPathsDump(boolean minimizeSize) { + DupDetector dups = null; + if (minimizeSize) { + dups = new DupDetector(); + dups.detectDupPathElements(hmsPaths.getRootEntry()); + } + AtomicInteger counter = new AtomicInteger(0); Map<Integer, TPathEntry> idMap = new HashMap<Integer, TPathEntry>(); Tuple tRootTuple = - createTPathEntry(hmsPaths.getRootEntry(), counter, idMap); + createTPathEntry(hmsPaths.getRootEntry(), counter, idMap, dups); idMap.put(tRootTuple.id, tRootTuple.entry); - cloneToTPathEntry(hmsPaths.getRootEntry(), tRootTuple.entry, counter, idMap); - return new TPathsDump(tRootTuple.id, idMap); + cloneToTPathEntry( + hmsPaths.getRootEntry(), tRootTuple.entry, counter, idMap, dups); + TPathsDump dump = new TPathsDump(tRootTuple.id, idMap); + + if (minimizeSize) { + dump.setDupStringValues(Arrays.asList(dups.getDupStringValues())); + } + return dump; } private void cloneToTPathEntry(Entry parent, TPathEntry tParent, - AtomicInteger counter, Map<Integer, TPathEntry> idMap) { + AtomicInteger counter, Map<Integer, TPathEntry> idMap, DupDetector dups) { for (Entry child : parent.childrenValues()) { - Tuple childTuple = createTPathEntry(child, counter, idMap); + Tuple childTuple = createTPathEntry(child, counter, idMap, dups); tParent.addToChildren(childTuple.id); - cloneToTPathEntry(child, childTuple.entry, counter, idMap); + cloneToTPathEntry(child, childTuple.entry, counter, idMap, dups); } } private Tuple createTPathEntry(Entry entry, AtomicInteger idCounter, - Map<Integer, TPathEntry> idMap) { + Map<Integer, TPathEntry> idMap, DupDetector dups) { int myId = idCounter.incrementAndGet(); Set<Integer> children = entry.hasChildren() ? new HashSet<Integer>(entry.numChildren()) : Collections.<Integer>emptySet(); - TPathEntry tEntry = new TPathEntry(entry.getType().getByte(), - entry.getPathElement(), children); + String pathElement = entry.getPathElement(); + String sameOrReplacementId = + dups != null ? dups.getReplacementString(pathElement) : pathElement; + TPathEntry tEntry = new TPathEntry(entry.getType().getByte(), sameOrReplacementId, children); if (!entry.getAuthzObjs().isEmpty()) { tEntry.setAuthzObjs(entry.getAuthzObjs()); } @@ -87,7 +103,7 @@ public class HMSPathsDumper implements AuthzPathsDumper<HMSPaths> { Entry rootEntry = newHmsPaths.getRootEntry(); Map<String, Set<Entry>> authzObjToPath = new HashMap<String, Set<Entry>>(); cloneToEntry(tRootEntry, rootEntry, pathDump.getNodeMap(), authzObjToPath, - rootEntry.getType() == EntryType.PREFIX); + pathDump.getDupStringValues(), rootEntry.getType() == EntryType.PREFIX); newHmsPaths.setRootEntry(rootEntry); newHmsPaths.setAuthzObjToPathMapping(authzObjToPath); @@ -95,15 +111,22 @@ public class HMSPathsDumper implements AuthzPathsDumper<HMSPaths> { } private void cloneToEntry(TPathEntry tParent, Entry parent, - Map<Integer, TPathEntry> idMap, Map<String, - Set<Entry>> authzObjToPath, boolean hasCrossedPrefix) { + Map<Integer, TPathEntry> idMap, Map<String, Set<Entry>> authzObjToPath, + List<String> dupStringValues, boolean hasCrossedPrefix) { for (Integer id : tParent.getChildren()) { TPathEntry tChild = idMap.get(id); + + String tChildPathElement = tChild.getPathElement(); + if (tChildPathElement.charAt(0) == DupDetector.REPLACEMENT_STRING_PREFIX) { + int dupStrIdx = Integer.parseInt(tChildPathElement.substring(1), 16); + tChildPathElement = dupStringValues.get(dupStrIdx); + } + Entry child = null; boolean isChildPrefix = hasCrossedPrefix; if (!hasCrossedPrefix) { - child = parent.getChild(tChild.getPathElement()); - // If we havn't reached a prefix entry yet, then child should + child = parent.getChild(tChildPathElement); + // If we haven't reached a prefix entry yet, then child should // already exists.. else it is not part of the prefix if (child == null) { continue; @@ -116,7 +139,7 @@ public class HMSPathsDumper implements AuthzPathsDumper<HMSPaths> { } } if (child == null) { - child = new Entry(parent, tChild.getPathElement(), + child = new Entry(parent, tChildPathElement, EntryType.fromByte(tChild.getType()), tChild.getAuthzObjs()); } if (child.getAuthzObjs().size() != 0) { @@ -130,8 +153,129 @@ public class HMSPathsDumper implements AuthzPathsDumper<HMSPaths> { } } parent.putChild(child.getPathElement(), child); - cloneToEntry(tChild, child, idMap, authzObjToPath, isChildPrefix); + cloneToEntry(tChild, child, idMap, authzObjToPath, + dupStringValues, isChildPrefix); } } + /** + * This class wraps a customized hash map that allows us to detect (most of) + * the duplicate strings in the given tree of HMSPaths$Entry objects. The + * hash map has fixed size to avoid bloating memory, especially in the + * situation when there are many strings but little or no duplication. Fixed + * table size also means that the maximum length of replacement IDs that we + * return for duplicate strings, such as ":123", is relatively small, and thus + * they can be effectively used to substitute duplicate strings that are just + * slightly longer. The IDs are generated using a running index within the + * table, so they start from ":0", but eventually can get long if the table + * is too big. + * + * After calling methods in this class to detect duplicates and then to + * obtain a possible encoded substitute for each string, getDupStringValues() + * should be called to obtain the auxiliary string array, which contains + * the real values of encoded duplicate strings. + */ + private static class DupDetector { + // The prefix that we use to distinguish between real path element + // strings and replacement string IDs used for duplicate strings + static final char REPLACEMENT_STRING_PREFIX = ':'; + // Hash map size chosen as a compromise between not using too much memory + // and catching enough of duplicate strings. Should be a power of two. + private static final int TABLE_SIZE = 16 * 1024; + // We assume that an average replacement string looks like ":123". + // We don't encode strings shorter than this length, because it's likely + // that the resulting gain will be negative. + private static final int AVG_ID_LENGTH = 4; + // We replace a string in TPathsDump with an id only if it occurs in the + // message at least this number of times. + private static final int MIN_NUM_DUPLICATES = 2; + // Strings in TPathsDump that we check for duplication. Since our table has + // fixed size, strings with the same hashcode fall into the same table slot, + // and the string that was added last "wins". + private final String[] keys = new String[TABLE_SIZE]; + // During the analysis phase, each value is the number of occurrences + // of the respective key string. Then it's the position of that string + // in the serialized auxiliary string array. + private final int[] values = new int[TABLE_SIZE]; + // Size of the auxiliary string array - essentially the number of duplicate + // strings that we detected and encoded. + private int auxArraySize; + + /** + * Finds duplicate strings in the tree of Entry objects with the given root, + * and fills the internal hash map for subsequent use. + */ + void detectDupPathElements(Entry root) { + inspectEntry(root); + + // Iterate through the table, remove Strings that are not duplicate, + // and associate each duplicate one with its position in the final + // serialized auxiliary string array. + for (int i = 0; i < TABLE_SIZE; i++) { + if (keys[i] != null) { + if (values[i] >= MIN_NUM_DUPLICATES) { + values[i] = auxArraySize++; + } else { // No duplication for this string + keys[i] = null; + values[i] = -1; // Just to mark invalid slots + } + } + } + } + + /** + * For the given original string, returns a shorter substitute string ID, + * such as ":123". The ID starts with a symbol not allowed in normal HDFS + * pathElements, allowing us to later distinguish between normal and + * substitute pathElements. The ID (in hexadecimal format, to make IDs + * shorter) is the index of the original string in the array returned by + * getDupStringValues(). + * + * @param pathElement a string that may be duplicate + * @return if pathElement was previously found to be duplicate, returns + * the replacement ID as described above. Otherwise, returns the + * input string itself. + */ + String getReplacementString(String pathElement) { + int slot = pathElement.hashCode() & (TABLE_SIZE - 1); + return pathElement.equals(keys[slot]) ? + REPLACEMENT_STRING_PREFIX + Integer.toHexString(values[slot]) : pathElement; + } + + /** + * Returns the array of strings that have duplicates and therefore should + * be substituted with respective IDs. See {@link #getReplacementString} + */ + String[] getDupStringValues() { + String[] auxArray = new String[auxArraySize]; + int pos = 0; + for (int i = 0; i < TABLE_SIZE; i++) { + if (keys[i] != null) { + auxArray[pos++] = keys[i]; + } + } + return auxArray; + } + + private void inspectEntry(Entry entry) { + String pathElement = entry.getPathElement(); + if (pathElement.length() > AVG_ID_LENGTH) { + // In the serialized data, it doesn't make sense to replace string origS + // with idS if origS is shorter than the average length of idS. + int slot = pathElement.hashCode() & (TABLE_SIZE - 1); + if (pathElement.equals(keys[slot])) { + values[slot]++; + } else { + // This slot is currently empty, or there is a hash collision. + // Either way, put pathElement there and reset the entry. + keys[slot] = pathElement; + values[slot] = 1; + } + } + + for (Entry child : entry.childrenValues()) { + inspectEntry(child); + } + } + } } http://git-wip-us.apache.org/repos/asf/sentry/blob/d9d5b4e4/sentry-hdfs/sentry-hdfs-common/src/main/java/org/apache/sentry/hdfs/UpdateableAuthzPaths.java ---------------------------------------------------------------------- diff --git a/sentry-hdfs/sentry-hdfs-common/src/main/java/org/apache/sentry/hdfs/UpdateableAuthzPaths.java b/sentry-hdfs/sentry-hdfs-common/src/main/java/org/apache/sentry/hdfs/UpdateableAuthzPaths.java index ad7f8c9..fd5ec4a 100644 --- a/sentry-hdfs/sentry-hdfs-common/src/main/java/org/apache/sentry/hdfs/UpdateableAuthzPaths.java +++ b/sentry-hdfs/sentry-hdfs-common/src/main/java/org/apache/sentry/hdfs/UpdateableAuthzPaths.java @@ -147,7 +147,7 @@ public class UpdateableAuthzPaths implements AuthzPaths, Updateable<PathsUpdate> @Override public PathsUpdate createFullImageUpdate(long currSeqNum) { PathsUpdate pathsUpdate = new PathsUpdate(currSeqNum, true); - pathsUpdate.toThrift().setPathsDump(getPathsDump().createPathsDump()); + pathsUpdate.toThrift().setPathsDump(getPathsDump().createPathsDump(true)); return pathsUpdate; } @@ -156,8 +156,9 @@ public class UpdateableAuthzPaths implements AuthzPaths, Updateable<PathsUpdate> return new AuthzPathsDumper<UpdateableAuthzPaths>() { @Override - public TPathsDump createPathsDump() { - return UpdateableAuthzPaths.this.paths.getPathsDump().createPathsDump(); + public TPathsDump createPathsDump(boolean minimizeSize) { + return UpdateableAuthzPaths.this.paths.getPathsDump(). + createPathsDump(minimizeSize); } @Override http://git-wip-us.apache.org/repos/asf/sentry/blob/d9d5b4e4/sentry-hdfs/sentry-hdfs-common/src/main/resources/sentry_hdfs_service.thrift ---------------------------------------------------------------------- diff --git a/sentry-hdfs/sentry-hdfs-common/src/main/resources/sentry_hdfs_service.thrift b/sentry-hdfs/sentry-hdfs-common/src/main/resources/sentry_hdfs_service.thrift index d01841b..7e9eb25 100644 --- a/sentry-hdfs/sentry-hdfs-common/src/main/resources/sentry_hdfs_service.thrift +++ b/sentry-hdfs/sentry-hdfs-common/src/main/resources/sentry_hdfs_service.thrift @@ -58,6 +58,7 @@ struct TPathEntry { struct TPathsDump { 1: required i32 rootId; 2: required map<i32,TPathEntry> nodeMap; +3: optional list<string> dupStringValues; } struct TPathsUpdate { http://git-wip-us.apache.org/repos/asf/sentry/blob/d9d5b4e4/sentry-hdfs/sentry-hdfs-common/src/test/java/org/apache/sentry/hdfs/TestHMSPathsFullDump.java ---------------------------------------------------------------------- diff --git a/sentry-hdfs/sentry-hdfs-common/src/test/java/org/apache/sentry/hdfs/TestHMSPathsFullDump.java b/sentry-hdfs/sentry-hdfs-common/src/test/java/org/apache/sentry/hdfs/TestHMSPathsFullDump.java index 194ffb7..6a4e32f 100644 --- a/sentry-hdfs/sentry-hdfs-common/src/test/java/org/apache/sentry/hdfs/TestHMSPathsFullDump.java +++ b/sentry-hdfs/sentry-hdfs-common/src/test/java/org/apache/sentry/hdfs/TestHMSPathsFullDump.java @@ -32,12 +32,13 @@ import com.google.common.collect.Lists; import java.util.Arrays; import java.util.HashSet; import java.io.IOException; +import java.util.List; public class TestHMSPathsFullDump { private static boolean useCompact = true; @Test - public void testDumpAndInitialize() { + public void testDumpAndInitialize() throws Exception { HMSPaths hmsPaths = new HMSPaths(new String[] {"/user/hive/warehouse", "/user/hive/w2"}); hmsPaths._addAuthzObject("default", Lists.newArrayList("/user/hive/warehouse")); hmsPaths._addAuthzObject("db1", Lists.newArrayList("/user/hive/warehouse/db1")); @@ -45,7 +46,11 @@ public class TestHMSPathsFullDump { hmsPaths._addPathsToAuthzObject("db1.tbl11", Lists.newArrayList( "/user/hive/warehouse/db1/tbl11/part111", "/user/hive/warehouse/db1/tbl11/part112", - "/user/hive/warehouse/db1/tbl11/p1=1/p2=x")); + "/user/hive/warehouse/db1/tbl11/p1=1/p2=x", + "/user/hive/warehouse/db1/tbl11/part_duplicate1", + "/user/hive/warehouse/db1/tbl11/part_duplicate1/part_duplicate2", + "/user/hive/warehouse/db1/tbl11/part_duplicate2", + "/user/hive/warehouse/db1/tbl11/part_duplicate2/part_duplicate1")); // Not in Deserialized objects prefix paths hmsPaths._addAuthzObject("db2", Lists.newArrayList("/user/hive/w2/db2")); @@ -63,7 +68,34 @@ public class TestHMSPathsFullDump { Assert.assertEquals(new HashSet<String>(Arrays.asList("db2.tbl21")), hmsPaths.findAuthzObject(new String[]{"user", "hive", "w2", "db2", "tbl21", "p1=1"}, true)); HMSPathsDumper serDe = hmsPaths.getPathsDump(); - TPathsDump pathsDump = serDe.createPathsDump(); + TPathsDump pathsDump = serDe.createPathsDump(true); + + Assert.assertTrue(pathsDump.isSetDupStringValues()); + List<String> dupStringValues = pathsDump.getDupStringValues(); + Assert.assertEquals(2, dupStringValues.size()); + Assert.assertTrue(dupStringValues.contains("part_duplicate1")); + Assert.assertTrue(dupStringValues.contains("part_duplicate2")); + + checkDeserializedHmsPaths(pathsDump); + + TProtocolFactory protoFactory = getTProtocolFactory(); + byte[] serMinimized = new TSerializer(protoFactory).serialize(pathsDump); + + // Check that when no message size minimization is requested, everything still works + serDe = hmsPaths.getPathsDump(); + pathsDump = serDe.createPathsDump(false); + Assert.assertFalse(pathsDump.isSetDupStringValues()); + + checkDeserializedHmsPaths(pathsDump); + + byte[] serNormal = new TSerializer(protoFactory).serialize(pathsDump); + + System.out.println("minimized length = " + serMinimized.length + + ", normal length = " + serNormal.length); + Assert.assertTrue(serMinimized.length < serNormal.length); + } + + private void checkDeserializedHmsPaths(TPathsDump pathsDump) { HMSPaths hmsPaths2 = new HMSPaths(new String[] {"/user/hive/warehouse"}).getPathsDump().initializeFromDump(pathsDump); Assert.assertEquals(new HashSet<String>(Arrays.asList("default")), hmsPaths2.findAuthzObject(new String[]{"user", "hive", "warehouse"}, false)); @@ -71,6 +103,10 @@ public class TestHMSPathsFullDump { Assert.assertEquals(new HashSet<String>(Arrays.asList("db1.tbl11")), hmsPaths2.findAuthzObject(new String[]{"user", "hive", "warehouse", "db1", "tbl11"}, false)); Assert.assertEquals(new HashSet<String>(Arrays.asList("db1.tbl11")), hmsPaths2.findAuthzObject(new String[]{"user", "hive", "warehouse", "db1", "tbl11", "part111"}, false)); Assert.assertEquals(new HashSet<String>(Arrays.asList("db1.tbl11")), hmsPaths2.findAuthzObject(new String[]{"user", "hive", "warehouse", "db1", "tbl11", "part112"}, false)); + Assert.assertEquals(new HashSet<String>(Arrays.asList("db1.tbl11")), hmsPaths2.findAuthzObject(new String[]{"user", "hive", "warehouse", "db1", "tbl11", "part_duplicate1"}, false)); + Assert.assertEquals(new HashSet<String>(Arrays.asList("db1.tbl11")), hmsPaths2.findAuthzObject(new String[]{"user", "hive", "warehouse", "db1", "tbl11", "part_duplicate1", "part_duplicate2"}, false)); + Assert.assertEquals(new HashSet<String>(Arrays.asList("db1.tbl11")), hmsPaths2.findAuthzObject(new String[]{"user", "hive", "warehouse", "db1", "tbl11", "part_duplicate2"}, false)); + Assert.assertEquals(new HashSet<String>(Arrays.asList("db1.tbl11")), hmsPaths2.findAuthzObject(new String[]{"user", "hive", "warehouse", "db1", "tbl11", "part_duplicate2", "part_duplicate1"}, false)); // This path is not under prefix, so should not be deserialized.. Assert.assertNull(hmsPaths2.findAuthzObject(new String[]{"user", "hive", "w2", "db2", "tbl21", "p1=1"}, true)); @@ -80,14 +116,9 @@ public class TestHMSPathsFullDump { public void testThrftSerialization() throws TException { HMSPathsDumper serDe = genHMSPathsDumper(); long t1 = System.currentTimeMillis(); - TPathsDump pathsDump = serDe.createPathsDump(); - - TProtocolFactory protoFactory = useCompact ? new TCompactProtocol.Factory( - ServiceConstants.ClientConfig.SENTRY_HDFS_THRIFT_MAX_MESSAGE_SIZE_DEFAULT, - ServiceConstants.ClientConfig.SENTRY_HDFS_THRIFT_MAX_MESSAGE_SIZE_DEFAULT) - : new TBinaryProtocol.Factory(true, true, - ServiceConstants.ClientConfig.SENTRY_HDFS_THRIFT_MAX_MESSAGE_SIZE_DEFAULT, - ServiceConstants.ClientConfig.SENTRY_HDFS_THRIFT_MAX_MESSAGE_SIZE_DEFAULT); + TPathsDump pathsDump = serDe.createPathsDump(true); + + TProtocolFactory protoFactory = getTProtocolFactory(); byte[] ser = new TSerializer(protoFactory).serialize(pathsDump); long serTime = System.currentTimeMillis() - t1; System.out.println("Serialization Time: " + serTime + ", " + ser.length); @@ -107,7 +138,7 @@ public class TestHMSPathsFullDump { @Test public void testThriftSerializerWithInvalidMsgSize() throws TException, IOException { HMSPathsDumper serDe = genHMSPathsDumper(); - TPathsDump pathsDump = serDe.createPathsDump(); + TPathsDump pathsDump = serDe.createPathsDump(true); byte[] ser =ThriftSerializer.serialize(pathsDump); boolean exceptionThrown = false; @@ -152,4 +183,12 @@ public class TestHMSPathsFullDump { return hmsPaths.getPathsDump(); } + private TProtocolFactory getTProtocolFactory() { + return useCompact ? new TCompactProtocol.Factory( + ServiceConstants.ClientConfig.SENTRY_HDFS_THRIFT_MAX_MESSAGE_SIZE_DEFAULT, + ServiceConstants.ClientConfig.SENTRY_HDFS_THRIFT_MAX_MESSAGE_SIZE_DEFAULT) + : new TBinaryProtocol.Factory(true, true, + ServiceConstants.ClientConfig.SENTRY_HDFS_THRIFT_MAX_MESSAGE_SIZE_DEFAULT, + ServiceConstants.ClientConfig.SENTRY_HDFS_THRIFT_MAX_MESSAGE_SIZE_DEFAULT); + } } http://git-wip-us.apache.org/repos/asf/sentry/blob/d9d5b4e4/sentry-hdfs/sentry-hdfs-common/src/test/java/org/apache/sentry/hdfs/TestUpdateableAuthzPaths.java ---------------------------------------------------------------------- diff --git a/sentry-hdfs/sentry-hdfs-common/src/test/java/org/apache/sentry/hdfs/TestUpdateableAuthzPaths.java b/sentry-hdfs/sentry-hdfs-common/src/test/java/org/apache/sentry/hdfs/TestUpdateableAuthzPaths.java index e643e01..ee809d8 100644 --- a/sentry-hdfs/sentry-hdfs-common/src/test/java/org/apache/sentry/hdfs/TestUpdateableAuthzPaths.java +++ b/sentry-hdfs/sentry-hdfs-common/src/test/java/org/apache/sentry/hdfs/TestUpdateableAuthzPaths.java @@ -38,7 +38,7 @@ public class TestUpdateableAuthzPaths { UpdateableAuthzPaths authzPaths = new UpdateableAuthzPaths(hmsPaths); PathsUpdate update = new PathsUpdate(1, true); - update.toThrift().setPathsDump(authzPaths.getPathsDump().createPathsDump()); + update.toThrift().setPathsDump(authzPaths.getPathsDump().createPathsDump(true)); UpdateableAuthzPaths authzPaths2 = new UpdateableAuthzPaths(new String[] {"/"}); UpdateableAuthzPaths pre = authzPaths2.updateFull(update); @@ -53,7 +53,7 @@ public class TestUpdateableAuthzPaths { // Ensure Full Update wipes old stuff UpdateableAuthzPaths authzPaths3 = new UpdateableAuthzPaths(createBaseHMSPaths(2, 1)); update = new PathsUpdate(2, true); - update.toThrift().setPathsDump(authzPaths3.getPathsDump().createPathsDump()); + update.toThrift().setPathsDump(authzPaths3.getPathsDump().createPathsDump(true)); pre = authzPaths2.updateFull(update); assertFalse(pre == authzPaths2); authzPaths2 = pre;
