siddharthteotia commented on a change in pull request #5061: Improvements to data anonymizer tool URL: https://github.com/apache/incubator-pinot/pull/5061#discussion_r377274727
########## File path: pinot-tools/src/main/java/org/apache/pinot/tools/anonymizer/MapBasedGlobalDictionaries.java ########## @@ -0,0 +1,278 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.tools.anonymizer; + +import java.io.BufferedWriter; +import java.io.FileWriter; +import java.io.PrintWriter; +import java.util.Arrays; +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map; +import java.util.Random; +import java.util.Set; +import java.util.TreeMap; +import org.apache.commons.lang.RandomStringUtils; +import org.apache.pinot.core.segment.index.ColumnMetadata; +import org.apache.pinot.spi.data.FieldSpec; +import org.apache.pinot.spi.utils.ByteArray; + + +public class MapBasedGlobalDictionaries implements GlobalDictionaries { + private final static int INT_BASE_VALUE = 1000; + private final static long LONG_BASE_VALUE = 100000; + private static final float FLOAT_BASE_VALUE = 100.23f; + private static final double DOUBLE_BASE_VALUE = 1000.2375; + + private final Map<String, OrigAndDerivedValueHolder> _columnToGlobalDictionary; + + MapBasedGlobalDictionaries() { + _columnToGlobalDictionary = new HashMap<>(); + } + + /** + * First step towards building global dictionary + * by inserting the original values from segments + * into global dictionary + * @param origValue original value + * @param column column name + * @param columnMetadata column metadata + * @param cardinality total cardinality of column + */ + @Override + public void addOrigValueToGlobalDictionary( + Object origValue, + String column, + ColumnMetadata columnMetadata, + int cardinality) { + FieldSpec.DataType dataType = columnMetadata.getDataType(); + _columnToGlobalDictionary.putIfAbsent(column, new OrigAndDerivedValueHolder(dataType)); + OrigAndDerivedValueHolder origAndDerivedValueHolder = _columnToGlobalDictionary.get(column); + if (dataType == FieldSpec.DataType.BYTES) { + origAndDerivedValueHolder.setOrigValue(new ByteArray((byte[])origValue)); + } else { + origAndDerivedValueHolder.setOrigValue(origValue); + } + } + + /** + * This is the second step where we complete the global dictionaries + * by sorting the original values to get sort order across all segments + */ + @Override + public void sortOriginalValuesInGlobalDictionaries() { + // NO-OP since we use a sorted map so the global dictionary + // is already sorted + } + + /** + * This is the third and final step where we complete the global + * dictionaries by generating values: + * + * For numeric columns, we generate in order since + * we start with a base value. + * For string column, we first generate and then sort + */ + @Override + public void addDerivedValuesToGlobalDictionaries() { + // update global dictionary for each column by adding + // the corresponding generated value for each orig value + for (Map.Entry<String, OrigAndDerivedValueHolder> entry : _columnToGlobalDictionary.entrySet()) { + OrigAndDerivedValueHolder origAndDerivedValueHolder = entry.getValue(); + generateDerivedValuesForGlobalDictionary(origAndDerivedValueHolder); + } + } + + @Override + public void serialize(String outputDir) throws Exception { + // write global dictionary for each column + for (String column : _columnToGlobalDictionary.keySet()) { + PrintWriter dictionaryWriter = new PrintWriter(new BufferedWriter(new FileWriter(outputDir + "/" + column + DICT_FILE_EXTENSION))); + OrigAndDerivedValueHolder origAndDerivedValueHolder = _columnToGlobalDictionary.get(column); + Set<Map.Entry<Object, DerivedValue>> entries = origAndDerivedValueHolder._origAndDerivedValues.entrySet(); + Iterator<Map.Entry<Object, DerivedValue>> sortedIterator = entries.iterator(); + while (sortedIterator.hasNext()) { + Map.Entry<Object, DerivedValue> entry = sortedIterator.next(); + dictionaryWriter.println(entry.getKey()); + dictionaryWriter.println(entry.getValue()._derivedValue); + } + dictionaryWriter.flush(); + } + } + + @Override + public Object getDerivedValueForOrigValueSV(String column, Object origValue) { + OrigAndDerivedValueHolder origAndDerivedValueHolder = _columnToGlobalDictionary.get(column); + TreeMap<Object, DerivedValue> sortedMap = origAndDerivedValueHolder._origAndDerivedValues; + return sortedMap.get(origValue)._derivedValue; + } + + @Override + public Object[] getDerivedValuesForOrigValuesMV(String column, Object[] origMultiValues) { + OrigAndDerivedValueHolder origAndDerivedValueHolder = _columnToGlobalDictionary.get(column); + TreeMap<Object, DerivedValue> sortedMap = origAndDerivedValueHolder._origAndDerivedValues; + int length = origMultiValues.length; + Object[] derivedMultiValues = new Object[length]; + for (int i = 0; i < length; i++) { + derivedMultiValues[i] = sortedMap.get(origMultiValues[i]); + } + return derivedMultiValues; + } + + private static class OrigAndDerivedValueHolder { + FieldSpec.DataType _dataType; + TreeMap<Object, DerivedValue> _origAndDerivedValues; + + OrigAndDerivedValueHolder(FieldSpec.DataType dataType) { + _dataType = dataType; + _origAndDerivedValues = new TreeMap<>(); + } + + void setOrigValue(Object origValue) { + if (!_origAndDerivedValues.containsKey(origValue)) { + _origAndDerivedValues.put(origValue, new DerivedValue()); + } + } + } + + private static class DerivedValue { Review comment: This wrapper is needed so that we can set the derived values in global dictionary in sorted order while iterating over it ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [email protected] With regards, Apache Git Services --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
