mcvsubbu commented on a change in pull request #4747: Data Anonymizer Tool
URL: https://github.com/apache/incubator-pinot/pull/4747#discussion_r340759838
########## File path: pinot-tools/src/main/java/org/apache/pinot/tools/admin/command/AnonymizeDataCommand.java ########## @@ -0,0 +1,204 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.tools.admin.command; + +import com.google.common.collect.Lists; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; +import org.apache.pinot.tools.Command; +import org.apache.pinot.tools.PinotDataAndQueryAnonymizer; +import org.kohsuke.args4j.Option; +import org.kohsuke.args4j.spi.StringArrayOptionHandler; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + + +@SuppressWarnings({"FieldCanBeLocal", "unused"}) +public class AnonymizeDataCommand extends AbstractBaseAdminCommand implements Command { + private static final Logger LOGGER = LoggerFactory.getLogger(AnonymizeDataCommand.class); + + @Option(name = "-inputSegmentsDir", metaVar = "<String>", usage = "Absolute path of directory containing Pinot table segments") + private String _inputSegmentsDir; + + @Option(name = "-outputDir", metaVar = "<String>", usage = "Absolute path of directory where generated Avro files and global dictionaries will be written into") + private String _outputDir; 
+ + @Option(name = "-avroFileNamePrefix", metaVar = "<String>", usage = "Generated Avro file name prefix") + private String _avroFileNamePrefix; + + @Option(name = "-generateData", metaVar = "<boolean>", usage = "Should the tool generate data(false by default)") + private boolean _generateData = false; + + @Option(name = "-generateQueries", metaVar = "<boolean>", usage = "Should the tool generate queries(false by default)") + private boolean _generateQueries = false; + + @Option(name = "-tableName", metaVar = "<String>", usage = "Table name to use for generating queries") + private String _tableName; + + @Option(name = "-queryDir", metaVar = "<String>", usage = "Absolute path of directory containing the source query file and where the generated query file will be written into") + private String _queryDir; + + @Option(name = "-queryFileName", metaVar = "<String>", usage = "Query file name in queryDir") + private String _queryFileName; + + @Option(name = "-columnsToRetainDataFor", handler = StringArrayOptionHandler.class, usage = "Set of columns to retain data for (empty by default). 
These should generally be time columns") + private String[] _columnsToRetainDataFor; + + @Option(name = "-extractFilterColumns", metaVar = "<boolean>", usage = "Should the tool first extract filter columns (false by default)") + private boolean _extractFilterColumns = false; + + @Option(name = "-filterColumns", handler = StringArrayOptionHandler.class, usage = "Set of filter columns to build global dictionaries for") + private String[] _columnsParticipatingInFilter; + + @Option(name = "-filterColumnCardinality", handler = StringArrayOptionHandler.class, usage = "filter column cardinalities") + private String[] _filterColumnCardinalities; + + @Option(name = "-help", help = true, aliases = {"-h", "--h", "--help"}, usage = "Print this message") + private boolean _help = false; + + public boolean getHelp() { + return _help; + } + + @Override + public String getName() { + return "AnonymizeData"; + } + + @Override + public boolean execute() + throws Exception { + if (_extractFilterColumns) { + Set<String> filterColumns = PinotDataAndQueryAnonymizer.FilterColumnExtractor.extractColumnsUsedInFilter(_queryDir, _queryFileName); + System.out.println("Columns participating in filter"); + for (String column: filterColumns) { + System.out.println(column); + } + // if the user has asked for extracting filter columns from a query file, then + // we should simply return after doing that since the tool will be run subsequently + // based on the set of filter columns it returns to the user + return true; + } + + Set<String> columnsToRetainDataFor = new HashSet<>(); + if (_columnsToRetainDataFor != null) { + columnsToRetainDataFor.addAll(Lists.newArrayList(_columnsToRetainDataFor)); + } + + Set<String> filterColumns = new HashSet<>(); + if (_columnsParticipatingInFilter != null) { + filterColumns.addAll(Lists.newArrayList(_columnsParticipatingInFilter)); + } + + if (_generateData) { + // It is fine to not specify any set of filter columns (and cardinalities). 
+ // In such case, data generation phase will skip building global dictionaries + // and simply generate random data. Later during query generation, we will assert + // if we encounter a WHERE clause. So user should choose to not specify the set of + // filter columns only if they are interested in generating any arbitrary data + // without taking care of input Pinot segment distribution/cardinality and they + // won't be generating queries later. + // + // However, we check for the condition that if user has specified a set of + // filter columns, the corresponding cardinalities are also there in equal number + if ((_columnsParticipatingInFilter != null && _filterColumnCardinalities == null) || + (_filterColumnCardinalities != null && _columnsParticipatingInFilter == null) || + (_columnsParticipatingInFilter != null && _columnsParticipatingInFilter.length != _filterColumnCardinalities.length)) { + throw new RuntimeException("Please correctly specify the set of filter columns and their corresponding cardinality values"); + } + + Map<String, Integer> filterColumnCardinalityMap = new HashMap<>(); + + if (_columnsParticipatingInFilter != null) { + for (int i = 0; i < _columnsParticipatingInFilter.length; i++) { + String filterColumn = _columnsParticipatingInFilter[i]; + int filterColumnCardinality = Integer.valueOf(_filterColumnCardinalities[i]); + filterColumnCardinalityMap.put(filterColumn, filterColumnCardinality); + } + } + + // generate data + PinotDataAndQueryAnonymizer pinotDataGenerator = new PinotDataAndQueryAnonymizer( + _inputSegmentsDir, + _outputDir, + _avroFileNamePrefix, + filterColumnCardinalityMap, + columnsToRetainDataFor); + // first build global dictionaries + pinotDataGenerator.buildGlobalDictionaries(); + // use global dictionaries to generate Avro files + pinotDataGenerator.generateAvroFiles(); + + return true; + } + + if (_generateQueries) { + // generate queries from prebuilt global dictionaries + PinotDataAndQueryAnonymizer.QueryGenerator 
queryGenerator = new PinotDataAndQueryAnonymizer.QueryGenerator( + _outputDir, _queryDir, _queryFileName, _tableName, filterColumns, columnsToRetainDataFor); + queryGenerator.generateQueries(); + + return true; + } + + throw new RuntimeException( + "One of the options (-extractFilterColumns, -generateDataa, -generateQueries should be true. Please use the -help option to see usage examples"); + } + + @Override + public String description() { + return "Data anonymizer"; Review comment: Please add a one-line description here ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [email protected] With regards, Apache Git Services --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
