siddharthteotia commented on a change in pull request #4747: Data Anonymizer 
Tool
URL: https://github.com/apache/incubator-pinot/pull/4747#discussion_r342816578
 
 

 ##########
 File path: 
pinot-tools/src/main/java/org/apache/pinot/tools/PinotDataAndQueryAnonymizer.java
 ##########
 @@ -0,0 +1,1332 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.pinot.tools;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.base.Preconditions;
+import com.google.common.base.Stopwatch;
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileWriter;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.PrintWriter;
+import java.util.Arrays;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Random;
+import java.util.Set;
+import java.util.concurrent.TimeUnit;
+import org.apache.avro.SchemaBuilder;
+import org.apache.avro.file.DataFileWriter;
+import org.apache.avro.generic.GenericData;
+import org.apache.avro.generic.GenericDatumWriter;
+import org.apache.commons.lang.RandomStringUtils;
+import org.apache.pinot.common.data.DateTimeFieldSpec;
+import org.apache.pinot.common.data.DimensionFieldSpec;
+import org.apache.pinot.common.data.FieldSpec;
+import org.apache.pinot.common.data.MetricFieldSpec;
+import org.apache.pinot.common.data.Schema;
+import org.apache.pinot.common.data.TimeFieldSpec;
+import org.apache.pinot.common.segment.ReadMode;
+import org.apache.pinot.core.data.GenericRow;
+import org.apache.pinot.core.data.readers.PinotSegmentRecordReader;
+import org.apache.pinot.core.indexsegment.immutable.ImmutableSegment;
+import org.apache.pinot.core.indexsegment.immutable.ImmutableSegmentLoader;
+import org.apache.pinot.core.segment.index.ColumnMetadata;
+import org.apache.pinot.core.segment.index.SegmentMetadataImpl;
+import org.apache.pinot.core.segment.index.readers.Dictionary;
+import org.apache.pinot.pql.parsers.Pql2Compiler;
+import org.apache.pinot.pql.parsers.pql2.ast.AstNode;
+import org.apache.pinot.pql.parsers.pql2.ast.BetweenPredicateAstNode;
+import org.apache.pinot.pql.parsers.pql2.ast.BooleanOperatorAstNode;
+import org.apache.pinot.pql.parsers.pql2.ast.ComparisonPredicateAstNode;
+import org.apache.pinot.pql.parsers.pql2.ast.FunctionCallAstNode;
+import org.apache.pinot.pql.parsers.pql2.ast.GroupByAstNode;
+import org.apache.pinot.pql.parsers.pql2.ast.IdentifierAstNode;
+import org.apache.pinot.pql.parsers.pql2.ast.InPredicateAstNode;
+import org.apache.pinot.pql.parsers.pql2.ast.LiteralAstNode;
+import org.apache.pinot.pql.parsers.pql2.ast.OutputColumnAstNode;
+import org.apache.pinot.pql.parsers.pql2.ast.OutputColumnListAstNode;
+import org.apache.pinot.pql.parsers.pql2.ast.PredicateAstNode;
+import org.apache.pinot.pql.parsers.pql2.ast.PredicateListAstNode;
+import org.apache.pinot.pql.parsers.pql2.ast.SelectAstNode;
+import org.apache.pinot.pql.parsers.pql2.ast.StarColumnListAstNode;
+import org.apache.pinot.pql.parsers.pql2.ast.StarExpressionAstNode;
+import org.apache.pinot.pql.parsers.pql2.ast.StringLiteralAstNode;
+import org.apache.pinot.pql.parsers.pql2.ast.WhereAstNode;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+
+/**
+ * The goal of this tool is to generate test dataset (as Avro files) with
+ * characteristics similar to a given source dataset. The source dataset is
+ * a set of Pinot segments. The tool can be used in situations where actual
+ * source data isn't allowed to be used for the purpose of testing (regression,
+ * performance, functional, evaluation of other OLAP systems etc).
+ *
+ * The tool understands the characteristics of the given dataset (Pinot 
segments)
+ * and generates corresponding random data while preserving those 
characteristics.
+ * The tool can then also be used to generate queries for the random data.
+ *
+ * So if you have a set of production data which you want to use for testing
+ * but are unable to do so (because of security restrictions etc), then this 
tool
+ * can be used to generate corresponding anonymous data and queries. Users can 
then
+ * use the anonymized dataset (avro files) and generated queries for their 
testing.
+ *
+ * One avro file is generated per input Pinot segment. The tool also 
randomizes the
+ * column names (and table name) so that source schema is not revealed. The 
user is also
+ * allowed to provide a set of columns for which they want the data to be 
retained
+ * as is (not anonymized). User should be careful when choosing these columns. 
Ideally
+ * these should be time (or time related) columns since they don't reveal 
anything and so
+ * it is fine to copy them as is from source segments into Avro files.
+ *
+ * Steps to use this tool are as follows:
+ *
+ * STEP 1 - Download a day’s queries (same day when we downloaded the 
segments) from
+ * Pinot broker query log. You may have to post-process the log to remove some 
noise
+ * and just keep queries
 
 Review comment:
   Sure. I have removed them and will add to pinot readTheDocs in a follow-up 
PR. I have kept the introduction to the tool though.

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


With regards,
Apache Git Services

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to