This is an automated email from the ASF dual-hosted git repository.

siddteotia pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-pinot.git


The following commit(s) were added to refs/heads/master by this push:
     new a257935  Support no global-dictionary columns in data anonymizer 
(#5071)
a257935 is described below

commit a257935d9a015a82b2dc54cc3a7887740cb67440
Author: Sidd <siddharthteo...@gmail.com>
AuthorDate: Fri Feb 14 17:20:37 2020 -0800

    Support no global-dictionary columns in data anonymizer (#5071)
    
    * Support no global-dictionary columns in data anonymizer
    
    * address review comments
---
 .../anonymizer/PinotDataAndQueryAnonymizer.java    | 52 +++++++++++++++-------
 1 file changed, 36 insertions(+), 16 deletions(-)

diff --git 
a/pinot-tools/src/main/java/org/apache/pinot/tools/anonymizer/PinotDataAndQueryAnonymizer.java
 
b/pinot-tools/src/main/java/org/apache/pinot/tools/anonymizer/PinotDataAndQueryAnonymizer.java
index 891f428..28b5546 100644
--- 
a/pinot-tools/src/main/java/org/apache/pinot/tools/anonymizer/PinotDataAndQueryAnonymizer.java
+++ 
b/pinot-tools/src/main/java/org/apache/pinot/tools/anonymizer/PinotDataAndQueryAnonymizer.java
@@ -45,6 +45,7 @@ import org.apache.avro.file.DataFileWriter;
 import org.apache.avro.generic.GenericData;
 import org.apache.avro.generic.GenericDatumWriter;
 import org.apache.commons.lang.RandomStringUtils;
+import org.apache.pinot.common.segment.SegmentMetadata;
 import org.apache.pinot.pql.parsers.pql2.ast.OrderByAstNode;
 import org.apache.pinot.pql.parsers.pql2.ast.OrderByExpressionAstNode;
 import org.apache.pinot.spi.data.DateTimeFieldSpec;
@@ -257,17 +258,20 @@ public class PinotDataAndQueryAnonymizer {
    */
 
   public void buildGlobalDictionaries() throws Exception {
+    File segmentParentDirectory = new File(_segmentDir);
+    _segmentDirectories = segmentParentDirectory.list();
+    _numFilesToGenerate = _segmentDirectories.length;
+    LOGGER.info("Total number of segments: " + _numFilesToGenerate);
+
     if (_globalDictionaryColumns.isEmpty()) {
-      LOGGER.info("Set of global dictionary columns is empty");
+      LOGGER.info("Set of global dictionary columns is empty. Not building 
global dictionaries");
+      getSchemaFromFirstSegment(_segmentDir +"/" + _segmentDirectories[0]);
+      writeColumnMapping();
       return;
     }
 
     _timeToBuildDictionaries.start();
 
-    File segmentParentDirectory = new File(_segmentDir);
-    _segmentDirectories = segmentParentDirectory.list();
-    _numFilesToGenerate = _segmentDirectories.length;
-
     // STEP 1 for building global dictionary
     for (String segmentDirectory : _segmentDirectories) {
       readDictionariesFromSegment(_segmentDir + "/" + segmentDirectory);
@@ -287,22 +291,34 @@ public class PinotDataAndQueryAnonymizer {
     LOGGER.info("Finished building global dictionaries. Time taken: {}secs", 
_timeToBuildDictionaries.elapsed(TimeUnit.SECONDS));
   }
 
-
-  /**
-   * Read dictionaries from a single segment
-   * @param segmentDirectory segment index directory
-   * @throws Exception
-   */
-  private void readDictionariesFromSegment(String segmentDirectory) throws 
Exception {
+  private void getSchemaFromFirstSegment(String segmentDirectory) throws 
Exception {
+    LOGGER.info("Reading metadata from segment: " + segmentDirectory);
     File segmentIndexDir = new File(segmentDirectory);
     SegmentMetadataImpl segmentMetadata = new 
SegmentMetadataImpl(segmentIndexDir);
+    pinotToAvroSchema(segmentMetadata);
+  }
 
+  private void pinotToAvroSchema(SegmentMetadata segmentMetadata) {
     if (_pinotSchema == null) {
       // only do this for first segment
       _pinotSchema = segmentMetadata.getSchema();
       anonymizeColumnNames(_pinotSchema);
       _avroSchema = getAvroSchemaFromPinotSchema(_pinotSchema);
+      LOGGER.info("Pinot schema: " + _pinotSchema.toPrettyJsonString());
+      LOGGER.info("Avro schema: " + _avroSchema.toString(true));
     }
+  }
+
+
+  /**
+   * Read dictionaries from a single segment
+   * @param segmentDirectory segment index directory
+   * @throws Exception
+   */
+  private void readDictionariesFromSegment(String segmentDirectory) throws 
Exception {
+    File segmentIndexDir = new File(segmentDirectory);
+    SegmentMetadataImpl segmentMetadata = new 
SegmentMetadataImpl(segmentIndexDir);
+    pinotToAvroSchema(segmentMetadata);
 
     // read dictionaries from segment and build equivalent dictionary of 
random values
     ImmutableSegment immutableSegment = 
ImmutableSegmentLoader.load(segmentIndexDir, ReadMode.mmap);
@@ -339,6 +355,14 @@ public class PinotDataAndQueryAnonymizer {
     // write column name mapping
     Stopwatch stopwatch = Stopwatch.createUnstarted();
     stopwatch.start();
+    writeColumnMapping();
+    _globalDictionaries.serialize(_outputDir);
+    stopwatch.stop();
+    LOGGER.info("Finished writing global dictionaries and column name mapping 
to disk. Time taken: {}secs. Please see the files in {}",
+        stopwatch.elapsed(TimeUnit.SECONDS), _outputDir);
+  }
+
+  private void writeColumnMapping() throws Exception {
     PrintWriter columnMappingWriter = new PrintWriter(new BufferedWriter(new 
FileWriter(_outputDir + "/" + COLUMN_MAPPING_FILE_KEY)));
     for (Map.Entry<String, String> entry : 
_origToDerivedColumnsMap.entrySet()) {
       String columnName = entry.getKey();
@@ -346,10 +370,6 @@ public class PinotDataAndQueryAnonymizer {
       columnMappingWriter.println(columnName + COLUMN_MAPPING_SEPARATOR + 
derivedColumnName);
     }
     columnMappingWriter.flush();
-    _globalDictionaries.serialize(_outputDir);
-    stopwatch.stop();
-    LOGGER.info("Finished writing global dictionaries and column name mapping 
to disk. Time taken: {}secs. Please see the files in {}",
-        stopwatch.elapsed(TimeUnit.SECONDS), _outputDir);
   }
 
 


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org
For additional commands, e-mail: commits-h...@pinot.apache.org

Reply via email to