This is an automated email from the ASF dual-hosted git repository.
siddteotia pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-pinot.git
The following commit(s) were added to refs/heads/master by this push:
new a257935 Support no global-dictionary columns in data anonymizer
(#5071)
a257935 is described below
commit a257935d9a015a82b2dc54cc3a7887740cb67440
Author: Sidd <[email protected]>
AuthorDate: Fri Feb 14 17:20:37 2020 -0800
Support no global-dictionary columns in data anonymizer (#5071)
* Support no global-dictionary columns in data anonymizer
* address review comments
---
.../anonymizer/PinotDataAndQueryAnonymizer.java | 52 +++++++++++++++-------
1 file changed, 36 insertions(+), 16 deletions(-)
diff --git
a/pinot-tools/src/main/java/org/apache/pinot/tools/anonymizer/PinotDataAndQueryAnonymizer.java
b/pinot-tools/src/main/java/org/apache/pinot/tools/anonymizer/PinotDataAndQueryAnonymizer.java
index 891f428..28b5546 100644
---
a/pinot-tools/src/main/java/org/apache/pinot/tools/anonymizer/PinotDataAndQueryAnonymizer.java
+++
b/pinot-tools/src/main/java/org/apache/pinot/tools/anonymizer/PinotDataAndQueryAnonymizer.java
@@ -45,6 +45,7 @@ import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.commons.lang.RandomStringUtils;
+import org.apache.pinot.common.segment.SegmentMetadata;
import org.apache.pinot.pql.parsers.pql2.ast.OrderByAstNode;
import org.apache.pinot.pql.parsers.pql2.ast.OrderByExpressionAstNode;
import org.apache.pinot.spi.data.DateTimeFieldSpec;
@@ -257,17 +258,20 @@ public class PinotDataAndQueryAnonymizer {
*/
public void buildGlobalDictionaries() throws Exception {
+ File segmentParentDirectory = new File(_segmentDir);
+ _segmentDirectories = segmentParentDirectory.list();
+ _numFilesToGenerate = _segmentDirectories.length;
+ LOGGER.info("Total number of segments: " + _numFilesToGenerate);
+
if (_globalDictionaryColumns.isEmpty()) {
- LOGGER.info("Set of global dictionary columns is empty");
+ LOGGER.info("Set of global dictionary columns is empty. Not building
global dictionaries");
+ getSchemaFromFirstSegment(_segmentDir +"/" + _segmentDirectories[0]);
+ writeColumnMapping();
return;
}
_timeToBuildDictionaries.start();
- File segmentParentDirectory = new File(_segmentDir);
- _segmentDirectories = segmentParentDirectory.list();
- _numFilesToGenerate = _segmentDirectories.length;
-
// STEP 1 for building global dictionary
for (String segmentDirectory : _segmentDirectories) {
readDictionariesFromSegment(_segmentDir + "/" + segmentDirectory);
@@ -287,22 +291,34 @@ public class PinotDataAndQueryAnonymizer {
LOGGER.info("Finished building global dictionaries. Time taken: {}secs",
_timeToBuildDictionaries.elapsed(TimeUnit.SECONDS));
}
-
- /**
- * Read dictionaries from a single segment
- * @param segmentDirectory segment index directory
- * @throws Exception
- */
- private void readDictionariesFromSegment(String segmentDirectory) throws
Exception {
+ private void getSchemaFromFirstSegment(String segmentDirectory) throws
Exception {
+ LOGGER.info("Reading metadata from segment: " + segmentDirectory);
File segmentIndexDir = new File(segmentDirectory);
SegmentMetadataImpl segmentMetadata = new
SegmentMetadataImpl(segmentIndexDir);
+ pinotToAvroSchema(segmentMetadata);
+ }
+ private void pinotToAvroSchema(SegmentMetadata segmentMetadata) {
if (_pinotSchema == null) {
// only do this for first segment
_pinotSchema = segmentMetadata.getSchema();
anonymizeColumnNames(_pinotSchema);
_avroSchema = getAvroSchemaFromPinotSchema(_pinotSchema);
+ LOGGER.info("Pinot schema: " + _pinotSchema.toPrettyJsonString());
+ LOGGER.info("Avro schema: " + _avroSchema.toString(true));
}
+ }
+
+
+ /**
+ * Read dictionaries from a single segment
+ * @param segmentDirectory segment index directory
+ * @throws Exception
+ */
+ private void readDictionariesFromSegment(String segmentDirectory) throws
Exception {
+ File segmentIndexDir = new File(segmentDirectory);
+ SegmentMetadataImpl segmentMetadata = new
SegmentMetadataImpl(segmentIndexDir);
+ pinotToAvroSchema(segmentMetadata);
// read dictionaries from segment and build equivalent dictionary of
random values
ImmutableSegment immutableSegment =
ImmutableSegmentLoader.load(segmentIndexDir, ReadMode.mmap);
@@ -339,6 +355,14 @@ public class PinotDataAndQueryAnonymizer {
// write column name mapping
Stopwatch stopwatch = Stopwatch.createUnstarted();
stopwatch.start();
+ writeColumnMapping();
+ _globalDictionaries.serialize(_outputDir);
+ stopwatch.stop();
+ LOGGER.info("Finished writing global dictionaries and column name mapping
to disk. Time taken: {}secs. Please see the files in {}",
+ stopwatch.elapsed(TimeUnit.SECONDS), _outputDir);
+ }
+
+ private void writeColumnMapping() throws Exception {
PrintWriter columnMappingWriter = new PrintWriter(new BufferedWriter(new
FileWriter(_outputDir + "/" + COLUMN_MAPPING_FILE_KEY)));
for (Map.Entry<String, String> entry :
_origToDerivedColumnsMap.entrySet()) {
String columnName = entry.getKey();
@@ -346,10 +370,6 @@ public class PinotDataAndQueryAnonymizer {
columnMappingWriter.println(columnName + COLUMN_MAPPING_SEPARATOR +
derivedColumnName);
}
columnMappingWriter.flush();
- _globalDictionaries.serialize(_outputDir);
- stopwatch.stop();
- LOGGER.info("Finished writing global dictionaries and column name mapping
to disk. Time taken: {}secs. Please see the files in {}",
- stopwatch.elapsed(TimeUnit.SECONDS), _outputDir);
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]