sajjad-moradi commented on a change in pull request #6546:
URL: https://github.com/apache/incubator-pinot/pull/6546#discussion_r577250920
##########
File path:
pinot-tools/src/main/java/org/apache/pinot/tools/realtime/provisioning/MemoryEstimator.java
##########
@@ -387,4 +418,182 @@ private long
calculateMemoryForCompletedSegmentsPerPartition(long completedSegme
/**
 * Returns the per-host "number of segments queried" table computed by this estimator.
 * NOTE(review): this returns the internal array without a defensive copy, so callers
 * can mutate the estimator's state — confirm whether that is intended.
 */
public String[][] getNumSegmentsQueriedPerHost() {
return _numSegmentsQueriedPerHost;
}
+
+ private static File generateCompletedSegment(File dataCharacteristicsFile,
File schemaFile, TableConfig tableConfig) {
+ SegmentGenerator segmentGenerator = new
SegmentGenerator(dataCharacteristicsFile, schemaFile, tableConfig, true);
+ return segmentGenerator.generate();
+ }
+
+ /**
+ * This class is used in the Memory Estimator to generate a segment based on
+ the given characteristics of the data
+ */
+ public static class SegmentGenerator {
+ private static final Logger LOGGER =
LoggerFactory.getLogger(SegmentGenerator.class);
+ private static final SimpleDateFormat DATE_FORMAT = new
SimpleDateFormat("yyyy-MM-dd_HH:mm:ss");
+
// File describing the data characteristics (cardinality, value lengths, etc.) to emulate.
private final File _dataCharacteristicsFile;
// Pinot schema file for the table whose segment is being generated.
private final File _schemaFile;
// Table config used when building the segment.
private final TableConfig _tableConfig;
// Whether the intermediate CSV data file should be deleted after segment creation.
private final boolean _deleteCsv;

/**
 * @param dataCharacteristicsFile file containing the data characteristics to emulate
 * @param schemaFile              Pinot schema file
 * @param tableConfig             table config used for segment creation
 * @param deleteCsv               delete the intermediate CSV file after the segment is built
 */
public SegmentGenerator(File dataCharacteristicsFile, File schemaFile, TableConfig tableConfig, boolean deleteCsv) {
  // All inputs are captured as final fields: nothing in this class reassigns them
  // after construction, so make the immutability explicit (previously only
  // _deleteCsv was final).
  _dataCharacteristicsFile = dataCharacteristicsFile;
  _schemaFile = schemaFile;
  _tableConfig = tableConfig;
  _deleteCsv = deleteCsv;
}
+
+ public File generate() {
+ Date now = new Date();
+
+ // extract schema
+ Schema schema;
+ try {
+ schema = JsonUtils.fileToObject(_schemaFile, Schema.class);
+ } catch (Exception e) {
+ throw new RuntimeException(String.format("Cannot read schema file '%s'
to schema object.", _schemaFile));
+ }
+
+ // generate data & creat segment
+ File csvDataFile = generateData(schema, now);
+ File segment = createSegment(csvDataFile, schema, now);
+
+ if (_deleteCsv) {
+ csvDataFile.delete();
+ }
+ return segment;
+ }
+
+ private File generateData(Schema schema, Date now) {
+
+ // extract data characteristics
+ DataCharacteristics dataCharacteristics;
+ try {
+ dataCharacteristics =
JsonUtils.fileToObject(_dataCharacteristicsFile, DataCharacteristics.class);
+ } catch (Exception e) {
+ throw new RuntimeException("Cannot deserialize data characteristic
file " + _dataCharacteristicsFile);
+ }
+
+ // create maps of "column name" to ...
+ Map<String, Integer> lengths = new HashMap<>();
+ Map<String, Double> mvCounts = new HashMap<>();
+ Map<String, Integer> cardinalities = new HashMap<>();
+ Map<String, FieldSpec.DataType> dataTypes = new HashMap<>();
+ Map<String, FieldSpec.FieldType> fieldTypes = new HashMap<>();
+ Map<String, TimeUnit> timeUnits = new HashMap<>();
+ List<String> colNames = new ArrayList<>();
+ dataCharacteristics.columnCharacteristics.forEach(column -> {
+ colNames.add(column.name);
+ lengths.put(column.name, column.averageLength);
+ mvCounts.put(column.name, column.numberOfValuesPerEntry);
+ cardinalities.put(column.name, column.cardinality);
+ });
+ schema.getAllFieldSpecs().forEach(fieldSpec -> {
+ String name = fieldSpec.getName();
+ dataTypes.put(name, fieldSpec.getDataType());
+ fieldTypes.put(name, fieldSpec.getFieldType());
+ });
+ timeUnits.put(schema.getTimeFieldSpec().getName(),
+
schema.getTimeFieldSpec().getIncomingGranularitySpec().getTimeType());
+
+ // generate data
+ String outputDir = getOutputDir(now, "-csv");
+ DataGeneratorSpec spec =
+ new DataGeneratorSpec(colNames, cardinalities, new HashMap<>(), new
HashMap<>(), mvCounts, lengths, dataTypes,
+ fieldTypes, timeUnits, FileFormat.CSV, outputDir, true);
+ DataGenerator dataGenerator = new DataGenerator();
+ try {
+ dataGenerator.init(spec);
+ dataGenerator.generateCsv(dataCharacteristics.numberOfRows, 1);
+ File outputFile = Paths.get(outputDir, "output_0.csv").toFile();
+ LOGGER.info("Successfully generated data file: {}", outputFile);
+ return outputFile;
+ } catch (Exception e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ private File createSegment(File csvDataFile, Schema schema, Date now) {
+
+ // create segment
+ LOGGER.info("Started creating segment from file: {}", csvDataFile);
+ String outDir = getOutputDir(now, "-segment");
+ SegmentGeneratorConfig segmentGeneratorConfig =
getSegmentGeneratorConfig(csvDataFile, schema, outDir);
+ SegmentIndexCreationDriver driver = new SegmentIndexCreationDriverImpl();
+ try {
+ driver.init(segmentGeneratorConfig);
+ driver.build();
+ } catch (Exception e) {
+ throw new RuntimeException("Caught exception while generating segment
from file: " + csvDataFile, e);
+ }
+ String segmentName = driver.getSegmentName();
+ File indexDir = new File(outDir, segmentName);
+ LOGGER.info("Successfully created segment: {} at directory: {}",
segmentName, indexDir);
+
+ // verify segment
+ LOGGER.info("Verifying the segment by loading it");
+ ImmutableSegment segment;
+ try {
+ segment = ImmutableSegmentLoader.load(indexDir, ReadMode.mmap);
+ } catch (Exception e) {
+ throw new RuntimeException("Caught exception while verifying the
created segment", e);
+ }
+ LOGGER.info("Successfully loaded segment: {} of size: {} bytes",
segmentName,
+ segment.getSegmentSizeBytes());
+ segment.destroy();
+
+ return indexDir;
+ }
+
+ private SegmentGeneratorConfig getSegmentGeneratorConfig(File csvDataFile,
Schema schema, String outDir) {
+ SegmentGeneratorConfig segmentGeneratorConfig = new
SegmentGeneratorConfig(_tableConfig, schema);
+ segmentGeneratorConfig.setInputFilePath(csvDataFile.getPath());
+ segmentGeneratorConfig.setFormat(FileFormat.CSV);
+ segmentGeneratorConfig.setOutDir(outDir);
+ segmentGeneratorConfig.setReaderConfig(new CSVRecordReaderConfig()); //
FIXME
Review comment:
   It was there for my own book-keeping and slipped through without getting
cleaned up before I sent the PR. It's removed now.
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]