[jira] [Commented] (DRILL-7271) Refactor Metadata interfaces and classes to contain all needed information for the File based Metastore

ASF GitHub Bot (JIRA) Fri, 21 Jun 2019 06:46:29 -0700


    [ 
https://issues.apache.org/jira/browse/DRILL-7271?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16869487#comment-16869487
 ]


ASF GitHub Bot commented on DRILL-7271:
---------------------------------------

ihuzenko commented on pull request #1810: DRILL-7271: Refactor Metadata 
interfaces and classes to contain all needed information for the File based 
Metastore
URL: https://github.com/apache/drill/pull/1810#discussion_r296229552
 
 

 ##########
 File path: 
exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetTableMetadataUtils.java
 ##########
 @@ -148,112 +142,71 @@ private ParquetTableMetadataUtils() {
   public static RowGroupMetadata 
getRowGroupMetadata(MetadataBase.ParquetTableMetadataBase tableMetadata,
       MetadataBase.RowGroupMetadata rowGroupMetadata, int rgIndexInFile, Path 
location) {
     Map<SchemaPath, ColumnStatistics> columnsStatistics = 
getRowGroupColumnStatistics(tableMetadata, rowGroupMetadata);
-    Map<StatisticsKind, Object> rowGroupStatistics = new HashMap<>();
-    rowGroupStatistics.put(TableStatisticsKind.ROW_COUNT, 
rowGroupMetadata.getRowCount());
-    rowGroupStatistics.put(() -> ExactStatisticsConstants.START, 
rowGroupMetadata.getStart());
-    rowGroupStatistics.put(() -> ExactStatisticsConstants.LENGTH, 
rowGroupMetadata.getLength());
+    List<StatisticsHolder> rowGroupStatistics = new ArrayList<>();
+    rowGroupStatistics.add(new 
StatisticsHolder<>(rowGroupMetadata.getRowCount(), 
TableStatisticsKind.ROW_COUNT));
+    rowGroupStatistics.add(new StatisticsHolder<>(rowGroupMetadata.getStart(), 
new BaseStatisticsKind(ExactStatisticsConstants.START, true)));
+    rowGroupStatistics.add(new 
StatisticsHolder<>(rowGroupMetadata.getLength(), new 
BaseStatisticsKind(ExactStatisticsConstants.LENGTH, true)));
 
     Map<SchemaPath, TypeProtos.MajorType> columns = 
getRowGroupFields(tableMetadata, rowGroupMetadata);
 
     TupleSchema schema = new TupleSchema();
     columns.forEach((schemaPath, majorType) -> 
MetadataUtils.addColumnMetadata(schema, schemaPath, majorType));
 
-    return new RowGroupMetadata(
-        schema, columnsStatistics, rowGroupStatistics, 
rowGroupMetadata.getHostAffinity(), rgIndexInFile, location);
-  }
+    MetadataInfo metadataInfo = new MetadataInfo(MetadataType.ROW_GROUP, 
MetadataInfo.GENERAL_INFO_KEY, null);
 
-  /**
-   * Merges list of specified metadata into the map of {@link 
ColumnStatistics} with columns as keys.
-   *
-   * @param <T>                 type of metadata to collect
-   * @param metadataList        list of metadata to be merged
-   * @param columns             set of columns whose statistics should be 
merged
-   * @param statisticsToCollect kinds of statistics that should be collected
-   * @param parquetTableMetadata ParquetTableMetadata object to fetch the 
non-interesting columns
-   * @return list of merged metadata
-   */
-  @SuppressWarnings("unchecked")
-  public static <T extends BaseMetadata> Map<SchemaPath, ColumnStatistics> 
mergeColumnsStatistics(
-          Collection<T> metadataList, Set<SchemaPath> columns, 
List<CollectableColumnStatisticsKind> statisticsToCollect, 
MetadataBase.ParquetTableMetadataBase parquetTableMetadata) {
-    Map<SchemaPath, ColumnStatistics> columnsStatistics = new HashMap<>();
-
-    for (SchemaPath column : columns) {
-      List<ColumnStatistics> statisticsList = new ArrayList<>();
-      for (T metadata : metadataList) {
-        ColumnStatistics statistics = 
metadata.getColumnsStatistics().get(column);
-        if (statistics == null) {
-          // schema change happened, set statistics which represents all nulls
-          statistics = new ColumnStatisticsImpl(
-              ImmutableMap.of(ColumnStatisticsKind.NULLS_COUNT, 
metadata.getStatistic(TableStatisticsKind.ROW_COUNT)),
-              getNaturalNullsFirstComparator());
-        }
-        statisticsList.add(statistics);
-      }
-      Map<StatisticsKind, Object> statisticsMap = new HashMap<>();
-      for (CollectableColumnStatisticsKind statisticsKind : 
statisticsToCollect) {
-        Object mergedStatistic = 
statisticsKind.mergeStatistics(statisticsList);
-        statisticsMap.put(statisticsKind, mergedStatistic);
-      }
-      columnsStatistics.put(column, new ColumnStatisticsImpl(statisticsMap, 
statisticsList.iterator().next().getValueComparator()));
-    }
-    return columnsStatistics;
+    return new RowGroupMetadata(TableInfo.UNKNOWN_TABLE_INFO, metadataInfo,
+        schema, columnsStatistics, rowGroupStatistics, 
rowGroupMetadata.getHostAffinity(), rgIndexInFile, location);
   }
 
   /**
    * Returns {@link FileMetadata} instance received by merging specified 
{@link RowGroupMetadata} list.
    *
    * @param rowGroups list of {@link RowGroupMetadata} to be merged
-   * @param tableName name of the table
-   * @param parquetTableMetadata the source of column metadata for 
non-interesting column's statistics
    * @return {@link FileMetadata} instance
    */
-  public static FileMetadata getFileMetadata(List<RowGroupMetadata> rowGroups, 
String tableName,
-      MetadataBase.ParquetTableMetadataBase parquetTableMetadata) {
+  public static FileMetadata getFileMetadata(List<RowGroupMetadata> rowGroups) 
{
     if (rowGroups.isEmpty()) {
       return null;
     }
-    Map<StatisticsKind, Object> fileStatistics = new HashMap<>();
-    fileStatistics.put(TableStatisticsKind.ROW_COUNT, 
TableStatisticsKind.ROW_COUNT.mergeStatistics(rowGroups));
+    List<StatisticsHolder> fileStatistics = new ArrayList<>();
+    fileStatistics.add(new 
StatisticsHolder<>(TableStatisticsKind.ROW_COUNT.mergeStatistics(rowGroups), 
TableStatisticsKind.ROW_COUNT));
+
+    RowGroupMetadata rowGroupMetadata = rowGroups.iterator().next();
+    TupleMetadata schema = rowGroupMetadata.getSchema();
+
+    Set<SchemaPath> columns = rowGroupMetadata.getColumnsStatistics().keySet();
 
-    TupleMetadata schema = rowGroups.iterator().next().getSchema();
+    MetadataInfo metadataInfo = new MetadataInfo(MetadataType.FILE, 
MetadataInfo.GENERAL_INFO_KEY, null);
 
-    return new FileMetadata(rowGroups.iterator().next().getLocation(), schema,
-      mergeColumnsStatistics(rowGroups, 
rowGroups.iterator().next().getColumnsStatistics().keySet(), 
PARQUET_STATISTICS, parquetTableMetadata),
-      fileStatistics, tableName, -1);
+    return new FileMetadata(rowGroupMetadata.getTableInfo(), metadataInfo, 
rowGroupMetadata.getPath(), schema,
+        TableMetadataUtils.mergeColumnsStatistics(rowGroups, columns, 
PARQUET_COLUMN_STATISTICS),
+        fileStatistics, BaseTableMetadata.NON_DEFINED_LAST_MODIFIED_TIME);
   }
 
   /**
    * Returns {@link PartitionMetadata} instance received by merging specified 
{@link FileMetadata} list.
    *
    * @param partitionColumn partition column
    * @param files           list of files to be merged
-   * @param tableName       name of the table
    * @return {@link PartitionMetadata} instance
    */
-  public static PartitionMetadata getPartitionMetadata(SchemaPath 
partitionColumn, List<FileMetadata> files, String tableName) {
+  public static PartitionMetadata getPartitionMetadata(SchemaPath 
partitionColumn, List<FileMetadata> files) {
     Set<Path> locations = new HashSet<>();
     Set<SchemaPath> columns = new HashSet<>();
 
     for (FileMetadata file : files) {
       columns.addAll(file.getColumnsStatistics().keySet());
-      locations.add(file.getLocation());
+      locations.add(file.getPath());
     }
 
-    Map<StatisticsKind, Object> partStatistics = new HashMap<>();
-    partStatistics.put(TableStatisticsKind.ROW_COUNT, 
TableStatisticsKind.ROW_COUNT.mergeStatistics(files));
+    FileMetadata fileMetadata = files.iterator().next();
 
-    return new PartitionMetadata(partitionColumn, 
files.iterator().next().getSchema(),
-        mergeColumnsStatistics(files, columns, PARQUET_STATISTICS, null), 
partStatistics, locations, tableName, -1);
-  }
+    MetadataInfo metadataInfo = new MetadataInfo(MetadataType.PARTITION, 
MetadataInfo.GENERAL_INFO_KEY, null);
 
-  /**
-   * Returns "natural order" comparator which threads nulls as min values.
-   *
-   * @param <T> type to compare
-   * @return "natural order" comparator
-   */
-  public static <T extends Comparable<T>> Comparator<T> 
getNaturalNullsFirstComparator() {
-    return Comparator.nullsFirst(Comparator.naturalOrder());
+    return new PartitionMetadata(fileMetadata.getTableInfo(), metadataInfo, 
partitionColumn, fileMetadata.getSchema(),
 
 Review comment:
   Maybe use a builder here?
 
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


> Refactor Metadata interfaces and classes to contain all needed information 
> for the File based Metastore
> -------------------------------------------------------------------------------------------------------
>
>                 Key: DRILL-7271
>                 URL: https://issues.apache.org/jira/browse/DRILL-7271
>             Project: Apache Drill
>          Issue Type: Sub-task
>            Reporter: Arina Ielchiieva
>            Assignee: Volodymyr Vysotskyi
>            Priority: Major
>             Fix For: 1.17.0
>
>
> 1. Merge info from metadataStatistics + statisticsKinds into one holder: 
> Map<String, StatisticsHolder>.
> 2. Rename hasStatistics to hasDescriptiveStatistics
> 3. Remove drill-file-metastore-plugin
> 4. Move  
> org.apache.drill.exec.physical.base.AbstractGroupScanWithMetadata.MetadataLevel
>  to metadata module, rename to MetadataType and add new value: SEGMENT.
> 5. Add JSON ser/de for ColumnStatistics, StatisticsHolder.
> 6. Add new info classes:
> {noformat}
> class TableInfo {
>   String storagePlugin;
>   String workspace;
>   String name;
>   String type;
>   String owner;
> }
> class MetadataInfo {
>   public static final String GENERAL_INFO_KEY = "GENERAL_INFO";
>   public static final String DEFAULT_SEGMENT_KEY = "DEFAULT_SEGMENT";
>   MetadataType type (enum);
>   String key;
>   String identifier;
> }
> {noformat}
> 7. Modify existing metadata classes:
> org.apache.drill.metastore.FileTableMetadata
> {noformat}
> missing fields
> ------------------
> storagePlugin, workspace, tableType -> will be covered by TableInfo class
> metadataType, metadataKey -> will be covered by MetadataInfo class
> interestingColumns
> fields to modify
> ----------------
> private final Map<String, Object> tableStatistics;
> private final Map<String, StatisticsKind> statisticsKinds;
> private final Set<String> partitionKeys; -> Map<String, String>
> {noformat}
> org.apache.drill.metastore.PartitionMetadata
> {noformat}
> missing fields
> ------------------
> storagePlugin, workspace -> will be covered by TableInfo class
> metadataType, metadataKey, metadataIdentifier -> will be covered by 
> MetadataInfo class
> partitionValues (List<String>)
> location (String) (for directory level metadata) - directory location
> fields to modify
> ----------------
> private final Map<String, Object> tableStatistics;
> private final Map<String, StatisticsKind> statisticsKinds;
> private final Set<Path> location; -> locations
> {noformat}
> org.apache.drill.metastore.FileMetadata
> {noformat}
> missing fields
> ------------------
> storagePlugin, workspace -> will be covered by TableInfo class
> metadataType, metadataKey, metadataIdentifier -> will be covered by 
> MetadataInfo class
> path - path to file 
> fields to modify
> ----------------
> private final Map<String, Object> tableStatistics;
> private final Map<String, StatisticsKind> statisticsKinds;
> private final Path location; - should contain directory to which file belongs
> {noformat}
> org.apache.drill.metastore.RowGroupMetadata
> {noformat}
> missing fields
> ------------------
> storagePlugin, workspace -> will be covered by TableInfo class
> metadataType, metadataKey, metadataIdentifier -> will be covered by 
> MetadataInfo class
> path - path to file 
> fields to modify
> ----------------
> private final Map<String, Object> tableStatistics;
> private final Map<String, StatisticsKind> statisticsKinds;
> private final Path location; - should contain directory to which file belongs
> {noformat}
> 8. Remove org.apache.drill.exec package from metastore module.
> 9. Rename ColumnStatisticsImpl class.
> 10. Separate existing classes in org.apache.drill.metastore package into 
> sub-packages.
> 11. Rename FileTableMetadata -> BaseTableMetadata
> 12. Rename TableMetadataProvider.getNonInterestingColumnsMeta() -> 
> getNonInterestingColumnsMetadata()
> 13. Introduce segment-level metadata class:
> {noformat}
> class SegmentMetadata {
>   TableInfo tableInfo;
>   MetadataInfo metadataInfo;
>   SchemaPath column;
>   TupleMetadata schema;
>   String location;
>   Map<SchemaPath, ColumnStatistics> columnsStatistics;
>   Map<String, StatisticsHolder> statistics;
>   List<String> partitionValues;
>   List<String> locations;
>   long lastModifiedTime;
> }
> {noformat}



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

[jira] [Commented] (DRILL-7271) Refactor Metadata interfaces and classes to contain all needed information for the File based Metastore

Reply via email to