vvysotskyi commented on a change in pull request #2026: DRILL-7330: Implement metadata usage for all format plugins URL: https://github.com/apache/drill/pull/2026#discussion_r392715737
########## File path: exec/java-exec/src/main/java/org/apache/drill/exec/metastore/store/MetastoreFileTableMetadataProvider.java ########## @@ -15,149 +15,108 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.drill.exec.metastore; +package org.apache.drill.exec.metastore.store; import org.apache.drill.common.expression.SchemaPath; import org.apache.drill.exec.exception.MetadataException; -import org.apache.drill.exec.metastore.MetastoreMetadataProviderManager.MetastoreMetadataProviderConfig; +import org.apache.drill.exec.metastore.MetastoreMetadataProviderManager; import org.apache.drill.exec.planner.common.DrillStatsTable; import org.apache.drill.exec.record.SchemaUtil; import org.apache.drill.exec.record.metadata.TupleMetadata; import org.apache.drill.exec.record.metadata.schema.SchemaProvider; import org.apache.drill.exec.store.dfs.DrillFileSystem; import org.apache.drill.exec.store.dfs.FileSelection; -import org.apache.drill.exec.store.dfs.ReadEntryWithPath; -import org.apache.drill.exec.store.parquet.ParquetFileTableMetadataProviderBuilder; -import org.apache.drill.exec.store.parquet.ParquetReaderConfig; -import org.apache.drill.exec.store.parquet.ParquetTableMetadataProviderImpl; import org.apache.drill.exec.store.parquet.ParquetTableMetadataUtils; import org.apache.drill.exec.util.DrillFileSystemUtil; -import org.apache.drill.metastore.MetastoreRegistry; import org.apache.drill.metastore.components.tables.BasicTablesRequests; import org.apache.drill.metastore.components.tables.MetastoreTableInfo; import org.apache.drill.metastore.metadata.BaseTableMetadata; import org.apache.drill.metastore.metadata.FileMetadata; import org.apache.drill.metastore.metadata.NonInterestingColumnsMetadata; import org.apache.drill.metastore.metadata.PartitionMetadata; -import org.apache.drill.metastore.metadata.RowGroupMetadata; import org.apache.drill.metastore.metadata.SegmentMetadata; import 
org.apache.drill.metastore.metadata.TableInfo; import org.apache.drill.metastore.metadata.TableMetadata; +import org.apache.drill.metastore.metadata.TableMetadataProvider; +import org.apache.drill.metastore.metadata.TableMetadataProviderBuilder; import org.apache.drill.metastore.statistics.ColumnStatistics; import org.apache.drill.metastore.statistics.ColumnStatisticsKind; import org.apache.drill.metastore.statistics.Statistic; import org.apache.drill.metastore.statistics.StatisticsHolder; import org.apache.drill.metastore.util.SchemaPathUtils; -import org.apache.drill.shaded.guava.com.google.common.collect.LinkedListMultimap; -import org.apache.drill.shaded.guava.com.google.common.collect.Multimap; import org.apache.hadoop.fs.Path; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; -import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.Map; -import java.util.Set; import java.util.function.Function; import java.util.stream.Collectors; -public class MetastoreParquetTableMetadataProvider implements ParquetTableMetadataProvider { - private static final Logger logger = LoggerFactory.getLogger(MetastoreParquetTableMetadataProvider.class); - - private final BasicTablesRequests basicTablesRequests; - private final TableInfo tableInfo; - private final MetastoreTableInfo metastoreTableInfo; - private final TupleMetadata schema; - private final List<ReadEntryWithPath> entries; - private final List<String> paths; - private final DrillStatsTable statsProvider; - - private final boolean useSchema; - private final boolean useStatistics; - private final boolean fallbackToFileMetadata; - - private BaseTableMetadata tableMetadata; - private Map<Path, SegmentMetadata> segmentsMetadata; - private List<PartitionMetadata> partitions; - private Map<Path, FileMetadata> files; - private Multimap<Path, RowGroupMetadata> rowGroups; - private NonInterestingColumnsMetadata nonInterestingColumnsMetadata; - // stores 
builder to provide lazy init for fallback ParquetTableMetadataProvider - private final ParquetFileTableMetadataProviderBuilder fallbackBuilder; - private ParquetTableMetadataProvider fallback; - - private MetastoreParquetTableMetadataProvider(List<ReadEntryWithPath> entries, - MetastoreRegistry metastoreRegistry, TableInfo tableInfo, TupleMetadata schema, - ParquetFileTableMetadataProviderBuilder fallbackBuilder, MetastoreMetadataProviderConfig config, DrillStatsTable statsProvider) { - this.basicTablesRequests = metastoreRegistry.get().tables().basicRequests(); - this.tableInfo = tableInfo; - this.metastoreTableInfo = basicTablesRequests.metastoreTableInfo(tableInfo); - this.useSchema = config.useSchema(); - this.useStatistics = config.useStatistics(); - this.fallbackToFileMetadata = config.fallbackToFileMetadata(); - this.schema = schema; - this.entries = entries == null ? new ArrayList<>() : entries; - this.fallbackBuilder = fallbackBuilder; - this.statsProvider = statsProvider; - this.paths = this.entries.stream() - .map(readEntryWithPath -> readEntryWithPath.getPath().toUri().getPath()) - .collect(Collectors.toList()); - } - - @Override - public boolean isUsedMetadataCache() { - return false; - } - - @Override - public Path getSelectionRoot() { - return getTableMetadata().getLocation(); - } - - @Override - public List<ReadEntryWithPath> getEntries() { - return entries; - } - - @Override - public List<RowGroupMetadata> getRowGroupsMeta() { - return new ArrayList<>(getRowGroupsMetadataMap().values()); - } +/** + * Implementation of {@link TableMetadataProvider} which uses Drill Metastore for providing table metadata + * for file-based tables. 
+ */ +public class MetastoreFileTableMetadataProvider implements TableMetadataProvider { + private static final Logger logger = LoggerFactory.getLogger(MetastoreFileTableMetadataProvider.class); + + protected final BasicTablesRequests basicTablesRequests; + protected final TableInfo tableInfo; + protected final MetastoreTableInfo metastoreTableInfo; + protected final TupleMetadata schema; + protected final List<String> paths; + protected final DrillStatsTable statsProvider; + protected final TableMetadataProviderBuilder fallbackBuilder; + + protected final boolean useSchema; + protected final boolean useStatistics; + protected final boolean fallbackToFileMetadata; + + protected BaseTableMetadata tableMetadata; + protected Map<Path, SegmentMetadata> segmentsMetadata; + protected List<PartitionMetadata> partitions; + protected Map<Path, FileMetadata> files; Review comment: When there are many files, the user can store metadata for partitions only, so Drill will do the pruning at the partition level. Drill is also able to detect that the metadata has changed, and in that case it starts planning the query with the newer version of the metadata. The number of attempts is configured using the `metastore.retrieval.retry_attempts` option. If the number of attempts is exceeded, the query will be planned without using the metastore. ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services