paul-rogers commented on a change in pull request #2026: DRILL-7330: Implement
metadata usage for all format plugins
URL: https://github.com/apache/drill/pull/2026#discussion_r392607366
##########
File path:
exec/java-exec/src/main/java/org/apache/drill/exec/metastore/store/MetastoreFileTableMetadataProvider.java
##########
@@ -291,124 +250,74 @@ public boolean checkMetadataVersion() {
}
}
- private ParquetTableMetadataProvider getFallbackTableMetadataProvider()
throws IOException {
- if (fallback == null) {
- fallback = fallbackBuilder == null ? null : fallbackBuilder.build();
- }
- return fallback;
- }
+ public static class Builder<T extends Builder<T>> implements
FileTableMetadataProviderBuilder<T> {
+ protected final MetastoreMetadataProviderManager metadataProviderManager;
- private void throwIfChanged() {
- if (basicTablesRequests.hasMetastoreTableInfoChanged(metastoreTableInfo)) {
- throw
MetadataException.of(MetadataException.MetadataExceptionType.INCONSISTENT_METADATA);
- }
- }
+ // builder for fallback ParquetFileTableMetadataProvider
+ // for the case when required metadata is absent in Metastore
+ protected final TableMetadataProviderBuilder fallback;
- public static class Builder implements
ParquetFileTableMetadataProviderBuilder {
- private final MetastoreMetadataProviderManager metadataProviderManager;
+ protected TupleMetadata schema;
- private List<ReadEntryWithPath> entries;
- private DrillFileSystem fs;
- private TupleMetadata schema;
+ protected List<String> paths;
private FileSelection selection;
- // builder for fallback ParquetFileTableMetadataProvider
- // for the case when required metadata is absent in Metastore
- private final ParquetFileTableMetadataProviderBuilder fallback;
+ private DrillFileSystem fs;
public Builder(MetastoreMetadataProviderManager source) {
- this.metadataProviderManager = source;
- this.fallback = new
ParquetTableMetadataProviderImpl.Builder(FileSystemMetadataProviderManager.init());
+ this(source, new
SimpleFileTableMetadataProvider.Builder(FileSystemMetadataProviderManager.init()));
}
- @Override
- public ParquetFileTableMetadataProviderBuilder
withEntries(List<ReadEntryWithPath> entries) {
- this.entries = entries;
- fallback.withEntries(entries);
- return this;
+ protected Builder(MetastoreMetadataProviderManager source,
TableMetadataProviderBuilder fallback) {
+ this.metadataProviderManager = source;
+ this.fallback = fallback;
}
@Override
- public ParquetFileTableMetadataProviderBuilder withSelectionRoot(Path
selectionRoot) {
- fallback.withSelectionRoot(selectionRoot);
- return this;
+ public T withSchema(TupleMetadata schema) {
+ this.schema = schema;
+ return self();
}
- @Override
- public ParquetFileTableMetadataProviderBuilder withCacheFileRoot(Path
cacheFileRoot) {
- fallback.withCacheFileRoot(cacheFileRoot);
- return this;
+ public T withSelection(FileSelection selection) {
+ this.selection = selection;
+ return self();
}
- @Override
- public ParquetFileTableMetadataProviderBuilder
withReaderConfig(ParquetReaderConfig readerConfig) {
- fallback.withReaderConfig(readerConfig);
- return this;
+ public T withFileSystem(DrillFileSystem fs) {
+ this.fs = fs;
+ return self();
}
- @Override
- public ParquetFileTableMetadataProviderBuilder
withFileSystem(DrillFileSystem fs) {
- fallback.withFileSystem(fs);
- this.fs = fs;
- return this;
+ protected T self() {
+ return (T) this;
}
- @Override
- public ParquetFileTableMetadataProviderBuilder
withCorrectCorruptedDates(boolean autoCorrectCorruptedDates) {
- fallback.withCorrectCorruptedDates(autoCorrectCorruptedDates);
- return this;
+ public MetastoreMetadataProviderManager metadataProviderManager() {
+ return metadataProviderManager;
}
- @Override
- public ParquetFileTableMetadataProviderBuilder withSelection(FileSelection
selection) {
- fallback.withSelection(selection);
- this.selection = selection;
- return this;
+ public FileSelection selection() {
+ return selection;
}
- @Override
- public ParquetFileTableMetadataProviderBuilder withSchema(TupleMetadata
schema) {
- fallback.withSchema(schema);
- this.schema = schema;
- return this;
+ public DrillFileSystem fs() {
+ return fs;
}
@Override
- public ParquetTableMetadataProvider build() throws IOException {
- MetastoreParquetTableMetadataProvider provider;
- SchemaProvider schemaProvider =
metadataProviderManager.getSchemaProvider();
- ParquetMetadataProvider source = (ParquetTableMetadataProvider)
metadataProviderManager.getTableMetadataProvider();
-
- DrillStatsTable statsProvider =
metadataProviderManager.getStatsProvider();
- // schema passed into the builder has greater priority
- try {
- if (this.schema == null) {
- schema = schemaProvider != null ? schemaProvider.read().getSchema()
: null;
- }
- } catch (IOException e) {
- logger.debug("Unable to deserialize schema from schema file for table:
{}", metadataProviderManager.getTableInfo().name(), e);
- }
- if (entries == null) {
- if (!selection.isExpandedFully()) {
- entries = DrillFileSystemUtil.listFiles(fs,
selection.getSelectionRoot(), true).stream()
- .map(fileStatus -> new
ReadEntryWithPath(Path.getPathWithoutSchemeAndAuthority(fileStatus.getPath())))
- .collect(Collectors.toList());
- } else {
- entries = selection.getFiles().stream()
- .map(Path::getPathWithoutSchemeAndAuthority)
- .map(ReadEntryWithPath::new)
- .collect(Collectors.toList());
- }
- }
- provider = new MetastoreParquetTableMetadataProvider(entries,
metadataProviderManager.getMetastoreRegistry(),
- metadataProviderManager.getTableInfo(), schema, fallback,
metadataProviderManager.getConfig(), statsProvider);
- // store results into metadataProviderManager to be able to use them
when creating new instances
- // for the case when source wasn't provided or it contains less row
group metadata than the provider
- if (source == null || source.getRowGroupsMeta().size() <
provider.getRowGroupsMeta().size()) {
- metadataProviderManager.setTableMetadataProvider(provider);
+ public TableMetadataProvider build() throws IOException {
+ if (!selection().isExpandedFully()) {
+ paths = DrillFileSystemUtil.listFiles(fs,
selection.getSelectionRoot(), true).stream()
+ .map(fileStatus ->
Path.getPathWithoutSchemeAndAuthority(fileStatus.getPath()).toUri().getPath())
+ .collect(Collectors.toList());
+ } else {
+ paths = selection.getFiles().stream()
+ .map(path ->
Path.getPathWithoutSchemeAndAuthority(path).toUri().getPath())
+ .collect(Collectors.toList());
Review comment:
Showing my ignorance here, but is this information cached between queries?
Reading this information per file for very large data sets will be expensive
(which is why Impala caches the metadata, which in turn leads to the version
issues discussed above).
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
With regards,
Apache Git Services