rbalamohan commented on a change in pull request #1645:
URL: https://github.com/apache/hive/pull/1645#discussion_r522558601
##########
File path:
ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/NullScanTaskDispatcher.java
##########
@@ -93,94 +91,109 @@ private String getAliasForTableScanOperator(MapWork work,
return null;
}
- private PartitionDesc changePartitionToMetadataOnly(PartitionDesc desc,
- Path path) {
- if (desc == null) {
- return null;
- }
- FileStatus[] filesFoundInPartitionDir = null;
+ private void lookupAndProcessPath(MapWork work, Path path,
+ Collection<String> aliasesToOptimize) {
try {
- filesFoundInPartitionDir =
Utilities.listNonHiddenFileStatus(physicalContext.getConf(), path);
+ boolean isEmpty =
Utilities.listNonHiddenFileStatus(physicalContext.getConf(), path).length == 0;
+ processPath(work, path, aliasesToOptimize, isEmpty);
} catch (IOException e) {
- LOG.error("Cannot determine if the table is empty", e);
- }
- if (!isMetadataOnlyAllowed(filesFoundInPartitionDir)) {
- return desc;
+ LOG.warn("Could not determine if path {} was empty." +
+ "Cannot use null scan optimization for this path.", path, e);
}
-
- boolean isEmpty = filesFoundInPartitionDir == null ||
filesFoundInPartitionDir.length == 0;
- desc.setInputFileFormatClass(isEmpty ? ZeroRowsInputFormat.class :
OneNullRowInputFormat.class);
- desc.setOutputFileFormatClass(HiveIgnoreKeyTextOutputFormat.class);
- desc.getProperties().setProperty(serdeConstants.SERIALIZATION_LIB,
- NullStructSerDe.class.getName());
- return desc;
}
- private boolean isMetadataOnlyAllowed(FileStatus[] filesFoundInPartitionDir)
{
- if (filesFoundInPartitionDir == null || filesFoundInPartitionDir.length ==
0) {
- return true; // empty folders are safe to convert to metadata-only
- }
- for (FileStatus f : filesFoundInPartitionDir) {
- if (AcidUtils.isDeleteDelta(f.getPath())) {
- /*
- * as described in HIVE-23712, an acid partition is not a safe subject
of metadata-only
- * optimization, because there is a chance that it contains no data
but contains folders
- * (e.g: delta_0000002_0000002_0000,
delete_delta_0000003_0000003_0000), without scanning
- * the underlying file contents, we cannot tell whether this partition
contains data or not
- */
- return false;
- }
+ private void processPath(MapWork work, Path path, Collection<String>
aliasesToOptimize,
+ boolean isEmpty) {
+ PartitionDesc partDesc = work.getPathToPartitionInfo().get(path).clone();
+ partDesc.setInputFileFormatClass(isEmpty ? ZeroRowsInputFormat.class :
OneNullRowInputFormat.class);
+ partDesc.setOutputFileFormatClass(HiveIgnoreKeyTextOutputFormat.class);
+ partDesc.getProperties().setProperty(serdeConstants.SERIALIZATION_LIB,
+ NullStructSerDe.class.getName());
+ Path fakePath =
+ new Path(NullScanFileSystem.getBase() + partDesc.getTableName()
+ + "/part" + encode(partDesc.getPartSpec()));
+ StringInternUtils.internUriStringsInPath(fakePath);
+ work.addPathToPartitionInfo(fakePath, partDesc);
+ work.addPathToAlias(fakePath, new ArrayList<>(aliasesToOptimize));
+ Collection<String> aliasesContainingPath =
work.getPathToAliases().get(path);
+ aliasesContainingPath.removeAll(aliasesToOptimize);
+ if (aliasesContainingPath.isEmpty()) {
+ work.removePathToAlias(path);
+ work.removePathToPartitionInfo(path);
}
- return true;
}
- private void processAlias(MapWork work, Path path,
- Collection<String> aliasesAffected, Set<String> aliases) {
- // the aliases that are allowed to map to a null scan.
- Collection<String> allowed = aliasesAffected.stream()
- .filter(a -> aliases.contains(a)).collect(Collectors.toList());
- if (!allowed.isEmpty()) {
- PartitionDesc partDesc = work.getPathToPartitionInfo().get(path).clone();
- PartitionDesc newPartition =
- changePartitionToMetadataOnly(partDesc, path);
- // Prefix partition with something to avoid it being a hidden file.
- Path fakePath =
- new Path(NullScanFileSystem.getBase() + newPartition.getTableName()
- + "/part" + encode(newPartition.getPartSpec()));
- StringInternUtils.internUriStringsInPath(fakePath);
- work.addPathToPartitionInfo(fakePath, newPartition);
- work.addPathToAlias(fakePath, new ArrayList<>(allowed));
- aliasesAffected.removeAll(allowed);
- if (aliasesAffected.isEmpty()) {
- work.removePathToAlias(path);
- work.removePathToPartitionInfo(path);
- }
- }
+ private void addCandidatePath(Map<Path, Collection<String>> aliasesForPath,
Path path, String alias) {
+ Collection<String> aliases = aliasesForPath.computeIfAbsent(path, k -> new
ArrayList<>());
+ aliases.add(alias);
}
- private void processAlias(MapWork work, Set<TableScanOperator> tableScans) {
- Set<String> aliases = new HashSet<>();
+ private void processTableScans(MapWork work, Set<TableScanOperator>
tableScans) {
+ Map<Path, Boolean> managedEmptyPathMap = new HashMap<>();
+ Map<Path, Collection<String>> candidatePathsToAliases = new HashMap<>();
+ Map<String, Boolean> aliasTypeMap = new HashMap<>();
+ Map<String, Map<Path, Partition>> allowedAliasesToPartitions = new
HashMap<>();
for (TableScanOperator tso : tableScans) {
// use LinkedHashMap<String, Operator<? extends OperatorDesc>>
// getAliasToWork() should not apply this for non-native table
if (tso.getConf().getTableMetadata().getStorageHandler() != null) {
continue;
}
String alias = getAliasForTableScanOperator(work, tso);
- aliases.add(alias);
+ boolean isManagedTable =
!MetaStoreUtils.isExternalTable(tso.getConf().getTableMetadata().getTTable());
+ aliasTypeMap.put(alias, isManagedTable);
+ allowedAliasesToPartitions.put(alias, getPathToPartitionMap(alias, tso));
tso.getConf().setIsMetadataOnly(true);
}
- // group path alias according to work
- Map<Path, List<String>> candidates = new HashMap<>();
+
for (Path path : work.getPaths()) {
- List<String> aliasesAffected = work.getPathToAliases().get(path);
- if (CollectionUtils.isNotEmpty(aliasesAffected)) {
- candidates.put(path, aliasesAffected);
+ List<String> aliases = work.getPathToAliases().get(path);
+ for (String alias: aliases) {
+ Map<Path, Partition> pathToPartitionMap =
allowedAliasesToPartitions.get(alias);
+ if (pathToPartitionMap == null) {
+ continue;
+ }
+ addCandidatePath(candidatePathsToAliases, path, alias);
+ Partition partitionObject = pathToPartitionMap.get(path);
+ Map<String, String> partitionParameters = partitionObject != null ?
partitionObject.getParameters() : null;
+ boolean isManagedTable = aliasTypeMap.get(alias);
+ if (isManagedTable && partitionParameters != null &&
StatsSetupConst.areBasicStatsUptoDate(partitionParameters)) {
+ long rwCount =
Long.parseLong(partitionParameters.get(StatsSetupConst.ROW_COUNT));
+ if (rwCount == 0) {
+ managedEmptyPathMap.put(path, true);
+ } else {
+ managedEmptyPathMap.put(path, false);
+ }
+ }
+ }
+ }
+
+ for (Entry<Path, Collection<String>> entry :
candidatePathsToAliases.entrySet()) {
Review comment:
Thanks for sharing the details. HIVE-24380 is what you are referring to
I believe.
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]