deniskuzZ commented on code in PR #5934: URL: https://github.com/apache/hive/pull/5934#discussion_r2188343170
########## ql/src/java/org/apache/hadoop/hive/ql/exec/tez/SplitGrouper.java: ########## @@ -71,8 +70,8 @@ public class SplitGrouper { // TODO This needs to be looked at. Map of Map to Map... Made concurrent for now since split generation // can happen in parallel. - private static final Map<Map<Path, PartitionDesc>, Map<Path, PartitionDesc>> cache = - new ConcurrentHashMap<>(); + private final Map<Map<Path, PartitionDesc>, Map<Path, PartitionDesc>> cache = Review Comment: updated ```` From c62bbba350e9168d5bab4e727f1f5e18ff8fc26d Mon Sep 17 00:00:00 2001 From: Denys Kuzmenko <denisk...@gmail.com> Date: Sun, 6 Jul 2025 15:40:18 +0300 Subject: [PATCH 1/1] test --- .../hadoop/hive/ql/exec/tez/SplitGrouper.java | 5 +-- .../hive/ql/io/HiveFileFormatUtils.java | 39 +++++++++---------- 2 files changed, 19 insertions(+), 25 deletions(-) diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/SplitGrouper.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/SplitGrouper.java index 2ce7ed88c8..f867c251a8 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/SplitGrouper.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/SplitGrouper.java @@ -69,10 +69,7 @@ public class SplitGrouper { private static final Logger LOG = LoggerFactory.getLogger(SplitGrouper.class); - // TODO This needs to be looked at. Map of Map to Map... Made concurrent for now since split generation - // can happen in parallel. - private static final Map<Map<Path, PartitionDesc>, Map<Path, PartitionDesc>> cache = - new ConcurrentHashMap<>(); + private final Map<Path, Path> cache = new ConcurrentHashMap<>(); private final TezMapredSplitsGrouper tezGrouper = new TezMapredSplitsGrouper(); diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/HiveFileFormatUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/io/HiveFileFormatUtils.java index 20fde98c26..644ce9a3fd 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/HiveFileFormatUtils.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/HiveFileFormatUtils.java @@ -360,17 +360,17 @@ private static RecordUpdater getRecordUpdater(JobConf jc, } public static <T> T getFromPathRecursively(Map<Path, T> pathToPartitionInfo, Path dir, - Map<Map<Path, T>, Map<Path, T>> cacheMap) throws IOException { + Map<Path, Path> cacheMap) throws IOException { return getFromPathRecursively(pathToPartitionInfo, dir, cacheMap, false); } public static <T> T getFromPathRecursively(Map<Path, T> pathToPartitionInfo, Path dir, - Map<Map<Path, T>, Map<Path, T>> cacheMap, boolean ignoreSchema) throws IOException { + Map<Path, Path> cacheMap, boolean ignoreSchema) throws IOException { return getFromPathRecursively(pathToPartitionInfo, dir, cacheMap, ignoreSchema, false); } public static <T> T getFromPathRecursively(Map<Path, T> pathToPartitionInfo, Path dir, - Map<Map<Path, T>, Map<Path, T>> cacheMap, boolean ignoreSchema, boolean ifPresent) + Map<Path, Path> cacheMap, boolean ignoreSchema, boolean ifPresent) throws IOException { T part = getFromPath(pathToPartitionInfo, dir); @@ -379,18 +379,7 @@ public static <T> T getFromPathRecursively(Map<Path, T> pathToPartitionInfo, Pat || (dir.toUri().getScheme() == null || dir.toUri().getScheme().trim().equals("")) || FileUtils.pathsContainNoScheme(pathToPartitionInfo.keySet()))) { - Map<Path, T> newPathToPartitionInfo = null; - if (cacheMap != null) { - newPathToPartitionInfo = cacheMap.get(pathToPartitionInfo); - } - - if (newPathToPartitionInfo == null) { // still null - newPathToPartitionInfo = populateNewT(pathToPartitionInfo); - - if (cacheMap != null) { - cacheMap.put(pathToPartitionInfo, newPathToPartitionInfo); - } - } + Map<Path, T> newPathToPartitionInfo = populateNewT(pathToPartitionInfo, cacheMap); part = getFromPath(newPathToPartitionInfo, dir); } if (part != null || ifPresent) { @@ -401,13 +390,21 @@ public static <T> T getFromPathRecursively(Map<Path, T> pathToPartitionInfo, Pat } } - private static <T> Map<Path, T> populateNewT(Map<Path, T> pathToPartitionInfo) { - Map<Path, T> newPathToPartitionInfo = new HashMap<>(); - for (Map.Entry<Path, T> entry: pathToPartitionInfo.entrySet()) { - T partDesc = entry.getValue(); - Path pathOnly = Path.getPathWithoutSchemeAndAuthority(entry.getKey()); + private static <T> Map<Path, T> populateNewT(Map<Path, T> pathToPartitionInfo, + Map<Path, Path> cacheMap) { + Map<Path, T> newPathToPartitionInfo = new HashMap<>(pathToPartitionInfo.size()); + + pathToPartitionInfo.forEach((originalPath, partDesc) -> { + Path pathOnly = cacheMap != null ? + cacheMap.get(originalPath) : null; + if (pathOnly == null) { + pathOnly = Path.getPathWithoutSchemeAndAuthority(originalPath); + if (cacheMap != null) { + cacheMap.put(originalPath, pathOnly); + } + } newPathToPartitionInfo.put(pathOnly, partDesc); - } + }); return newPathToPartitionInfo; } -- 2.48.0 ```` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: gitbox-unsubscr...@hive.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: gitbox-unsubscr...@hive.apache.org For additional commands, e-mail: gitbox-h...@hive.apache.org