This is an automated email from the ASF dual-hosted git repository.
abstractdog pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push:
new 3551d8a87ab HIVE-27723: Prevent localizing the same original file more
than once if symlinks are present (#4753) (Laszlo Bodor reviewed by Ayush
Saxena)
3551d8a87ab is described below
commit 3551d8a87ab8c0e77f7dfe5abde7ff875214a59d
Author: Bodor Laszlo <[email protected]>
AuthorDate: Thu Oct 5 06:51:54 2023 +0200
HIVE-27723: Prevent localizing the same original file more than once if
symlinks are present (#4753) (Laszlo Bodor reviewed by Ayush Saxena)
---
.../org/apache/hadoop/hive/common/FileUtils.java | 51 ++++++++++++++++
.../apache/hadoop/hive/common/TestFileUtils.java | 59 ++++++++++++++++++
.../apache/hadoop/hive/ql/exec/tez/DagUtils.java | 9 +--
.../hadoop/hive/ql/exec/tez/TezSessionState.java | 7 ++-
.../hive/ql/exec/tez/TestTezSessionState.java | 70 ++++++++++++++++++++++
5 files changed, 190 insertions(+), 6 deletions(-)
diff --git a/common/src/java/org/apache/hadoop/hive/common/FileUtils.java
b/common/src/java/org/apache/hadoop/hive/common/FileUtils.java
index 31cd923a870..18efe167a63 100644
--- a/common/src/java/org/apache/hadoop/hive/common/FileUtils.java
+++ b/common/src/java/org/apache/hadoop/hive/common/FileUtils.java
@@ -27,6 +27,7 @@ import java.io.UncheckedIOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.nio.ByteBuffer;
+import java.nio.file.Paths;
import java.security.AccessControlException;
import java.security.PrivilegedExceptionAction;
import java.util.ArrayList;
@@ -1387,6 +1388,56 @@ public final class FileUtils {
status -> filter.accept(status.getPath()));
}
+ /**
+ * Resolves a symlink on a local filesystem. In case of any exceptions or
scheme other than "file"
+ * it simply returns the original path. Refer to DEBUG level logs for
further details.
+ * @param path input path to be resolved
+ * @param conf a Configuration instance to be used while e.g. resolving the
FileSystem if necessary
+ * @return the resolved target Path or the original if the input Path is not
a symlink
+ * @throws IOException
+ */
+ public static Path resolveSymlinks(Path path, Configuration conf) throws
IOException {
+ if (path == null) {
+ throw new IllegalArgumentException("Cannot resolve symlink for a null
Path");
+ }
+
+ URI uri = path.toUri();
+ String scheme = uri.getScheme();
+
+ /*
+ * If you're about to extend this method to e.g. HDFS, simply remove this
check.
+ * There is a known exception reproduced by whroot_external1.q, which can
be referred to,
+ * which is because java.nio is not prepared by default for other schemes
like "hdfs".
+ */
+ if (scheme != null && !"file".equalsIgnoreCase(scheme)) {
+ LOG.debug("scheme '{}' is not supported for resolving symlinks", scheme);
+ return path;
+ }
+
+ // we're expecting 'file' scheme, so if scheme == null, we need to add it
to path before resolving,
+ // otherwise Paths.get will fail with java.lang.IllegalArgumentException:
Missing scheme
+ if (scheme == null) {
+ try {
+ uri = new URI("file", uri.getAuthority(), uri.toString(), null, null);
+ } catch (URISyntaxException e) {
+ // e.g. in case of relative URI, we cannot create a new URI
+ LOG.debug("URISyntaxException while creating uri from path without
scheme {}", path, e);
+ return path;
+ }
+ }
+
+ try {
+ java.nio.file.Path srcPath = Paths.get(uri);
+ URI targetUri = srcPath.toRealPath().toUri();
+ // stick to the original scheme
+ return new Path(scheme, targetUri.getAuthority(),
+ Path.getPathWithoutSchemeAndAuthority(new
Path(targetUri)).toString());
+ } catch (Exception e) {
+ LOG.debug("Exception while calling toRealPath of {}", path, e);
+ return path;
+ }
+ }
+
public static class AdaptingIterator<T> implements Iterator<T> {
private final RemoteIterator<T> iterator;
diff --git a/common/src/test/org/apache/hadoop/hive/common/TestFileUtils.java
b/common/src/test/org/apache/hadoop/hive/common/TestFileUtils.java
index c14af90f935..647e67ceb5d 100644
--- a/common/src/test/org/apache/hadoop/hive/common/TestFileUtils.java
+++ b/common/src/test/org/apache/hadoop/hive/common/TestFileUtils.java
@@ -338,4 +338,63 @@ public class TestFileUtils {
}
return count;
}
+
+ @Test
+ public void testResolveSymlinks() throws IOException {
+ HiveConf conf = new HiveConf();
+
+ java.nio.file.Path original = java.nio.file.Files.createTempFile("", "");
+ java.nio.file.Path symlinkPath =
java.nio.file.Paths.get(original.toString() + ".symlink");
+ java.nio.file.Path symlinkOfSymlinkPath =
java.nio.file.Paths.get(original.toString() + ".symlink.symlink");
+
+ // symlink -> original
+ java.nio.file.Files.createSymbolicLink(symlinkPath, original);
+ // symlink -> symlink -> original
+ java.nio.file.Files.createSymbolicLink(symlinkOfSymlinkPath, symlinkPath);
+
+ Assert.assertTrue(java.nio.file.Files.isSymbolicLink(symlinkPath));
+
Assert.assertTrue(java.nio.file.Files.isSymbolicLink(symlinkOfSymlinkPath));
+
+ // average usage 1: symlink points to the original
+ Path originalPathResolved = FileUtils.resolveSymlinks(new
Path(symlinkPath.toUri()), conf);
+ Assert.assertEquals(original.toUri(), originalPathResolved.toUri());
+
+ // average usage 2: symlink2 -> symlink -> original points to the original
+ Path originalPathResolved2 = FileUtils.resolveSymlinks(new
Path(symlinkOfSymlinkPath.toUri()), conf);
+ Assert.assertEquals(original.toUri(), originalPathResolved2.toUri());
+
+ // providing a symlink path without scheme: still resolving it as it was
'file' scheme
+ // resolve to the original path then returning without scheme
+ Path originalPathWithoutScheme = Path.getPathWithoutSchemeAndAuthority(new
Path(original.toUri()));
+ Path symlinkPathWithoutScheme = Path.getPathWithoutSchemeAndAuthority(new
Path(symlinkPath.toUri()));
+ Assert.assertNull(originalPathWithoutScheme.toUri().getScheme());
+ Assert.assertNull(symlinkPathWithoutScheme.toUri().getScheme());
+
+ Path originalPathResolvedWithoutInputScheme =
FileUtils.resolveSymlinks(symlinkPathWithoutScheme, conf);
+ // return path also hasn't got a scheme
+ Assert.assertNull("Path without scheme should be resolved to another Path
without scheme",
+ originalPathResolvedWithoutInputScheme.toUri().getScheme());
+ Assert.assertEquals(originalPathResolvedWithoutInputScheme,
originalPathWithoutScheme);
+
+ // a non-symlink is resolved to itself
+ Path originalPathResolvedFromOriginal = FileUtils.resolveSymlinks(new
Path(original.toUri()), conf);
+ Assert.assertEquals(original.toUri(),
originalPathResolvedFromOriginal.toUri());
+
+ // 1. a nonexistent path is resolved to itself, resolveSymlinks doesn't
care if the path doesn't exist
+ // 2. a relative path without a scheme cannot be used to construct an URI,
hence we get the input Path back
+ Path nonexistentPath = new Path("./nonexistent-" +
System.currentTimeMillis());
+ Assert.assertEquals(nonexistentPath.toUri(),
FileUtils.resolveSymlinks(nonexistentPath, conf).toUri());
+
+ try {
+ FileUtils.resolveSymlinks(null, conf);
+ Assert.fail("IllegalArgumentException should be thrown in case of null
input");
+ } catch (IllegalArgumentException e) {
+ Assert.assertEquals("Cannot resolve symlink for a null Path",
e.getMessage());
+ }
+
+ // hdfs is not supported, return safely with the original path
+ Path hdfsPath = new Path("hdfs://localhost:0/user/hive/warehouse/src");
+ Path resolvedHdfsPath = FileUtils.resolveSymlinks(hdfsPath, conf);
+ Assert.assertEquals(hdfsPath.toUri(), resolvedHdfsPath.toUri());
+ }
}
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/DagUtils.java
b/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/DagUtils.java
index 4096bb14777..b9be333761e 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/DagUtils.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/DagUtils.java
@@ -1217,13 +1217,14 @@ public class DagUtils {
if (!StringUtils.isNotBlank(file)) {
continue;
}
- if (skipFileSet != null && skipFileSet.contains(new Path(file))) {
+ Path path = new Path(file);
+ if (skipFileSet != null && skipFileSet.contains(path)) {
LOG.info("Skipping vertex resource " + file + " that already exists in
the session");
continue;
}
- Path hdfsFilePath = new Path(hdfsDirPathStr, getResourceBaseName(new
Path(file)));
- LocalResource localResource = localizeResource(new Path(file),
- hdfsFilePath, type, conf);
+ path = FileUtils.resolveSymlinks(path, conf);
+ Path hdfsFilePath = new Path(hdfsDirPathStr, getResourceBaseName(path));
+ LocalResource localResource = localizeResource(path, hdfsFilePath, type,
conf);
tmpResourcesMap.put(file, localResource);
}
return tmpResourcesMap;
diff --git
a/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/TezSessionState.java
b/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/TezSessionState.java
index ad79fce936b..015e826d8e0 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/TezSessionState.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/TezSessionState.java
@@ -47,6 +47,7 @@ import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission;
+import org.apache.hadoop.hive.common.FileUtils;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.conf.HiveConfUtil;
import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
@@ -92,6 +93,7 @@ import
org.apache.hadoop.hive.ql.exec.tez.monitoring.TezJobMonitor;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.databind.annotation.JsonSerialize;
+import com.google.common.annotations.VisibleForTesting;
import com.google.common.cache.Cache;
import com.google.common.cache.CacheBuilder;
@@ -815,7 +817,8 @@ public class TezSessionState {
* @throws LoginException when we are unable to determine the user.
* @throws URISyntaxException when current jar location cannot be determined.
*/
- private LocalResource createJarLocalResource(String localJarPath)
+ @VisibleForTesting
+ LocalResource createJarLocalResource(String localJarPath)
throws IOException, LoginException, IllegalArgumentException {
// TODO Reduce the number of lookups that happen here. This shouldn't go
to HDFS for each call.
// The hiveJarDir can be determined once per client.
@@ -823,7 +826,7 @@ public class TezSessionState {
assert destDirStatus != null;
Path destDirPath = destDirStatus.getPath();
- Path localFile = new Path(localJarPath);
+ Path localFile = FileUtils.resolveSymlinks(new Path(localJarPath), conf);
String sha = getSha(localFile);
String destFileName = localFile.getName();
diff --git
a/ql/src/test/org/apache/hadoop/hive/ql/exec/tez/TestTezSessionState.java
b/ql/src/test/org/apache/hadoop/hive/ql/exec/tez/TestTezSessionState.java
new file mode 100644
index 00000000000..521134bdfa5
--- /dev/null
+++ b/ql/src/test/org/apache/hadoop/hive/ql/exec/tez/TestTezSessionState.java
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to you under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.exec.tez;
+
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.nio.file.StandardOpenOption;
+
+import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hadoop.yarn.api.records.LocalResource;
+import org.junit.Assert;
+import org.junit.Test;
+
+public class TestTezSessionState {
+
+ private class TestTezSessionPoolManager extends TezSessionPoolManager {
+ public TestTezSessionPoolManager() {
+ super();
+ }
+
+ @Override
+ public void setupPool(HiveConf conf) throws Exception {
+ super.setupPool(conf);
+ }
+
+ @Override
+ public TezSessionPoolSession createSession(String sessionId, HiveConf
conf) {
+ return new SampleTezSessionState(sessionId, this, conf);
+ }
+ }
+
+ @Test
+ public void testSymlinkedLocalFilesAreLocalizedOnce() throws Exception {
+ Path jarPath = Files.createTempFile("jar", "");
+ Path symlinkPath = Paths.get(jarPath.toString() + ".symlink");
+ Files.createSymbolicLink(symlinkPath, jarPath);
+
+ // write some data into the fake jar, it's not a 0 length file in real life
+ Files.write(jarPath, "testSymlinkedLocalFilesToBeLocalized".getBytes(),
StandardOpenOption.APPEND);
+
+ Assert.assertTrue(Files.isSymbolicLink(symlinkPath));
+
+ HiveConf hiveConf = new HiveConf();
+ hiveConf.set(HiveConf.ConfVars.HIVE_JAR_DIRECTORY.varname, "/tmp");
+ TezSessionPoolManager poolManager = new TestTezSessionPoolManager();
+
+ TezSessionState sessionState = poolManager.getSession(null, hiveConf,
true, false);
+
+ LocalResource l1 =
sessionState.createJarLocalResource(jarPath.toUri().toString());
+ LocalResource l2 =
sessionState.createJarLocalResource(symlinkPath.toUri().toString());
+
+ // local resources point to the same original resource
+ Assert.assertEquals(l1.getResource().toPath(), l2.getResource().toPath());
+ }
+}
\ No newline at end of file