This is an automated email from the ASF dual-hosted git repository.

abstractdog pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git


The following commit(s) were added to refs/heads/master by this push:
     new 3551d8a87ab HIVE-27723: Prevent localizing the same original file more 
than once if symlinks are present (#4753) (Laszlo Bodor reviewed by Ayush 
Saxena)
3551d8a87ab is described below

commit 3551d8a87ab8c0e77f7dfe5abde7ff875214a59d
Author: Bodor Laszlo <[email protected]>
AuthorDate: Thu Oct 5 06:51:54 2023 +0200

    HIVE-27723: Prevent localizing the same original file more than once if 
symlinks are present (#4753) (Laszlo Bodor reviewed by Ayush Saxena)
---
 .../org/apache/hadoop/hive/common/FileUtils.java   | 51 ++++++++++++++++
 .../apache/hadoop/hive/common/TestFileUtils.java   | 59 ++++++++++++++++++
 .../apache/hadoop/hive/ql/exec/tez/DagUtils.java   |  9 +--
 .../hadoop/hive/ql/exec/tez/TezSessionState.java   |  7 ++-
 .../hive/ql/exec/tez/TestTezSessionState.java      | 70 ++++++++++++++++++++++
 5 files changed, 190 insertions(+), 6 deletions(-)

diff --git a/common/src/java/org/apache/hadoop/hive/common/FileUtils.java 
b/common/src/java/org/apache/hadoop/hive/common/FileUtils.java
index 31cd923a870..18efe167a63 100644
--- a/common/src/java/org/apache/hadoop/hive/common/FileUtils.java
+++ b/common/src/java/org/apache/hadoop/hive/common/FileUtils.java
@@ -27,6 +27,7 @@ import java.io.UncheckedIOException;
 import java.net.URI;
 import java.net.URISyntaxException;
 import java.nio.ByteBuffer;
+import java.nio.file.Paths;
 import java.security.AccessControlException;
 import java.security.PrivilegedExceptionAction;
 import java.util.ArrayList;
@@ -1387,6 +1388,56 @@ public final class FileUtils {
         status -> filter.accept(status.getPath()));
   }
 
+  /**
+   * Resolves a symlink on a local filesystem. In case of any exceptions or 
scheme other than "file"
+   * it simply returns the original path. Refer to DEBUG level logs for 
further details.
+   * @param path input path to be resolved
+   * @param conf a Configuration instance to be used while e.g. resolving the 
FileSystem if necessary
+   * @return the resolved target Path or the original if the input Path is not 
a symlink
+   * @throws IOException
+   */
+  public static Path resolveSymlinks(Path path, Configuration conf) throws 
IOException {
+    if (path == null) {
+      throw new IllegalArgumentException("Cannot resolve symlink for a null 
Path");
+    }
+
+    URI uri = path.toUri();
+    String scheme = uri.getScheme();
+
+    /*
+     * If you're about to extend this method to e.g. HDFS, simply remove this 
check.
+     * There is a known exception reproduced by whroot_external1.q, which can 
be referred to,
+     * which is because java.nio is not prepared by default for other schemes 
like "hdfs".
+     */
+    if (scheme != null && !"file".equalsIgnoreCase(scheme)) {
+      LOG.debug("scheme '{}' is not supported for resolving symlinks", scheme);
+      return path;
+    }
+
+    // we're expecting 'file' scheme, so if scheme == null, we need to add it 
to path before resolving,
+    // otherwise Paths.get will fail with java.lang.IllegalArgumentException: 
Missing scheme
+    if (scheme == null) {
+      try {
+        uri =  new URI("file", uri.getAuthority(), uri.toString(), null, null);
+      } catch (URISyntaxException e) {
+        // e.g. in case of relative URI, we cannot create a new URI
+        LOG.debug("URISyntaxException while creating uri from path without 
scheme {}", path, e);
+        return path;
+      }
+    }
+
+    try {
+      java.nio.file.Path srcPath = Paths.get(uri);
+      URI targetUri = srcPath.toRealPath().toUri();
+      // stick to the original scheme
+      return new Path(scheme, targetUri.getAuthority(),
+          Path.getPathWithoutSchemeAndAuthority(new 
Path(targetUri)).toString());
+    } catch (Exception e) {
+      LOG.debug("Exception while calling toRealPath of {}", path, e);
+      return path;
+    }
+  }
+
   public static class AdaptingIterator<T> implements Iterator<T> {
 
     private final RemoteIterator<T> iterator;
diff --git a/common/src/test/org/apache/hadoop/hive/common/TestFileUtils.java 
b/common/src/test/org/apache/hadoop/hive/common/TestFileUtils.java
index c14af90f935..647e67ceb5d 100644
--- a/common/src/test/org/apache/hadoop/hive/common/TestFileUtils.java
+++ b/common/src/test/org/apache/hadoop/hive/common/TestFileUtils.java
@@ -338,4 +338,63 @@ public class TestFileUtils {
     }
     return count;
   }
+
+  @Test
+  public void testResolveSymlinks() throws IOException {
+    HiveConf conf = new HiveConf();
+
+    java.nio.file.Path original = java.nio.file.Files.createTempFile("", "");
+    java.nio.file.Path symlinkPath = 
java.nio.file.Paths.get(original.toString() + ".symlink");
+    java.nio.file.Path symlinkOfSymlinkPath = 
java.nio.file.Paths.get(original.toString() + ".symlink.symlink");
+
+    // symlink -> original
+    java.nio.file.Files.createSymbolicLink(symlinkPath, original);
+    // symlink -> symlink -> original
+    java.nio.file.Files.createSymbolicLink(symlinkOfSymlinkPath, symlinkPath);
+
+    Assert.assertTrue(java.nio.file.Files.isSymbolicLink(symlinkPath));
+    
Assert.assertTrue(java.nio.file.Files.isSymbolicLink(symlinkOfSymlinkPath));
+
+    // average usage 1: symlink points to the original
+    Path originalPathResolved = FileUtils.resolveSymlinks(new 
Path(symlinkPath.toUri()), conf);
+    Assert.assertEquals(original.toUri(), originalPathResolved.toUri());
+
+    // average usage 2: symlink2 -> symlink -> original points to the original
+    Path originalPathResolved2 = FileUtils.resolveSymlinks(new 
Path(symlinkOfSymlinkPath.toUri()), conf);
+    Assert.assertEquals(original.toUri(), originalPathResolved2.toUri());
+
+    // providing a symlink path without scheme: still resolving it as it was 
'file' scheme
+    // resolve to the original path then returning without scheme
+    Path originalPathWithoutScheme = Path.getPathWithoutSchemeAndAuthority(new 
Path(original.toUri()));
+    Path symlinkPathWithoutScheme = Path.getPathWithoutSchemeAndAuthority(new 
Path(symlinkPath.toUri()));
+    Assert.assertNull(originalPathWithoutScheme.toUri().getScheme());
+    Assert.assertNull(symlinkPathWithoutScheme.toUri().getScheme());
+
+    Path originalPathResolvedWithoutInputScheme = 
FileUtils.resolveSymlinks(symlinkPathWithoutScheme, conf);
+    // return path also hasn't got a scheme
+    Assert.assertNull("Path without scheme should be resolved to another Path 
without scheme",
+        originalPathResolvedWithoutInputScheme.toUri().getScheme());
+    Assert.assertEquals(originalPathResolvedWithoutInputScheme, 
originalPathWithoutScheme);
+
+    // a non-symlink is resolved to itself
+    Path originalPathResolvedFromOriginal = FileUtils.resolveSymlinks(new 
Path(original.toUri()), conf);
+    Assert.assertEquals(original.toUri(), 
originalPathResolvedFromOriginal.toUri());
+
+    // 1. a nonexistent path is resolved to itself, resolveSymlinks doesn't 
care if the path doesn't exist
+    // 2. a relative path without a scheme cannot be used to construct an URI, 
hence we get the input Path back
+    Path nonexistentPath = new Path("./nonexistent-" + 
System.currentTimeMillis());
+    Assert.assertEquals(nonexistentPath.toUri(), 
FileUtils.resolveSymlinks(nonexistentPath, conf).toUri());
+
+    try {
+      FileUtils.resolveSymlinks(null, conf);
+      Assert.fail("IllegalArgumentException should be thrown in case of null 
input");
+    } catch (IllegalArgumentException e) {
+      Assert.assertEquals("Cannot resolve symlink for a null Path", 
e.getMessage());
+    }
+
+    // hdfs is not supported, return safely with the original path
+    Path hdfsPath = new Path("hdfs://localhost:0/user/hive/warehouse/src");
+    Path resolvedHdfsPath = FileUtils.resolveSymlinks(hdfsPath, conf);
+    Assert.assertEquals(hdfsPath.toUri(), resolvedHdfsPath.toUri());
+  }
 }
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/DagUtils.java 
b/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/DagUtils.java
index 4096bb14777..b9be333761e 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/DagUtils.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/DagUtils.java
@@ -1217,13 +1217,14 @@ public class DagUtils {
       if (!StringUtils.isNotBlank(file)) {
         continue;
       }
-      if (skipFileSet != null && skipFileSet.contains(new Path(file))) {
+      Path path = new Path(file);
+      if (skipFileSet != null && skipFileSet.contains(path)) {
         LOG.info("Skipping vertex resource " + file + " that already exists in 
the session");
         continue;
       }
-      Path hdfsFilePath = new Path(hdfsDirPathStr, getResourceBaseName(new 
Path(file)));
-      LocalResource localResource = localizeResource(new Path(file),
-          hdfsFilePath, type, conf);
+      path = FileUtils.resolveSymlinks(path, conf);
+      Path hdfsFilePath = new Path(hdfsDirPathStr, getResourceBaseName(path));
+      LocalResource localResource = localizeResource(path, hdfsFilePath, type, 
conf);
       tmpResourcesMap.put(file, localResource);
     }
     return tmpResourcesMap;
diff --git 
a/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/TezSessionState.java 
b/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/TezSessionState.java
index ad79fce936b..015e826d8e0 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/TezSessionState.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/TezSessionState.java
@@ -47,6 +47,7 @@ import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.fs.permission.FsPermission;
+import org.apache.hadoop.hive.common.FileUtils;
 import org.apache.hadoop.hive.conf.HiveConf;
 import org.apache.hadoop.hive.conf.HiveConfUtil;
 import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
@@ -92,6 +93,7 @@ import 
org.apache.hadoop.hive.ql.exec.tez.monitoring.TezJobMonitor;
 
 import com.fasterxml.jackson.annotation.JsonProperty;
 import com.fasterxml.jackson.databind.annotation.JsonSerialize;
+import com.google.common.annotations.VisibleForTesting;
 import com.google.common.cache.Cache;
 import com.google.common.cache.CacheBuilder;
 
@@ -815,7 +817,8 @@ public class TezSessionState {
    * @throws LoginException when we are unable to determine the user.
    * @throws URISyntaxException when current jar location cannot be determined.
    */
-  private LocalResource createJarLocalResource(String localJarPath)
+  @VisibleForTesting
+  LocalResource createJarLocalResource(String localJarPath)
       throws IOException, LoginException, IllegalArgumentException {
     // TODO Reduce the number of lookups that happen here. This shouldn't go 
to HDFS for each call.
     // The hiveJarDir can be determined once per client.
@@ -823,7 +826,7 @@ public class TezSessionState {
     assert destDirStatus != null;
     Path destDirPath = destDirStatus.getPath();
 
-    Path localFile = new Path(localJarPath);
+    Path localFile = FileUtils.resolveSymlinks(new Path(localJarPath), conf);
     String sha = getSha(localFile);
 
     String destFileName = localFile.getName();
diff --git 
a/ql/src/test/org/apache/hadoop/hive/ql/exec/tez/TestTezSessionState.java 
b/ql/src/test/org/apache/hadoop/hive/ql/exec/tez/TestTezSessionState.java
new file mode 100644
index 00000000000..521134bdfa5
--- /dev/null
+++ b/ql/src/test/org/apache/hadoop/hive/ql/exec/tez/TestTezSessionState.java
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to you under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.exec.tez;
+
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.nio.file.StandardOpenOption;
+
+import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hadoop.yarn.api.records.LocalResource;
+import org.junit.Assert;
+import org.junit.Test;
+
+public class TestTezSessionState {
+
+  private class TestTezSessionPoolManager extends TezSessionPoolManager {
+    public TestTezSessionPoolManager() {
+      super();
+    }
+
+    @Override
+    public void setupPool(HiveConf conf) throws Exception {
+      super.setupPool(conf);
+    }
+
+    @Override
+    public TezSessionPoolSession createSession(String sessionId, HiveConf 
conf) {
+      return new SampleTezSessionState(sessionId, this, conf);
+    }
+  }
+
+  @Test
+  public void testSymlinkedLocalFilesAreLocalizedOnce() throws Exception {
+    Path jarPath = Files.createTempFile("jar", "");
+    Path symlinkPath = Paths.get(jarPath.toString() + ".symlink");
+    Files.createSymbolicLink(symlinkPath, jarPath);
+
+    // write some data into the fake jar, it's not a 0 length file in real life
+    Files.write(jarPath, "testSymlinkedLocalFilesToBeLocalized".getBytes(), 
StandardOpenOption.APPEND);
+
+    Assert.assertTrue(Files.isSymbolicLink(symlinkPath));
+
+    HiveConf hiveConf = new HiveConf();
+    hiveConf.set(HiveConf.ConfVars.HIVE_JAR_DIRECTORY.varname, "/tmp");
+    TezSessionPoolManager poolManager = new TestTezSessionPoolManager();
+
+    TezSessionState sessionState = poolManager.getSession(null, hiveConf, 
true, false);
+
+    LocalResource l1 = 
sessionState.createJarLocalResource(jarPath.toUri().toString());
+    LocalResource l2 = 
sessionState.createJarLocalResource(symlinkPath.toUri().toString());
+
+    // local resources point to the same original resource
+    Assert.assertEquals(l1.getResource().toPath(), l2.getResource().toPath());
+  }
+}
\ No newline at end of file

Reply via email to