JingsongLi commented on code in PR #7716:
URL: https://github.com/apache/paimon/pull/7716#discussion_r3151165851
##########
paimon-python/pypaimon/filesystem/pyarrow_file_io.py:
##########
@@ -256,13 +256,61 @@ def _initialize_hdfs_fs(self, scheme: str, netloc: Optional[str]) -> FileSystem:
)
os.environ['CLASSPATH'] = class_paths.stdout.strip()
+ principal = (self.properties.get(SecurityOptions.KERBEROS_PRINCIPAL)
+ or self._get_property("security.principal"))
+ keytab = (self.properties.get(SecurityOptions.KERBEROS_KEYTAB)
+ or self._get_property("security.keytab"))
+ use_ticket_cache = self.properties.get(SecurityOptions.KERBEROS_USE_TICKET_CACHE)
+
+ if bool(principal) != bool(keytab):
+ raise ValueError(
+ "security.kerberos.login.principal and security.kerberos.login.keytab "
+ "must be both set or both unset")
+
host, port_str = splitport(netloc)
- return pafs.HadoopFileSystem(
- host=host,
- port=int(port_str),
- user=os.environ.get('HADOOP_USER_NAME', 'hadoop')
+ port = int(port_str) if port_str else 0
+
+ kerb_ticket = None
+ if principal and keytab:
+ self._kerberos_login_from_keytab(principal, keytab)
+ kerb_ticket = self._get_ticket_cache_path()
+ elif use_ticket_cache:
+ cache_path = self._get_ticket_cache_path()
+ if cache_path and os.path.exists(cache_path):
+ kerb_ticket = cache_path
+
+ if kerb_ticket:
+ return pafs.HadoopFileSystem(host=host, port=port, kerb_ticket=kerb_ticket)
+ else:
+ return pafs.HadoopFileSystem(
+ host=host,
+ port=port,
+ user=os.environ.get('HADOOP_USER_NAME', 'hadoop')
+ )
+
+ @staticmethod
+ def _kerberos_login_from_keytab(principal: str, keytab: str):
+ if not os.path.isfile(keytab):
+ raise FileNotFoundError(f"Kerberos keytab file not found: {keytab}")
+ if not os.access(keytab, os.R_OK):
+ raise PermissionError(f"Kerberos keytab file is not readable: {keytab}")
+ subprocess.run(
+ ['kinit', '-kt', keytab, principal],
+ check=True, capture_output=True, text=True
)
+ @staticmethod
+ def _get_ticket_cache_path() -> Optional[str]:
+ cc = os.environ.get('KRB5CCNAME')
+ if cc:
+ if cc.startswith('FILE:'):
+ return cc[5:]
+ return cc
+ default_path = f'/tmp/krb5cc_{os.getuid()}'
+ if os.path.exists(default_path):
+ return default_path
+ return None
Review Comment:
After calling `kinit`, the code calls `_get_ticket_cache_path()` to find the
resulting ticket cache. But if `KRB5CCNAME` is not set and the default
`/tmp/krb5cc_{os.getuid()}` doesn't exist yet (e.g., first-time kinit on a
fresh container), `_get_ticket_cache_path()` returns None, `kerb_ticket` stays
None, and the code silently falls through to the `user=` branch, i.e. SIMPLE
auth. This is a real bug — kinit succeeded but the connection won't use
Kerberos.
Suggestion: after a successful kinit, either trust that the default cache
path exists (skip the os.path.exists check) or raise an error if no ticket
cache is found.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]