Copilot commented on code in PR #6549:
URL: https://github.com/apache/hive/pull/6549#discussion_r3444844910
##########
serde/src/java/org/apache/hadoop/hive/serde2/avro/AvroSerdeUtils.java:
##########
@@ -137,12 +141,24 @@ public static Schema
determineSchemaOrThrowException(Configuration conf, Propert
throw new AvroSerdeException(EXCEPTION_MESSAGE);
}
+ validateSchemaUrl(schemaString, conf);
+
try {
- Schema s = getSchemaFromFS(schemaString, conf);
- if (s == null) {
- //in case schema is not a file system
- return AvroSerdeUtils.getSchemaFor(new URL(schemaString));
+ Schema s;
+ if (requiresHttpFetch(schemaString)) {
+ URL url = new URL(schemaString);
+ try (InputStream in = url.openStream()) {
+ s = getSchemaParser().parse(in);
+ }
Review Comment:
When remote HTTP schema fetch is enabled, using URL.openStream() can follow
HTTP redirects, which can bypass the host allowlist (e.g. allowlisted host
redirects to a non-allowlisted host). Disable redirects (or re-validate after
redirects) when fetching the schema.
##########
ql/src/java/org/apache/hadoop/hive/ql/security/authorization/plugin/metastore/events/CreateTableEvent.java:
##########
@@ -65,18 +68,35 @@ private List<HivePrivilegeObject> getInputHObjs() {
Database database = event.getDatabase();
String uri = getSdLocation(table.getSd());
- if (StringUtils.isEmpty(uri)) {
- return ret;
+ if (StringUtils.isNotEmpty(uri)) {
+ // Skip DFS_URI only if table location is under default db path
+ if (this.needDFSUriAuth(uri, this.getDefaultTablePath(database, table)))
{
+ ret.add(new HivePrivilegeObject(HivePrivilegeObjectType.DFS_URI, uri));
+ }
}
- // Skip DFS_URI only if table location is under default db path
- if (this.needDFSUriAuth(uri, this.getDefaultTablePath(database, table))) {
- ret.add(new HivePrivilegeObject(HivePrivilegeObjectType.DFS_URI, uri));
- }
+ addAvroSchemaUrlInputAuth(ret, table, database);
return ret;
}
+ private void addAvroSchemaUrlInputAuth(List<HivePrivilegeObject> ret, Table
table, Database database) {
+ if
(!AvroSerDe.class.getName().equals(table.getSd().getSerdeInfo().getSerializationLib()))
{
+ return;
+ }
+ String schemaUrl =
table.getParameters().get(AvroTableProperties.SCHEMA_URL.getPropName());
+ if (StringUtils.isEmpty(schemaUrl) ||
AvroSerdeUtils.SCHEMA_NONE.equals(schemaUrl)) {
+ return;
+ }
+ if (!AvroSerdeUtils.isFilesystemSchemaUrl(schemaUrl)) {
+ return;
+ }
+ if (!needDFSUriAuth(schemaUrl, getDefaultTablePath(database, table))) {
+ return;
+ }
+ ret.add(getHivePrivilegeObjectDfsUri(schemaUrl));
+ }
Review Comment:
CreateTableEvent adds DFS_URI authorization for avro.schema.url even when
avro.schema.literal is present (in which case the schema is resolved from the
literal and the URL is ignored). Also, the default-table-path skip logic is
specific to table locations and can incorrectly suppress schema-url
authorization in the rare case the schema URL equals the default table path.
##########
serde/src/java/org/apache/hadoop/hive/serde2/avro/AvroSerdeUtils.java:
##########
@@ -151,6 +167,118 @@ public static Schema
determineSchemaOrThrowException(Configuration conf, Propert
}
}
+ /**
+ * Validate that the avro.schema.url uses a permitted scheme and host.
+ */
+ static void validateSchemaUrl(String schemaUrl, Configuration conf) throws
AvroSerdeException {
+ final URI uri;
+ try {
+ uri = new URI(schemaUrl);
+ } catch (URISyntaxException e) {
+ throw new AvroSerdeException("Invalid avro.schema.url: " + schemaUrl, e);
+ }
+
+ final String scheme = uri.getScheme();
+ if (scheme == null || scheme.isEmpty()) {
+ return;
+ }
+
+ final String schemeLower = scheme.toLowerCase(Locale.ROOT);
+ if (HTTP_SCHEMES.contains(schemeLower)) {
Review Comment:
HiveConf docs for hive.avro.schema.url.allowed.schemes say file:// is never
allowed, but validateSchemaUrl currently only rejects schemes not in the
allowlist. If an operator accidentally adds "file" to the allowlist, file://
URLs would become permitted, which is a security risk.
##########
serde/src/java/org/apache/hadoop/hive/serde2/avro/AvroSerdeUtils.java:
##########
@@ -151,6 +167,118 @@ public static Schema
determineSchemaOrThrowException(Configuration conf, Propert
}
}
+ /**
+ * Validate that the avro.schema.url uses a permitted scheme and host.
+ */
+ static void validateSchemaUrl(String schemaUrl, Configuration conf) throws
AvroSerdeException {
+ final URI uri;
+ try {
+ uri = new URI(schemaUrl);
+ } catch (URISyntaxException e) {
+ throw new AvroSerdeException("Invalid avro.schema.url: " + schemaUrl, e);
+ }
+
+ final String scheme = uri.getScheme();
+ if (scheme == null || scheme.isEmpty()) {
+ return;
+ }
+
+ final String schemeLower = scheme.toLowerCase(Locale.ROOT);
+ if (HTTP_SCHEMES.contains(schemeLower)) {
+ if (!HiveConf.getBoolVar(conf,
HiveConf.ConfVars.HIVE_AVRO_SCHEMA_URL_REMOTE_HTTP_ENABLED)) {
+ throw new AvroSerdeException("avro.schema.url scheme '" + scheme
+ + "' is not permitted. Remote HTTP schema fetch is disabled. Set "
+ +
HiveConf.ConfVars.HIVE_AVRO_SCHEMA_URL_REMOTE_HTTP_ENABLED.varname
+ + " to true and configure "
+ + HiveConf.ConfVars.HIVE_AVRO_SCHEMA_URL_HTTP_ALLOWED_HOSTS.varname
+ + ", or use avro.schema.literal instead.");
+ }
+ final String host = uri.getHost();
+ if (host == null || host.isEmpty()) {
+ throw new AvroSerdeException("avro.schema.url must specify a host for
HTTP/HTTPS schemas.");
+ }
+ if (!isHttpHostAllowed(host, conf)) {
+ throw new AvroSerdeException("avro.schema.url host '" + host
+ + "' is not in the permitted host list configured by "
+ +
HiveConf.ConfVars.HIVE_AVRO_SCHEMA_URL_HTTP_ALLOWED_HOSTS.varname + ".");
+ }
+ return;
+ }
+
+ if (!getAllowedFilesystemSchemes(conf).contains(schemeLower)) {
+ throw new AvroSerdeException("avro.schema.url scheme '" + scheme
+ + "' is not permitted. Use a Hadoop FileSystem URI (e.g. hdfs://,
s3a://) or "
+ + "avro.schema.literal instead.");
+ }
+ }
+
+ /**
+ * Returns true if the schema URL refers to a Hadoop filesystem location
(including scheme-less URIs).
+ */
+ public static boolean isFilesystemSchemaUrl(String schemaUrl) {
+ if (schemaUrl == null || schemaUrl.isEmpty() ||
SCHEMA_NONE.equals(schemaUrl)) {
+ return false;
+ }
+ try {
+ URI uri = new URI(schemaUrl);
+ String scheme = uri.getScheme();
+ if (scheme == null || scheme.isEmpty()) {
+ return true;
+ }
+ String schemeLower = scheme.toLowerCase(Locale.ROOT);
+ return !HTTP_SCHEMES.contains(schemeLower)
+ && getAllowedFilesystemSchemes(null).contains(schemeLower);
Review Comment:
isFilesystemSchemaUrl() uses the *default* allowed-schemes list
(getAllowedFilesystemSchemes(null)), ignoring any runtime configuration
overrides. This can cause filesystem schema URLs with non-default schemes to be
treated as non-filesystem and skip DFS_URI authorization, even though
AvroSerdeUtils.validateSchemaUrl() would allow them at runtime.
##########
ql/src/java/org/apache/hadoop/hive/ql/security/authorization/plugin/metastore/events/AlterTableEvent.java:
##########
@@ -122,6 +127,27 @@ private List<HivePrivilegeObject> getOutputHObjs() {
return ret;
}
+ private void addAvroSchemaUrlInputAuth(List<HivePrivilegeObject> ret,
PreAlterTableEvent event) {
+ Table newTable = event.getNewTable();
+ Table oldTable = event.getOldTable();
+ if
(!AvroSerDe.class.getName().equals(newTable.getSd().getSerdeInfo().getSerializationLib()))
{
+ return;
+ }
+ String newSchemaUrl =
newTable.getParameters().get(AvroTableProperties.SCHEMA_URL.getPropName());
+ if (StringUtils.isEmpty(newSchemaUrl) ||
AvroSerdeUtils.SCHEMA_NONE.equals(newSchemaUrl)) {
+ return;
+ }
Review Comment:
AlterTableEvent adds DFS_URI authorization for avro.schema.url even when
avro.schema.literal is present (in which case AvroSerde resolves from the
literal and the URL is ignored). This can cause unnecessary (and potentially
failing) authorization checks.
##########
ql/src/test/org/apache/hadoop/hive/ql/security/authorization/plugin/metastore/TestHiveMetaStoreAuthorizer.java:
##########
@@ -46,6 +46,9 @@
import
org.apache.hadoop.hive.ql.security.authorization.plugin.HiveOperationType;
import
org.apache.hadoop.hive.ql.security.authorization.plugin.HivePrivilegeObject;
import
org.apache.hadoop.hive.ql.security.authorization.plugin.metastore.filtercontext.TableFilterContext;
+import org.apache.hadoop.hive.serde2.avro.AvroSerdeUtils;
+import org.apache.hadoop.hive.serde2.avro.AvroSerdeUtils.AvroTableProperties;
+import org.apache.hadoop.hive.serde2.avro.AvroSerDe;
Review Comment:
TestHiveMetaStoreAuthorizer imports AvroSerdeUtils but doesn't use it;
unused imports can cause checkstyle failures.
##########
ql/src/java/org/apache/hadoop/hive/ql/security/authorization/AuthorizationUtils.java:
##########
@@ -17,11 +17,18 @@
*/
package org.apache.hadoop.hive.ql.security.authorization;
+import java.net.URI;
+import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.Arrays;
+import java.util.Collection;
import java.util.List;
+import java.util.Set;
+import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.classification.InterfaceAudience.LimitedPrivate;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.api.HiveObjectPrivilege;
Review Comment:
AuthorizationUtils has an unused import (HiveConf) in this change; this can
fail builds when checkstyle forbids unused imports.
##########
serde/src/test/org/apache/hadoop/hive/serde2/avro/TestAvroSerdeUtils.java:
##########
@@ -151,19 +153,103 @@ public void determineSchemaFindsLiterals() throws
Exception {
}
@Test
- public void determineSchemaTriesToOpenUrl() throws AvroSerdeException,
IOException {
+ public void determineSchemaRejectsUnknownSchemeUrl() throws
AvroSerdeException, IOException {
Configuration conf = new Configuration();
Properties props = new Properties();
props.put(AvroTableProperties.SCHEMA_URL.getPropName(),
"not:///a.real.url");
try {
AvroSerdeUtils.determineSchemaOrThrowException(conf, props);
- fail("Should have tried to open that URL");
+ fail("Should have rejected unknown scheme URL");
} catch (AvroSerdeException e) {
- assertEquals("Unable to read schema from given path: not:///a.real.url",
e.getMessage());
+ assertTrue(e.getMessage().contains("not permitted"));
}
}
+ @Test
+ public void determineSchemaRejectsHttpByDefault() throws IOException {
+ Configuration conf = new Configuration();
+ Properties props = new Properties();
+ props.put(AvroTableProperties.SCHEMA_URL.getPropName(),
+ "http://169.254.169.254/latest/meta-data/");
+
+ try {
+ determineSchemaOrThrowException(conf, props);
+ fail("Should have rejected HTTP schema URL");
+ } catch (AvroSerdeException e) {
+ assertTrue(e.getMessage().contains("not permitted"));
+ }
+ }
+
+ @Test
+ public void determineSchemaRejectsHttpsByDefault() throws IOException {
+ Configuration conf = new Configuration();
+ Properties props = new Properties();
+ props.put(AvroTableProperties.SCHEMA_URL.getPropName(),
"https://internal-service/admin");
+
+ try {
+ determineSchemaOrThrowException(conf, props);
+ fail("Should have rejected HTTPS schema URL");
+ } catch (AvroSerdeException e) {
+ assertTrue(e.getMessage().contains("not permitted"));
+ }
+ }
+
+ @Test
+ public void determineSchemaRejectsFileUrl() throws IOException {
+ Configuration conf = new Configuration();
+ Properties props = new Properties();
+ props.put(AvroTableProperties.SCHEMA_URL.getPropName(),
"file:///etc/passwd");
+
+ try {
+ determineSchemaOrThrowException(conf, props);
+ fail("Should have rejected file:// schema URL");
+ } catch (AvroSerdeException e) {
+ assertTrue(e.getMessage().contains("not permitted"));
+ }
+ }
+
+ @Test
+ public void determineSchemaRejectsHttpWhenHostNotAllowlisted() throws
IOException {
+ HiveConf conf = new HiveConf();
+
conf.setBoolVar(HiveConf.ConfVars.HIVE_AVRO_SCHEMA_URL_REMOTE_HTTP_ENABLED,
true);
+ conf.setVar(HiveConf.ConfVars.HIVE_AVRO_SCHEMA_URL_HTTP_ALLOWED_HOSTS,
"schema.example.com");
+ Properties props = new Properties();
+ props.put(AvroTableProperties.SCHEMA_URL.getPropName(),
+ "http://169.254.169.254/latest/meta-data/");
+
+ try {
+ determineSchemaOrThrowException(conf, props);
+ fail("Should have rejected HTTP host not in allowlist");
+ } catch (AvroSerdeException e) {
+ assertTrue(e.getMessage().contains("not in the permitted host list"));
+ }
+ }
+
+ @Test
+ public void determineSchemaAllowsHttpWhenHostAllowlisted() throws
IOException {
+ HiveConf conf = new HiveConf();
+
conf.setBoolVar(HiveConf.ConfVars.HIVE_AVRO_SCHEMA_URL_REMOTE_HTTP_ENABLED,
true);
+ conf.setVar(HiveConf.ConfVars.HIVE_AVRO_SCHEMA_URL_HTTP_ALLOWED_HOSTS,
"schema.example.com");
+ Properties props = new Properties();
+ props.put(AvroTableProperties.SCHEMA_URL.getPropName(),
"http://schema.example.com/schema.avsc");
+
Review Comment:
This test uses an external DNS name (schema.example.com) and may perform a
real network/DNS lookup when HTTP fetch is allowlisted, which can make unit
tests flaky or slow depending on the CI environment. Prefer a loopback host +
an unreachable port to force a fast, deterministic failure.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]