This is an automated email from the ASF dual-hosted git repository.

stigahuang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 3bdf1d264809cc903aa25d461f48cd7885c08604
Author: Andrew Sherman <[email protected]>
AuthorDate: Mon Aug 19 14:45:49 2024 -0700

    IMPALA-13310 Add the value of the http 'X-Forwarded-For' header to the 
runtime profile
    
    When using hs2-http protocol, http messages from Impala clients may pass
    through one or more proxies before reaching the Impala coordinator.
    This can make it harder to track the origin of the http messages. The
    'X-Forwarded-For' header is added to or edited by HTTP proxies when
    forwarding a request, so it may contain multiple source addresses. Add
    the value of this header to the runtime profile so that it can be
    observed.
    
    Impala will truncate the 'X-Forwarded-For' header value at 8096
    characters. Apart from this, Impala does not do any verification or
    sanitization of this value, so its value should only be trusted if the
    deployment environment protects against spoofing.
    
    A good reference for understanding the use of 'X-Forwarded-For' is
    https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Forwarded-For
    
    This patch does not address the cases where http proxies insert
    multiple 'X-Forwarded-For' headers. This issue is tracked in
    IMPALA-13335.
    
    TESTING: add an option '--hs2_x_forward' to impala-shell which will
    set the 'X-Forwarded-For' header. Add tests which verify that the value
    is set in the profile, and that a long value is truncated correctly.
    
    Change-Id: I2e010cfb09674c5d043ef915347c3836696e03cf
    Reviewed-on: http://gerrit.cloudera.org:8080/21700
    Reviewed-by: Impala Public Jenkins <[email protected]>
    Tested-by: Impala Public Jenkins <[email protected]>
---
 be/src/rpc/authentication.cc           | 11 +++++++++++
 be/src/rpc/thrift-server.h             |  3 +++
 be/src/service/client-request-state.cc |  5 +++++
 be/src/service/impala-hs2-server.cc    |  1 +
 be/src/service/impala-server.h         |  4 ++++
 be/src/transport/THttpServer.cpp       | 11 ++++++++++-
 be/src/transport/THttpServer.h         |  9 +++++++++
 shell/impala_client.py                 |  6 +++++-
 shell/impala_shell.py                  |  8 +++++---
 shell/option_parser.py                 |  4 ++++
 tests/shell/test_shell_commandline.py  | 30 ++++++++++++++++++++++++++++++
 11 files changed, 87 insertions(+), 5 deletions(-)

diff --git a/be/src/rpc/authentication.cc b/be/src/rpc/authentication.cc
index 7fc8c2c74..21031e52b 100644
--- a/be/src/rpc/authentication.cc
+++ b/be/src/rpc/authentication.cc
@@ -636,6 +636,12 @@ bool GetUsernameFromBasicAuthHeader(
   return true;
 }
 
+bool SetOrigin(
+    ThriftServer::ConnectionContext* connection_context, const std::string& 
origin) {
+  connection_context->http_origin = origin;
+  return false;
+}
+
 bool TrustedDomainCheck(ThriftServer::ConnectionContext* connection_context,
     const AuthenticationHash& hash, const std::string& origin, string 
auth_header) {
   if (!IsTrustedDomain(origin, FLAGS_trusted_domain,
@@ -1429,6 +1435,8 @@ void SecureAuthProvider::SetupConnectionContext(
         callbacks.trusted_auth_header_handle_fn = std::bind(
             HandleTrustedAuthHeader, connection_ptr.get(), hash_, 
std::placeholders::_1);
       }
+      callbacks.set_http_origin_fn = std::bind(SetOrigin, connection_ptr.get(),
+          std::placeholders::_1);
       http_input_transport->setCallbacks(callbacks);
       http_output_transport->setCallbacks(callbacks);
       socket = 
down_cast<TSocket*>(http_input_transport->getUnderlyingTransport().get());
@@ -1490,9 +1498,12 @@ void NoAuthProvider::SetupConnectionContext(
           HttpPathFn, connection_ptr.get(), "", std::placeholders::_1,
           std::placeholders::_2, std::placeholders::_3);
       callbacks.return_headers_fn = std::bind(ReturnHeaders, 
connection_ptr.get());
+      callbacks.set_http_origin_fn = std::bind(SetOrigin, connection_ptr.get(),
+          std::placeholders::_1);
       http_input_transport->setCallbacks(callbacks);
       http_output_transport->setCallbacks(callbacks);
       socket = 
down_cast<TSocket*>(http_input_transport->getUnderlyingTransport().get());
+
       break;
     }
     default:
diff --git a/be/src/rpc/thrift-server.h b/be/src/rpc/thrift-server.h
index da2b21180..ddb1917de 100644
--- a/be/src/rpc/thrift-server.h
+++ b/be/src/rpc/thrift-server.h
@@ -106,6 +106,9 @@ class ThriftServer {
     Username username;
     Username do_as_user;
     TNetworkAddress network_address;
+    /// If using hs2-http protocol, this is the origin of the session
+    /// as recorded in the X-Forwarded-For http message header.
+    std::string http_origin;
     std::string server_name;
     /// Used to pass HTTP headers generated by the input transport to the 
output transport
     /// to be returned.
diff --git a/be/src/service/client-request-state.cc 
b/be/src/service/client-request-state.cc
index a4a84ca7b..c91037392 100644
--- a/be/src/service/client-request-state.cc
+++ b/be/src/service/client-request-state.cc
@@ -175,6 +175,11 @@ ClientRequestState::ClientRequestState(const TQueryCtx& 
query_ctx, Frontend* fro
   summary_profile_->AddInfoString("Delegated User", do_as_user());
   summary_profile_->AddInfoString("Network Address",
       TNetworkAddressToString(session_->network_address));
+  if (!session_->http_origin.empty()) {
+    /// If using hs2-http protocol, this is the origin of the session
+    /// as recorded in the X-Forwarded-For http message header.
+    summary_profile_->AddInfoString("Http Origin", session_->http_origin);
+  }
   summary_profile_->AddInfoString("Default Db", default_db());
   summary_profile_->AddInfoStringRedacted(
       "Sql Statement", query_ctx_.client_request.stmt);
diff --git a/be/src/service/impala-hs2-server.cc 
b/be/src/service/impala-hs2-server.cc
index ab3cce89d..8d3dab78e 100644
--- a/be/src/service/impala-hs2-server.cc
+++ b/be/src/service/impala-hs2-server.cc
@@ -343,6 +343,7 @@ void ImpalaServer::OpenSession(TOpenSessionResp& return_val,
     state->session_type = TSessionType::HIVESERVER2;
   }
   state->network_address = 
ThriftServer::GetThreadConnectionContext()->network_address;
+  state->http_origin = ThriftServer::GetThreadConnectionContext()->http_origin;
   state->last_accessed_ms = UnixMillis();
   // request.client_protocol is not guaranteed to be a valid 
TProtocolVersion::type, so
   // loading it can cause undefined behavior. Instead, we copy it to a value 
of the
diff --git a/be/src/service/impala-server.h b/be/src/service/impala-server.h
index 0dba61c5c..197fb3330 100644
--- a/be/src/service/impala-server.h
+++ b/be/src/service/impala-server.h
@@ -619,6 +619,10 @@ class ImpalaServer : public ImpalaServiceIf,
     /// Client network address.
     TNetworkAddress network_address;
 
+    /// If using hs2-http protocol, this is the origin of the session
+    /// as recorded in the X-Forwarded-For http message header.
+    std::string http_origin;
+
     /// Protects all fields below. See "Locking" in the class comment for lock
     /// acquisition order.
     std::mutex lock;
diff --git a/be/src/transport/THttpServer.cpp b/be/src/transport/THttpServer.cpp
index 3a1f5d60a..26481e781 100644
--- a/be/src/transport/THttpServer.cpp
+++ b/be/src/transport/THttpServer.cpp
@@ -267,6 +267,15 @@ void THttpServer::headersDone() {
         << (header_x_query_id_.empty() ? "" : " x-query-id=" + 
header_x_query_id_);
   }
 
+  // Trim and truncate the value of the 'X-Forwarded-For' header.
+  string origin = origin_;
+  StripWhiteSpace(&origin);
+  if (origin.length() > MAX_X_FORWARDED_HEADER_LENGTH) {
+    origin = origin.substr(0, MAX_X_FORWARDED_HEADER_LENGTH);
+  }
+  // Store the truncated value of the 'X-Forwarded-For' header in the 
Connection Context.
+  callbacks_.set_http_origin_fn(origin);
+
   if (!has_ldap_ && !has_kerberos_ && !has_saml_ && !has_jwt_) {
     // We don't need to authenticate.
     resetAuthState();
@@ -350,7 +359,7 @@ void THttpServer::headersDone() {
 
     if (!authorized && !fallback_to_other_auths) {
       // Do not fallback to other auth mechanisms, as the client probably 
expects
-      // only SAML related respsonses.
+      // only SAML related responses.
       if (metrics_enabled_) 
http_metrics_->total_saml_auth_failure_->Increment(1);
       resetAuthState();
       returnUnauthorized();
diff --git a/be/src/transport/THttpServer.h b/be/src/transport/THttpServer.h
index acd877443..d86e3248a 100644
--- a/be/src/transport/THttpServer.h
+++ b/be/src/transport/THttpServer.h
@@ -106,6 +106,11 @@ public:
     std::function<bool(const std::string&, std::string)> 
trusted_domain_check_fn =
         [&](const std::string&, std::string) { return false; };
 
+    // Function that stores the connection's 'X-Forwarded-For' header in the 
Connection
+    // Context so that it can be tracked.
+    std::function<bool(std::string)> set_http_origin_fn =
+        [&](const std::string) { return false; };
+
     // Function that takes the connection's 'Authorization' header and returns 
true if
     // the basic auth header contains a valid username.
     std::function<bool(std::string)> trusted_auth_header_handle_fn = 
[&](std::string) {
@@ -248,6 +253,10 @@ protected:
 
   // The value from the 'X-Impala-Query-Id' header.
   std::string header_x_query_id_ = "";
+
+  // The maximum length of the 'X-Forwarded-For' header that will be stored in 
the runtime
+  // Profile.
+  static const int MAX_X_FORWARDED_HEADER_LENGTH = 8096;
 };
 
 /**
diff --git a/shell/impala_client.py b/shell/impala_client.py
index 962d8c7ad..39e8d397f 100755
--- a/shell/impala_client.py
+++ b/shell/impala_client.py
@@ -137,7 +137,7 @@ class ImpalaClient(object):
                verbose=True, use_http_base_transport=False, http_path=None,
                http_cookie_names=None, http_socket_timeout_s=None, 
value_converter=None,
                connect_max_tries=4, rpc_stdout=False, rpc_file=None, 
http_tracing=True,
-               jwt=None):
+               jwt=None, hs2_x_forward=None):
     self.connected = False
     self.impalad_host = impalad[0]
     self.impalad_port = int(impalad[1])
@@ -172,6 +172,8 @@ class ImpalaClient(object):
     self.value_converter = value_converter
     self.rpc_stdout = rpc_stdout
     self.rpc_file = rpc_file
+    # In h2s-http clients only, the value of the X-Forwarded-For http header.
+    self.hs2_x_forward = hs2_x_forward
 
   def connect(self):
     """Creates a connection to an Impalad instance. Returns a tuple with the 
impala
@@ -748,6 +750,8 @@ class ImpalaHS2Client(ImpalaClient):
       assert getattr(self, "_current_request_id", None) is not None, \
         "request id was not set"
       headers["X-Request-Id"] = self._current_request_id
+    if self.hs2_x_forward:
+      headers["X-Forwarded-For"] = self.hs2_x_forward
 
     return headers
 
diff --git a/shell/impala_shell.py b/shell/impala_shell.py
index 982525bd7..34dd83783 100755
--- a/shell/impala_shell.py
+++ b/shell/impala_shell.py
@@ -258,7 +258,7 @@ class ImpalaShell(cmd.Cmd, object):
     self.orig_cmd = None
 
     # Tracks query handle of the last query executed. Used by the 'profile' 
command.
-    self.last_query_handle = None;
+    self.last_query_handle = None
 
     # live_summary and live_progress are turned off in strict_hs2_protocol mode
     if options.strict_hs2_protocol:
@@ -282,6 +282,7 @@ class ImpalaShell(cmd.Cmd, object):
     self.fetch_size = options.fetch_size
     self.http_cookie_names = options.http_cookie_names
     self.http_tracing = not options.no_http_tracing
+    self.hs2_x_forward = options.hs2_x_forward
 
     # Due to a readline bug in centos/rhel7, importing it causes control 
characters to be
     # printed. This breaks any scripting against the shell in non-interactive 
mode. Since
@@ -647,7 +648,7 @@ class ImpalaShell(cmd.Cmd, object):
                           http_cookie_names=self.http_cookie_names,
                           value_converter=value_converter, 
rpc_stdout=self.rpc_stdout,
                           rpc_file=self.rpc_file, 
http_tracing=self.http_tracing,
-                          jwt=self.jwt)
+                          jwt=self.jwt, hs2_x_forward=self.hs2_x_forward)
     if protocol == 'hs2':
       return ImpalaHS2Client(self.impalad, self.fetch_size, 
self.kerberos_host_fqdn,
                           self.use_kerberos, self.kerberos_service_name, 
self.use_ssl,
@@ -668,7 +669,8 @@ class ImpalaShell(cmd.Cmd, object):
                           value_converter=value_converter,
                           connect_max_tries=self.connect_max_tries,
                           rpc_stdout=self.rpc_stdout, rpc_file=self.rpc_file,
-                          http_tracing=self.http_tracing, jwt=self.jwt)
+                          http_tracing=self.http_tracing, jwt=self.jwt,
+                          hs2_x_forward=self.hs2_x_forward)
     elif protocol == 'beeswax':
       return ImpalaBeeswaxClient(self.impalad, self.fetch_size, 
self.kerberos_host_fqdn,
                           self.use_kerberos, self.kerberos_service_name, 
self.use_ssl,
diff --git a/shell/option_parser.py b/shell/option_parser.py
index a587596ce..6f9c9669e 100755
--- a/shell/option_parser.py
+++ b/shell/option_parser.py
@@ -352,6 +352,10 @@ def get_option_parser(defaults):
                     "values when using the HS2 protocol. The default behaviour 
makes the "
                     "values handled by Python's str() built-in method. Use 
'16G' to "
                     "match the Beeswax protocol's floating-point output 
format.")
+  parser.add_option("--hs2_x_forward", type="str",
+                    dest="hs2_x_forward", default=None,
+                    help="When using the hs2-http protocol, set this value in 
the "
+                    "X-Forwarded-For header. This is primarily for testing 
purposes.")
 
   # add default values to the help text
   for option in parser.option_list:
diff --git a/tests/shell/test_shell_commandline.py 
b/tests/shell/test_shell_commandline.py
index 03ea7b96a..401560473 100644
--- a/tests/shell/test_shell_commandline.py
+++ b/tests/shell/test_shell_commandline.py
@@ -1553,6 +1553,36 @@ class TestImpalaShell(ImpalaTestSuite):
 
     assert expected_py2 in result.stdout or expected_py3 in result.stdout
 
+  def test_hs2_x_forward(self, vector):
+    """Test that when we use the --hs2_x_forward flag with hs2-http
+    protocol then the X-Forwarded-For http header is set, and this
+    value is reported in the 'Http Origin' field of the profile."""
+    if vector.get_value('strict_hs2_protocol'):
+      pytest.skip("Skip as no profile with strict protocol.")
+    expect_header = vector.get_value('protocol') == 'hs2-http'
+    args = ['-q', 'select 1;profile;',
+            '--hs2_x_forward=a_forwarded_origin_header 
b_forwarded_origin_header']
+    result = run_impala_shell_cmd(vector, args)
+    found_http_origin = ("Http Origin: a_forwarded_origin_header "
+                         "b_forwarded_origin_header") in result.stdout
+    assert found_http_origin == expect_header
+
+  def test_hs2_x_forward_long(self, vector):
+    """Test that when we use the --hs2_x_forward flag with hs2-http
+    protocol then when passing a very long string, the 'Http Origin'
+    field of the profile is truncated."""
+    if vector.get_value('strict_hs2_protocol'):
+      pytest.skip("Skip as no profile with strict protocol.")
+    expect_header = vector.get_value('protocol') == 'hs2-http'
+    long_string = 'a' * 10000
+    expected_string = 'a' * 8096
+    args = ['-q', 'select 1;profile;',
+            '--hs2_x_forward={0}'.format(long_string)]
+    result = run_impala_shell_cmd(vector, args)
+    found_http_origin = ("Http Origin: {0}".format(expected_string)) in 
result.stdout
+    assert found_http_origin == expect_header
+    assert long_string not in result.stdout
+
   def test_output_rpc_to_screen_and_file(self, vector, populated_table, 
tmp_file):
     """Tests the flags that output hs2 rpc call details to both stdout
     and a file.  Asserts the expected text is written."""

Reply via email to