Chad Smith has proposed merging ~chad.smith/cloud-init:bug/1800223-retry-imds-on-timeout into cloud-init:master.
Commit message:
azure: retry imds polling on requests.Timeout

There is an infrequent race when the booting instance can hit the IMDS service before it is fully available. This results in a requests.ConnectTimeout being raised. Azure's retry_callback logic now retries on either 404s or Timeouts.

LP: #1800223

Requested reviews:
  Server Team CI bot (server-team-bot): continuous-integration
  cloud-init commiters (cloud-init-dev)

For more details, see: https://code.launchpad.net/~chad.smith/cloud-init/+git/cloud-init/+merge/358112
--
Your team cloud-init commiters is requested to review the proposed merge of ~chad.smith/cloud-init:bug/1800223-retry-imds-on-timeout into cloud-init:master.
diff --git a/cloudinit/sources/DataSourceAzure.py b/cloudinit/sources/DataSourceAzure.py index d0358e9..aaa705c 100644 --- a/cloudinit/sources/DataSourceAzure.py +++ b/cloudinit/sources/DataSourceAzure.py @@ -12,6 +12,7 @@ import json import os import os.path import re +import requests from time import time from xml.dom import minidom import xml.etree.ElementTree as ET @@ -514,7 +515,10 @@ class DataSourceAzure(sources.DataSource): def exc_cb(msg, exception): if isinstance(exception, UrlError) and exception.code == 404: - return True + return True # Continue retries + cause = exception.cause + if cause and isinstance(cause, requests.Timeout): + return True # Continue retries # If we get an exception while trying to call IMDS, we # call DHCP and setup the ephemeral network to acquire the new IP. return False @@ -1170,8 +1174,12 @@ def get_metadata_from_imds(fallback_nic, retries): def _get_metadata_from_imds(retries): def retry_on_url_error(msg, exception): - if isinstance(exception, UrlError) and exception.code == 404: - return True # Continue retries + if isinstance(exception, UrlError): + if exception.code == 404: + return True # Continue retries + cause = exception.cause + if cause and isinstance(cause, requests.Timeout): + return True # Continue retries return False # Stop retries on all other exceptions url = IMDS_URL + "instance?api-version=2017-12-01" diff --git a/tests/unittests/test_datasource/test_azure.py b/tests/unittests/test_datasource/test_azure.py index cd6e7e7..929aa2e 100644 --- a/tests/unittests/test_datasource/test_azure.py +++ b/tests/unittests/test_datasource/test_azure.py @@ -17,6 +17,7 @@ import crypt import httpretty import json import os +import requests import stat import xml.etree.ElementTree as ET import yaml @@ -184,6 +185,35 @@ class TestGetMetadataFromIMDS(HttprettyTestCase): "Crawl of Azure Instance Metadata Service (IMDS) took", # log_time self.logs.getvalue()) + @mock.patch('requests.Session.request') + 
@mock.patch('cloudinit.url_helper.time.sleep') + @mock.patch(MOCKPATH + 'net.is_up') + def test_get_metadata_from_imds_retries_on_timeout( + self, m_net_is_up, m_sleep, m_request): + """Retry IMDS network metadata on timeout errors.""" + + self.attempt = 0 + m_request.side_effect = requests.Timeout('Fake Connection Timeout') + + def retry_callback(request, uri, headers): + self.attempt += 1 + raise requests.Timeout('Fake connection timeout') + + httpretty.register_uri( + httpretty.GET, + dsaz.IMDS_URL + 'instance?api-version=2017-12-01', + body=retry_callback) + + m_net_is_up.return_value = True # skips dhcp + + self.assertEqual({}, dsaz.get_metadata_from_imds('eth9', retries=3)) + + m_net_is_up.assert_called_with('eth9') + self.assertEqual([mock.call(1)]*3, m_sleep.call_args_list) + self.assertIn( + "Crawl of Azure Instance Metadata Service (IMDS) took", # log_time + self.logs.getvalue()) + class TestAzureDataSource(CiTestCase):
_______________________________________________ Mailing list: https://launchpad.net/~cloud-init-dev Post to : cloud-init-dev@lists.launchpad.net Unsubscribe : https://launchpad.net/~cloud-init-dev More help : https://help.launchpad.net/ListHelp