Colin Watson has proposed merging ~cjwatson/launchpad:librarian-timeouts into 
launchpad:master.

Commit message:
Add a socket timeout for librarian client connections

Requested reviews:
  Launchpad code reviewers (launchpad-reviewers)

For more details, see:
https://code.launchpad.net/~cjwatson/launchpad/+git/launchpad/+merge/433219

The internal librarian client creates a socket with no timeout, so if anything 
goes wrong with the connection then it only recovers by being killed manually.  
Add a socket timeout so that it gives up eventually.  (Note that this timeout 
applies to individual operations on the socket, not to uploading or downloading 
entire files.)

The default timeout is deliberately quite generous to minimize the chance of 
causing failures due to connections being merely slow rather than stuck, 
although we may still find we need to tune it.  This isn't intended to ensure 
that librarian requests fit within timeout budgets set for web requests or 
similar, but just as a backstop to ensure that we don't get stuck waiting 
forever.
-- 
Your team Launchpad code reviewers is requested to review the proposed merge of 
~cjwatson/launchpad:librarian-timeouts into launchpad:master.
diff --git a/lib/lp/services/config/schema-lazr.conf b/lib/lp/services/config/schema-lazr.conf
index 640865d..ca906a6 100644
--- a/lib/lp/services/config/schema-lazr.conf
+++ b/lib/lp/services/config/schema-lazr.conf
@@ -1269,6 +1269,10 @@ authentication_endpoint: none
 # authserver.
 authentication_timeout: 5
 
+# The timeout in seconds for librarian client socket operations.
+# datatype: integer
+client_socket_timeout: 60
+
 
 [librarian_gc]
 # The database user which will be used by this process.
diff --git a/lib/lp/services/librarian/client.py b/lib/lp/services/librarian/client.py
index 7be460f..083074a 100644
--- a/lib/lp/services/librarian/client.py
+++ b/lib/lp/services/librarian/client.py
@@ -83,6 +83,7 @@ class FileUploadClient:
         """
         try:
             self.state.s = socket.socket(AF_INET, SOCK_STREAM)
+            self.state.s.settimeout(config.librarian.client_socket_timeout)
             self.state.s.connect((self.upload_host, self.upload_port))
             self.state.f = self.state.s.makefile("rwb", 0)
 
@@ -96,13 +97,16 @@ class FileUploadClient:
 
     def _close(self):
         """Close connection"""
-        self.state.s_poll.unregister(self.state.s.fileno())
-        self.state.s_poll.close()
-        del self.state.s_poll
-        self.state.s.close()
-        del self.state.s
-        self.state.f.close()
-        del self.state.f
+        if hasattr(self.state, "s_poll"):
+            self.state.s_poll.unregister(self.state.s.fileno())
+            self.state.s_poll.close()
+            del self.state.s_poll
+        if hasattr(self.state, "s"):
+            self.state.s.close()
+            del self.state.s
+        if hasattr(self.state, "f"):
+            self.state.f.close()
+            del self.state.f
 
     def _checkError(self):
         poll_result = self.state.s_poll.poll(self.s_poll_timeout)
@@ -171,8 +175,9 @@ class FileUploadClient:
             LibraryFileContent,
         )
 
-        self._connect()
         try:
+            self._connect()
+
             # Get the name of the database the client is using, so that
             # the server can check that the client is using the same
             # database as the server.
@@ -259,6 +264,12 @@ class FileUploadClient:
                 aliasID,
             )
             return aliasID
+        except socket.timeout:
+            timeout = config.librarian.client_socket_timeout
+            raise UploadFailed(
+                "Server timed out after %s %s"
+                % (timeout, "second" if timeout == 1 else "seconds")
+            )
         finally:
             self._close()
 
diff --git a/lib/lp/services/librarian/tests/test_client.py b/lib/lp/services/librarian/tests/test_client.py
index aec3fb9..b5d3984 100644
--- a/lib/lp/services/librarian/tests/test_client.py
+++ b/lib/lp/services/librarian/tests/test_client.py
@@ -32,6 +32,7 @@ from lp.services.librarian.client import (
 from lp.services.librarian.interfaces.client import UploadFailed
 from lp.services.librarian.model import LibraryFileAlias
 from lp.services.propertycache import cachedproperty
+from lp.services.timeout import override_timeout, with_timeout
 from lp.testing import TestCase
 from lp.testing.layers import (
     DatabaseLayer,
@@ -231,6 +232,22 @@ class EchoServer(threading.Thread):
         self.socket.close()
 
 
+class TimingOutServer(EchoServer):
+    """Fake TCP server that never sends a reply."""
+
+    def run(self):
+        while not self.should_stop:
+            try:
+                conn, addr = self.socket.accept()
+                self.connections.append(conn)
+            except socket.timeout:
+                pass
+        for conn in self.connections:
+            conn.close()
+        self.connections = []
+        self.socket.close()
+
+
 class LibrarianClientTestCase(TestCase):
     layer = LaunchpadFunctionalLayer
 
@@ -307,6 +324,36 @@ class LibrarianClientTestCase(TestCase):
             "text/plain",
         )
 
+    def test_addFile_fails_when_server_times_out(self):
+        server = TimingOutServer()
+        server.start()
+        self.addCleanup(server.join)
+
+        upload_host, upload_port = server.socket.getsockname()
+        self.pushConfig(
+            "librarian",
+            upload_host=upload_host,
+            upload_port=upload_port,
+            client_socket_timeout=1,
+        )
+
+        client = LibrarianClient()
+
+        @with_timeout()
+        def add_file():
+            self.assertRaisesWithContent(
+                UploadFailed,
+                "Server timed out after 1 second",
+                client.addFile,
+                "sample.txt",
+                6,
+                io.BytesIO(b"sample"),
+                "text/plain",
+            )
+
+        with override_timeout(3600):
+            add_file()
+
     def test_addFile_uses_primary(self):
         # addFile is a write operation, so it should always use the
         # primary store, even if the standby is the default. Close the
_______________________________________________
Mailing list: https://launchpad.net/~launchpad-reviewers
Post to     : [email protected]
Unsubscribe : https://launchpad.net/~launchpad-reviewers
More help   : https://help.launchpad.net/ListHelp

Reply via email to