This is an automated email from the ASF dual-hosted git repository.
ianmcook pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-experiments.git
The following commit(s) were added to refs/heads/main by this push:
new 559d7e4 Use HTTP/1.1 and implement chunked transfer encoding in
simple Python server example (#12)
559d7e4 is described below
commit 559d7e480dd17e4eda664bc1fe4714125f0b0d39
Author: Ian Cook <[email protected]>
AuthorDate: Sun Mar 10 21:52:19 2024 -0400
Use HTTP/1.1 and implement chunked transfer encoding in simple Python
server example (#12)
* Use HTTP/1.1 in Python server example
* Interrupt gracefully
* Improve comments
* Implement chunked transfer encoding in Python server example
* Improve READMEs
* Update >2 GB workaround
* Support HTTP/1.0 and HTTP/1.1
* Formatting
---
http/get_simple/README.md | 2 ++
http/get_simple/python/server/README.md | 3 ++
http/get_simple/python/server/server.py | 59 ++++++++++++++++++++++++++-------
3 files changed, 52 insertions(+), 12 deletions(-)
diff --git a/http/get_simple/README.md b/http/get_simple/README.md
index 65738a2..e641b0c 100644
--- a/http/get_simple/README.md
+++ b/http/get_simple/README.md
@@ -23,6 +23,8 @@ This directory contains a set of minimal examples of HTTP
clients and servers im
- How a client can send a GET request to a server and receive a response from
the server containing an Arrow IPC stream of record batches.
- How a server can respond to a GET request from a client and send the client
a response containing an Arrow IPC stream of record batches.
+The examples here assume that the server cannot determine the exact length in
bytes of the full Arrow IPC stream before sending it, so they cannot set the
`Content-Length` header or serve Range requests.
+
To enable performance comparisons to Arrow Flight RPC, the server examples
generate the data in exactly the same way as in
[`flight_benchmark.cc`](https://github.com/apache/arrow/blob/7346bdffbdca36492089f6160534bfa2b81bad90/cpp/src/arrow/flight/flight_benchmark.cc#L194-L245)
as cited in the [original blog post introducing Flight
RPC](https://arrow.apache.org/blog/2019/10/13/introducing-arrow-flight/). But
note that the Flight example sends four concurrent streams.
If you are collaborating on the set of examples in this directory, please
follow these guidelines:
diff --git a/http/get_simple/python/server/README.md
b/http/get_simple/python/server/README.md
index 18bc738..ee45903 100644
--- a/http/get_simple/python/server/README.md
+++ b/http/get_simple/python/server/README.md
@@ -30,3 +30,6 @@ To run this example:
pip install pyarrow
python server.py
```
+
+> [!NOTE]
+> This example uses Python's built-in
[`http.server`](https://docs.python.org/3/library/http.server.html) module.
This server does not implement chunked transfer encoding automatically like
more sophisticated HTTP servers do, so this example implements it manually,
with each chunk consisting of one Arrow record batch. Note that in servers that
implement chunked transfer encoding automatically, each chunk will generally
not correspond to one Arrow record batch.
diff --git a/http/get_simple/python/server/server.py
b/http/get_simple/python/server/server.py
index 0f4469d..03113fc 100644
--- a/http/get_simple/python/server/server.py
+++ b/http/get_simple/python/server/server.py
@@ -20,6 +20,9 @@ from random import randbytes
from http.server import BaseHTTPRequestHandler, HTTPServer
import io
+# use chunked transfer encoding?
+chunked_encoding = True
+
schema = pa.schema([
('a', pa.int64()),
('b', pa.int64()),
@@ -58,8 +61,6 @@ def make_reader(schema, batches):
def generate_batches(schema, reader):
with io.BytesIO() as sink, pa.ipc.new_stream(sink, schema) as writer:
- yield sink.getvalue()
-
for batch in reader:
sink.seek(0)
sink.truncate(0)
@@ -73,37 +74,71 @@ def generate_batches(schema, reader):
class MyServer(BaseHTTPRequestHandler):
def do_GET(self):
+
+ if self.request_version == 'HTTP/1.0':
+ self.protocol_version = 'HTTP/1.0'
+ chunked = False
+ else:
+ self.protocol_version = 'HTTP/1.1'
+ chunked = chunked_encoding
+
+ self.close_connection = True
self.send_response(200)
self.send_header('Content-Type', 'application/vnd.apache.arrow.stream')
- # set these headers if testing with a local browser-based client:
-
+ ### set these headers if testing with a local browser-based client:
#self.send_header('Access-Control-Allow-Origin',
'http://localhost:8000')
#self.send_header('Access-Control-Allow-Methods', 'GET')
#self.send_header('Access-Control-Allow-Headers', 'Content-Type')
+ ### set this header to make browsers download the file with a name and
extension:
+ #self.send_header('Content-Disposition', 'attachment;
filename="data.arrows"')
+
+ if chunked:
+ self.send_header('Transfer-Encoding', 'chunked')
+
self.end_headers()
for buffer in generate_batches(schema, make_reader(schema, batches)):
+ if chunked:
+
self.wfile.write('{:X}\r\n'.format(len(buffer)).encode('utf-8'))
self.wfile.write(buffer)
+ if chunked:
+ self.wfile.write('\r\n'.encode('utf-8'))
self.wfile.flush()
- # if any record batch could be larger than 2 GB, split it
- # into chunks before passing to self.wfile.write() by
- # replacing the two lines above with this:
-
+ ### if any record batch could be larger than 2 GB, Python's
+ ### http.server will error when calling self.wfile.write(),
+ ### so you will need to split them into smaller chunks by
+ ### replacing the six lines above with this:
#chunk_size = int(2e9)
#chunk_splits = len(buffer) // chunk_size
#for i in range(chunk_splits):
+ # if chunked:
+ #
self.wfile.write('{:X}\r\n'.format(chunk_size).encode('utf-8'))
# self.wfile.write(buffer[i * chunk_size:i * chunk_size +
chunk_size])
+ # if chunked:
+ # self.wfile.write('\r\n'.encode('utf-8'))
# self.wfile.flush()
+ #last_chunk_size = len(buffer) - (chunk_splits * chunk_size)
+ #if chunked:
+ #
self.wfile.write('{:X}\r\n'.format(last_chunk_size).encode('utf-8'))
#self.wfile.write(buffer[chunk_splits * chunk_size:])
+ #if chunked:
+ # self.wfile.write('\r\n'.encode('utf-8'))
#self.wfile.flush()
+
+ if chunked:
+ self.wfile.write('0\r\n\r\n'.encode('utf-8'))
+ self.wfile.flush()
batches = GetPutData()
server_address = ('localhost', 8000)
-httpd = HTTPServer(server_address, MyServer)
-
-print(f'Serving on {server_address[0]}:{server_address[1]}...')
-httpd.serve_forever()
+try:
+ httpd = HTTPServer(server_address, MyServer)
+ print(f'Serving on {server_address[0]}:{server_address[1]}...')
+ httpd.serve_forever()
+except KeyboardInterrupt:
+ print('Shutting down server')
+ httpd.socket.close()