This is an automated email from the ASF dual-hosted git repository.

ianmcook pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-experiments.git


The following commit(s) were added to refs/heads/main by this push:
     new 559d7e4  Use HTTP/1.1 and implement chunked transfer encoding in 
simple Python server example (#12)
559d7e4 is described below

commit 559d7e480dd17e4eda664bc1fe4714125f0b0d39
Author: Ian Cook <[email protected]>
AuthorDate: Sun Mar 10 21:52:19 2024 -0400

    Use HTTP/1.1 and implement chunked transfer encoding in simple Python 
server example (#12)
    
    * Use HTTP/1.1 in Python server example
    
    * Interrupt gracefully
    
    * Improve comments
    
    * Implement chunked transfer encoding in Python server example
    
    * Improve READMEs
    
    * Update >2 GB workaround
    
    * Support HTTP/1.0 and HTTP/1.1
    
    * Formatting
---
 http/get_simple/README.md               |  2 ++
 http/get_simple/python/server/README.md |  3 ++
 http/get_simple/python/server/server.py | 59 ++++++++++++++++++++++++++-------
 3 files changed, 52 insertions(+), 12 deletions(-)

diff --git a/http/get_simple/README.md b/http/get_simple/README.md
index 65738a2..e641b0c 100644
--- a/http/get_simple/README.md
+++ b/http/get_simple/README.md
@@ -23,6 +23,8 @@ This directory contains a set of minimal examples of HTTP 
clients and servers im
 - How a client can send a GET request to a server and receive a response from 
the server containing an Arrow IPC stream of record batches.
 - How a server can respond to a GET request from a client and send the client 
a response containing an Arrow IPC stream of record batches.
 
+The examples here assume that the server cannot determine the exact length in 
bytes of the full Arrow IPC stream before sending it, so they cannot set the 
`Content-Length` header or serve Range requests.
+
 To enable performance comparisons to Arrow Flight RPC, the server examples 
generate the data in exactly the same way as in 
[`flight_benchmark.cc`](https://github.com/apache/arrow/blob/7346bdffbdca36492089f6160534bfa2b81bad90/cpp/src/arrow/flight/flight_benchmark.cc#L194-L245)
 as cited in the [original blog post introducing Flight 
RPC](https://arrow.apache.org/blog/2019/10/13/introducing-arrow-flight/). But 
note that the Flight example sends four concurrent streams.
 
 If you are collaborating on the set of examples in this directory, please 
follow these guidelines:
diff --git a/http/get_simple/python/server/README.md 
b/http/get_simple/python/server/README.md
index 18bc738..ee45903 100644
--- a/http/get_simple/python/server/README.md
+++ b/http/get_simple/python/server/README.md
@@ -30,3 +30,6 @@ To run this example:
 pip install pyarrow
 python server.py
 ```
+
+> [!NOTE]  
+> This example uses Python's built-in 
[`http.server`](https://docs.python.org/3/library/http.server.html) module. 
This server does not implement chunked transfer encoding automatically like 
more sophisticated HTTP servers do, so this example implements it manually, 
with each chunk consisting of one Arrow record batch. Note that in servers that 
implement chunked transfer encoding automatically, each chunk will generally 
not correspond to one Arrow record batch.
diff --git a/http/get_simple/python/server/server.py 
b/http/get_simple/python/server/server.py
index 0f4469d..03113fc 100644
--- a/http/get_simple/python/server/server.py
+++ b/http/get_simple/python/server/server.py
@@ -20,6 +20,9 @@ from random import randbytes
 from http.server import BaseHTTPRequestHandler, HTTPServer
 import io
 
+# use chunked transfer encoding?
+chunked_encoding = True
+
 schema = pa.schema([
     ('a', pa.int64()),
     ('b', pa.int64()),
@@ -58,8 +61,6 @@ def make_reader(schema, batches):
 
 def generate_batches(schema, reader):
     with io.BytesIO() as sink, pa.ipc.new_stream(sink, schema) as writer:
-        yield sink.getvalue()
-        
         for batch in reader:
             sink.seek(0)
             sink.truncate(0)
@@ -73,37 +74,71 @@ def generate_batches(schema, reader):
  
 class MyServer(BaseHTTPRequestHandler):
     def do_GET(self):
+
+        if self.request_version == 'HTTP/1.0':
+            self.protocol_version = 'HTTP/1.0'
+            chunked = False
+        else:
+            self.protocol_version = 'HTTP/1.1'
+            chunked = chunked_encoding
+        
+        self.close_connection = True
         self.send_response(200)
         self.send_header('Content-Type', 'application/vnd.apache.arrow.stream')
         
-        # set these headers if testing with a local browser-based client:
-        
+        ### set these headers if testing with a local browser-based client:
         #self.send_header('Access-Control-Allow-Origin', 
'http://localhost:8000')
         #self.send_header('Access-Control-Allow-Methods', 'GET')
         #self.send_header('Access-Control-Allow-Headers', 'Content-Type')
         
+        ### set this header to make browsers download the file with a name and 
extension:
+        #self.send_header('Content-Disposition', 'attachment; 
filename="data.arrows"')
+        
+        if chunked:
+            self.send_header('Transfer-Encoding', 'chunked')
+        
         self.end_headers()
         
         for buffer in generate_batches(schema, make_reader(schema, batches)):
+            if chunked:
+                
self.wfile.write('{:X}\r\n'.format(len(buffer)).encode('utf-8'))
             self.wfile.write(buffer)
+            if chunked:
+                self.wfile.write('\r\n'.encode('utf-8'))
             self.wfile.flush()
             
-            # if any record batch could be larger than 2 GB, split it
-            # into chunks before passing to self.wfile.write() by 
-            # replacing the two lines above with this:
-            
+            ### if any record batch could be larger than 2 GB, Python's
+            ### http.server will error when calling self.wfile.write(),
+            ### so you will need to split them into smaller chunks by 
+            ### replacing the six lines above with this:
             #chunk_size = int(2e9)
             #chunk_splits = len(buffer) // chunk_size
             #for i in range(chunk_splits):
+            #    if chunked:
+            #        
self.wfile.write('{:X}\r\n'.format(chunk_size).encode('utf-8'))
             #    self.wfile.write(buffer[i * chunk_size:i * chunk_size + 
chunk_size])
+            #    if chunked:
+            #        self.wfile.write('\r\n'.encode('utf-8'))
             #    self.wfile.flush()
+            #last_chunk_size = len(buffer) - (chunk_splits * chunk_size)
+            #if chunked:
+            #    
self.wfile.write('{:X}\r\n'.format(last_chunk_size).encode('utf-8'))
             #self.wfile.write(buffer[chunk_splits * chunk_size:])
+            #if chunked:
+            #    self.wfile.write('\r\n'.encode('utf-8'))
             #self.wfile.flush()
+        
+        if chunked:
+            self.wfile.write('0\r\n\r\n'.encode('utf-8'))
+            self.wfile.flush()
 
 batches = GetPutData()
 
 server_address = ('localhost', 8000)
-httpd = HTTPServer(server_address, MyServer)
-
-print(f'Serving on {server_address[0]}:{server_address[1]}...')
-httpd.serve_forever()
+try:
+    httpd = HTTPServer(server_address, MyServer)
+    print(f'Serving on {server_address[0]}:{server_address[1]}...')
+    httpd.serve_forever()
+except KeyboardInterrupt:
+    print('Shutting down server')
+    httpd.socket.close()

Reply via email to