This is an automated email from the ASF dual-hosted git repository.

ianmcook pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-experiments.git


The following commit(s) were added to refs/heads/main by this push:
     new 46d5547  [http] Add indirect HTTP GET examples (#44)
46d5547 is described below

commit 46d5547cf4ae1185ecc07b0e1edbd5a9f0991661
Author: Ian Cook <[email protected]>
AuthorDate: Mon Dec 2 12:42:18 2024 -0500

    [http] Add indirect HTTP GET examples (#44)
    
    * Add indirect examples
    
    * Bugfix
    
    * Dedup host in curl example
    
    * Improve examples based on review feedback
    
    * Fix formatting
    
    * Improve README
    
    * Improve READMEs
---
 http/get_indirect/README.md                     |  3 ++
 http/get_indirect/curl/.gitignore               | 18 +++++++
 http/get_indirect/{ => curl/client}/README.md   |  8 +--
 http/get_indirect/curl/client/client.sh         | 28 ++++++++++
 http/get_indirect/python/.gitignore             | 18 +++++++
 http/get_indirect/{ => python/client}/README.md | 16 ++++--
 http/get_indirect/python/client/client.py       | 70 +++++++++++++++++++++++++
 http/get_indirect/python/server/README.md       | 53 +++++++++++++++++++
 http/get_indirect/python/server/server.py       | 53 +++++++++++++++++++
 9 files changed, 259 insertions(+), 8 deletions(-)

diff --git a/http/get_indirect/README.md b/http/get_indirect/README.md
index e92fd42..7599c30 100644
--- a/http/get_indirect/README.md
+++ b/http/get_indirect/README.md
@@ -22,3 +22,6 @@
 This directory contains examples of HTTP clients and servers that use a two-step sequence to retrieve Arrow data:
 1. The client sends a GET request to a server and receives a JSON response from the server containing one or more server URIs.
 2. The client sends GET requests to each of those URIs and receives a response from each server containing an Arrow IPC stream of record batches (exactly as in the [simple GET examples](https://github.com/apache/arrow-experiments/tree/main/http/get_simple)).
+
+> [!IMPORTANT]  
+> The structure of the JSON document in these examples is an illustration, not a recommendation. Developers should use JSON document structures appropriate to the needs of their applications.
diff --git a/http/get_indirect/curl/.gitignore b/http/get_indirect/curl/.gitignore
new file mode 100644
index 0000000..28cc6d4
--- /dev/null
+++ b/http/get_indirect/curl/.gitignore
@@ -0,0 +1,18 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+*.arrows
diff --git a/http/get_indirect/README.md b/http/get_indirect/curl/client/README.md
similarity index 59%
copy from http/get_indirect/README.md
copy to http/get_indirect/curl/client/README.md
index e92fd42..d1a016b 100644
--- a/http/get_indirect/README.md
+++ b/http/get_indirect/curl/client/README.md
@@ -17,8 +17,8 @@
   under the License.
 -->
 
-# HTTP GET Arrow Data: Indirect Examples
+# HTTP GET Arrow Data: Indirect curl Client Example
 
-This directory contains examples of HTTP clients and servers that use a two-step sequence to retrieve Arrow data:
-1. The client sends a GET request to a server and receives a JSON response from the server containing one or more server URIs.
-2. The client sends GET requests to each of those URIs and receives a response from each server containing an Arrow IPC stream of record batches (exactly as in the [simple GET examples](https://github.com/apache/arrow-experiments/tree/main/http/get_simple)).
+This directory contains an example of a series of shell commands that use `curl` and `jq` to:
+1. Send a GET request to the server to get a JSON listing of the URIs of a set of `.arrows` files.
+2. Send GET requests to download each of the `.arrows` files from the server to files in the current directory.
diff --git a/http/get_indirect/curl/client/client.sh b/http/get_indirect/curl/client/client.sh
new file mode 100644
index 0000000..d03d53b
--- /dev/null
+++ b/http/get_indirect/curl/client/client.sh
@@ -0,0 +1,28 @@
+#!/bin/sh
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+# Use curl to get a JSON document containing URIs to
+# Arrow stream files, then use jq to extract the URIs
+uris=$(curl -s -S http://localhost:8008/ | jq -r '.arrow_stream_files[].uri')
+
+# Use curl to download the files from the URIs in parallel
+if [ -n "$uris" ]; then
+    curl --parallel --remote-name-all $(print $uris)
+fi
diff --git a/http/get_indirect/python/.gitignore b/http/get_indirect/python/.gitignore
new file mode 100644
index 0000000..28cc6d4
--- /dev/null
+++ b/http/get_indirect/python/.gitignore
@@ -0,0 +1,18 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+*.arrows
diff --git a/http/get_indirect/README.md b/http/get_indirect/python/client/README.md
similarity index 55%
copy from http/get_indirect/README.md
copy to http/get_indirect/python/client/README.md
index e92fd42..193f5bb 100644
--- a/http/get_indirect/README.md
+++ b/http/get_indirect/python/client/README.md
@@ -17,8 +17,16 @@
   under the License.
 -->
 
-# HTTP GET Arrow Data: Indirect Examples
+# HTTP GET Arrow Data: Indirect Python Client Example with Requests
 
-This directory contains examples of HTTP clients and servers that use a two-step sequence to retrieve Arrow data:
-1. The client sends a GET request to a server and receives a JSON response from the server containing one or more server URIs.
-2. The client sends GET requests to each of those URIs and receives a response from each server containing an Arrow IPC stream of record batches (exactly as in the [simple GET examples](https://github.com/apache/arrow-experiments/tree/main/http/get_simple)).
+This directory contains an example of an HTTP client implemented in Python using the [Requests](https://requests.readthedocs.io/) library. The client:
+1. Sends a GET request to the server to get a JSON listing of the URIs of available `.arrows` files.
+2. Sends GET requests to download each of the `.arrows` files from the server.
+3. Loads the contents of each file into an in-memory PyArrow Table.
+
+To run this example, first start one of the indirect server examples in the parent directory, then:
+
+```sh
+pip install requests pyarrow
+python client.py
+```
diff --git a/http/get_indirect/python/client/client.py b/http/get_indirect/python/client/client.py
new file mode 100644
index 0000000..8b7f55b
--- /dev/null
+++ b/http/get_indirect/python/client/client.py
@@ -0,0 +1,70 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import requests
+import json
+import os
+import pyarrow as pa
+
+
+HOST = "http://localhost:8008/"
+
+JSON_FORMAT = "application/json"
+ARROW_STREAM_FORMAT = "application/vnd.apache.arrow.stream"
+
+json_response = requests.get(HOST)
+
+response_status = json_response.status_code
+if not response_status == 200:
+    raise ValueError(f"Expected response status 200, got {response_status}")
+
+content_type = json_response.headers.get("Content-Type", "")
+if not content_type.startswith(JSON_FORMAT):
+    raise ValueError(f"Expected content type {JSON_FORMAT}, got {content_type}")
+
+print("Downloaded JSON file listing.")
+
+parsed_data = json_response.json()
+uris = [file["uri"] for file in parsed_data["arrow_stream_files"]]
+
+if not all(uri.endswith(".arrows") for uri in uris):
+    raise ValueError(f"Some listed files do not have extension '.arrows'")
+
+print(f"Parsed JSON and found {len(uris)} Arrow stream files.")
+
+tables = {}
+
+for uri in uris:
+    arrow_response = requests.get(uri)
+
+    response_status = arrow_response.status_code
+    if not response_status == 200:
+        raise ValueError(f"Expected response status 200, got {response_status}")
+
+    content_type = arrow_response.headers.get("Content-Type", "")
+    if not content_type.startswith(ARROW_STREAM_FORMAT):
+        raise ValueError(f"Expected content type {ARROW_STREAM_FORMAT}, got {content_type}")
+    
+    filename = os.path.basename(uri)
+
+    print(f"Downloaded file '{filename}'.")
+
+    tablename = os.path.splitext(filename)[0]
+    with pa.ipc.open_stream(arrow_response.content) as reader:
+        tables[tablename] = reader.read_all()
+
+    print(f"Loaded into in-memory Arrow table '{tablename}'.")
diff --git a/http/get_indirect/python/server/README.md b/http/get_indirect/python/server/README.md
new file mode 100644
index 0000000..9780ed8
--- /dev/null
+++ b/http/get_indirect/python/server/README.md
@@ -0,0 +1,53 @@
+<!---
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+
+# HTTP GET Arrow Data: Indirect Python Server Example
+
+This directory contains an example of an HTTP server implemented in Python using the built-in [`http.server`](https://docs.python.org/3/library/http.server.html) module. The server:
+1. Listens for HTTP GET requests from clients.
+2. Upon receiving a GET request for the document root, serve a JSON document that lists the URIs of all the `.arrows` files in the current directory.
+3. Upon receiving a GET request for a specific `.arrows` file, serve that file.
+
+To run this example, first copy two `.arrows` files from the `data` section of this repository into the current directory:
+
+```sh
+cp ../../../../data/arrow-commits/arrow-commits.arrows .
+cp ../../../../data/rand-many-types/random.arrows .
+```
+
+Then start the HTTP server:
+
+```sh
+python server.py
+```
+
+In this example, the JSON document listing the URIs of the `.arrows` files is structured as shown below. **This JSON structure is provided for example purposes only. It is not a recommendation.** Developers should use JSON document structures appropriate to the needs of their applications.
+
+```json
+{
+    "arrow_stream_files": [
+        {
+            "uri": "http://127.0.0.1:8008/random.arrows"
+        },
+        {
+            "uri": "http://127.0.0.1:8008/arrow-commits.arrows"
+        }
+    ]
+}
+```
diff --git a/http/get_indirect/python/server/server.py b/http/get_indirect/python/server/server.py
new file mode 100644
index 0000000..053b8ef
--- /dev/null
+++ b/http/get_indirect/python/server/server.py
@@ -0,0 +1,53 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from http.server import SimpleHTTPRequestHandler, HTTPServer
+import json
+import os
+import mimetypes
+
+mimetypes.add_type("application/vnd.apache.arrow.stream", ".arrows")
+
+class MyServer(SimpleHTTPRequestHandler):
+    def list_directory(self, path):
+        host, port = self.server.server_address
+
+        try:
+            file_paths = [
+                f for f in os.listdir(path)
+                if f.endswith(".arrows") and os.path.isfile(os.path.join(path, f))
+            ]
+        except OSError:
+            self.send_error(404, "No permission to list directory")
+            return None
+
+        file_uris = [f"http://{host}:{port}{self.path}{f}" for f in file_paths]
+        uris_doc = {"arrow_stream_files": [{"uri": f} for f in file_uris]}
+        self.send_response(200)
+        self.send_header("Content-Type", "application/json")
+        self.end_headers()
+        self.wfile.write(json.dumps(uris_doc, indent=4).encode("utf-8"))
+        return None
+
+server_address = ("localhost", 8008)
+try:
+    httpd = HTTPServer(server_address, MyServer)
+    print(f"Serving on {server_address[0]}:{server_address[1]}...")
+    httpd.serve_forever()
+except KeyboardInterrupt:
+    print("Shutting down server")
+    httpd.socket.close()

Reply via email to