This is an automated email from the ASF dual-hosted git repository.
yufei pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/polaris.git
The following commit(s) were added to refs/heads/main by this push:
new ac01e2de Add getting started docker compose for Jupyter with Spark (#295)
ac01e2de is described below
commit ac01e2de5d38685f47621691ed759e1a1c999f83
Author: Kevin Liu <[email protected]>
AuthorDate: Tue Oct 15 17:56:44 2024 -0400
Add getting started docker compose for Jupyter with Spark (#295)
---
.github/workflows/check-md-link.yml | 2 +-
.gitignore | 4 +-
getting-started/spark/README.md | 45 ++++++++++++++++++++++
.../spark/docker-compose.yml | 8 ++--
.../spark/notebooks}/Dockerfile | 0
.../spark/notebooks}/SparkPolaris.ipynb | 26 ++++++-------
6 files changed, 65 insertions(+), 20 deletions(-)
diff --git a/.github/workflows/check-md-link.yml b/.github/workflows/check-md-link.yml
index 6cdb4195..00a16359 100644
--- a/.github/workflows/check-md-link.yml
+++ b/.github/workflows/check-md-link.yml
@@ -41,5 +41,5 @@ jobs:
with:
use-quiet-mode: 'yes'
config-file: '.github/workflows/check-md-link-config.json'
- folder-path: 'regtests, regtests/client/python/docs, regtests/client/python, .github, build-logic, polaris-core, polaris-service, extension, spec, k8, notebooks'
+ folder-path: 'regtests, regtests/client/python/docs, regtests/client/python, .github, build-logic, polaris-core, polaris-service, extension, spec, k8, getting-started'
file-path: 'CHAT_BYLAWS.md, CODE_OF_CONDUCT.md, CONTRIBUTING.md, README.md SECURITY.md'
diff --git a/.gitignore b/.gitignore
index 3855dceb..62beb3bc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -26,8 +26,8 @@ regtests/output/
/polaris-venv/
/pyproject.toml
-# Notebooks
-notebooks/.ipynb_checkpoints/
+# Notebook Checkpoints
+**/.ipynb_checkpoints/
# Metastore
metastore_db/
diff --git a/getting-started/spark/README.md b/getting-started/spark/README.md
new file mode 100644
index 00000000..55e4f9d9
--- /dev/null
+++ b/getting-started/spark/README.md
@@ -0,0 +1,45 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+
+# Getting Started with Apache Spark and Apache Polaris
+
+This getting started guide provides a `docker-compose` file to set up [Apache Spark](https://spark.apache.org/) with Apache Polaris. Apache Polaris is configured as an Iceberg REST Catalog in Spark.
+A Jupyter notebook is used to run PySpark.
+
+## Run the `docker-compose` file
+To start the services defined in the `docker-compose` file, run this command from the repo's root directory:
+```
+docker-compose -f getting-started/spark/docker-compose.yml up
+```
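+
+If you prefer to run the services in the background, `docker-compose` supports the standard `-d` flag, and `down` stops and removes the containers:
+```
+docker-compose -f getting-started/spark/docker-compose.yml up -d
+docker-compose -f getting-started/spark/docker-compose.yml down
+```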
+
+This will spin up two container services (a quick way to verify they are running is shown below):
+* The `polaris` service for running Apache Polaris using an in-memory metastore
+* The `jupyter` service for running Jupyter notebook with PySpark
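+
+As mentioned above, a quick way to verify that both services are up is `docker-compose ps`:
+```
+docker-compose -f getting-started/spark/docker-compose.yml ps
+```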
+
+## Access the Jupyter notebook interface
+In the Jupyter notebook container log, look for the URL to access the Jupyter notebook. The URL should be in the format `http://127.0.0.1:8888/lab?token=<token>`.
+
+Open the Jupyter notebook in a browser.
+Navigate to [`notebooks/SparkPolaris.ipynb`](http://127.0.0.1:8888/lab/tree/notebooks/SparkPolaris.ipynb) <!-- markdown-link-check-disable-line -->
+
+## Change the Polaris credential
+The Polaris service creates a new root credential on every startup. Find this credential in the Polaris service log and change the `polaris_credential` variable in the first cell of the Jupyter notebook, as shown in the example below.
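+
+For example, the first cell of the notebook parses the credential like this (the value below is a placeholder; paste the one from your log):
+```
+polaris_credential = '<client_id>:<client_secret>'
+client_id, client_secret = polaris_credential.split(":")
+```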
+
+## Run the Jupyter notebook
+You can now run all cells in the notebook or write your own code!
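+
+If you write your own PySpark code against Polaris, a minimal catalog configuration might look like the sketch below. It is illustrative only: the `credential`, `warehouse`, and `scope` settings mirror the notebook, while the remaining options are standard Apache Iceberg REST catalog settings; the notebook builds the complete, working session for you.
+```
+from pyspark.sql import SparkSession
+
+spark = (SparkSession.builder
+    # Register an Iceberg catalog named `polaris` backed by the Polaris REST API
+    .config("spark.sql.catalog.polaris", "org.apache.iceberg.spark.SparkCatalog")
+    .config("spark.sql.catalog.polaris.type", "rest")
+    .config("spark.sql.catalog.polaris.uri", "http://polaris:8181/api/catalog")
+    # Credential obtained from the Polaris log, as described above
+    .config("spark.sql.catalog.polaris.credential", f"{client_id}:{client_secret}")
+    .config("spark.sql.catalog.polaris.warehouse", "polaris_demo")
+    .config("spark.sql.catalog.polaris.scope", "PRINCIPAL_ROLE:ALL")
+    .getOrCreate())
+```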
diff --git a/docker-compose-jupyter.yml b/getting-started/spark/docker-compose.yml
similarity index 88%
rename from docker-compose-jupyter.yml
rename to getting-started/spark/docker-compose.yml
index 97a6d1ce..4bda0320 100644
--- a/docker-compose-jupyter.yml
+++ b/getting-started/spark/docker-compose.yml
@@ -20,7 +20,7 @@
services:
polaris:
build:
- context: .
+ context: ../../
network: host
ports:
- "8181:8181"
@@ -37,8 +37,8 @@ services:
retries: 5
jupyter:
build:
- context: .
- dockerfile: ./notebooks/Dockerfile
+ context: ../../ # this is necessary to expose `regtests/` dir to notebooks/Dockerfile
+ dockerfile: ./getting-started/spark/notebooks/Dockerfile
network: host
ports:
- "8888:8888"
@@ -57,4 +57,4 @@ volumes:
driver_opts:
o: bind
type: none
- device: ./notebooks
+ device: ./notebooks/
diff --git a/notebooks/Dockerfile b/getting-started/spark/notebooks/Dockerfile
similarity index 100%
rename from notebooks/Dockerfile
rename to getting-started/spark/notebooks/Dockerfile
diff --git a/notebooks/SparkPolaris.ipynb b/getting-started/spark/notebooks/SparkPolaris.ipynb
similarity index 95%
rename from notebooks/SparkPolaris.ipynb
rename to getting-started/spark/notebooks/SparkPolaris.ipynb
index 6510c670..deb74e92 100644
--- a/notebooks/SparkPolaris.ipynb
+++ b/getting-started/spark/notebooks/SparkPolaris.ipynb
@@ -21,8 +21,11 @@
"from polaris.catalog.api_client import ApiClient as CatalogApiClient\n",
"from polaris.catalog.api_client import Configuration as
CatalogApiClientConfiguration\n",
"\n",
- "client_id = 'b3b6497353b33ea7'\n",
- "client_secret = '623a67ee71d75825238e3e269df5cdac' # pragma: allowlist
secret\n",
+ "# (CHANGE ME): This credential changes on every Polaris service
restart\n",
+ "# In the Polaris log, look for the `realm: default-realm root principal
credentials:` string\n",
+ "polaris_credential = '35df9f8a34199df0:101b9d35700032416210ad2d39b1b4e3'
# pragma: allowlist secret\n",
+ "\n",
+ "client_id, client_secret = polaris_credential.split(\":\")\n",
"client =
CatalogApiClient(CatalogApiClientConfiguration(username=client_id,\n",
" password=client_secret,\n",
"
host='http://polaris:8181/api/catalog'))\n",
@@ -42,8 +45,7 @@
"source": [
"# Create our first catalog\n",
"\n",
- "* Creates a catalog named `polaris_catalog` that writes to a specified
location in S3.\n",
- "* An AWS IAM role is specified - this role is assumed whenever we read or
write data in the catalog"
+ "* Creates a catalog named `polaris_catalog` that writes to a specified
location in the Local Filesystem."
]
},
{
@@ -59,11 +61,9 @@
"
host='http://polaris:8181/api/management/v1'))\n",
"root_client = PolarisDefaultApi(client)\n",
"\n",
- "storage_conf = AwsStorageConfigInfo(storage_type=\"S3\",\n",
- "
allowed_locations=[\"s3://datalake-storage-team/polaris_test/\"],\n",
- "
role_arn=\"arn:aws:iam::631484165566:role/datalake-storage-integration-role\")\n",
+ "storage_conf = FileStorageConfigInfo(storage_type=\"FILE\",
allowed_locations=[\"file:///tmp\"])\n",
"catalog_name = 'polaris_demo'\n",
- "catalog = Catalog(name=catalog_name, type='INTERNAL',
properties={\"default-base-location\":
\"s3://datalake-storage-team/polaris_test/polaris_catalog\"},\n",
+ "catalog = Catalog(name=catalog_name, type='INTERNAL',
properties={\"default-base-location\": \"file:///tmp/polaris/\"},\n",
" storage_config_info=storage_conf)\n",
"catalog.storage_config_info = storage_conf\n",
"root_client.create_catalog(create_catalog_request=CreateCatalogRequest(catalog=catalog))\n",
@@ -272,7 +272,7 @@
" .config(\"spark.sql.catalog.polaris.credential\",
f\"{engineer_principal.credentials.client_id}:{engineer_principal.credentials.client_secret}\")\n",
"\n",
" # Set the warehouse to the name of the catalog we created\n",
- " .config(\"spark.sql.catalog.polaris.warehouse\", 'polaris_demo')\n",
+ " .config(\"spark.sql.catalog.polaris.warehouse\", catalog_name)\n",
"\n",
" # Scope set to PRINCIPAL_ROLE:ALL\n",
" .config(\"spark.sql.catalog.polaris.scope\", 'PRINCIPAL_ROLE:ALL')\n",
@@ -454,7 +454,7 @@
" return codecs.decode(\"1F\",
\"hex\").decode(\"UTF-8\").join(namespace)\n",
"\n",
"# Call loadTable\n",
- "tbl_meta = collado_client.load_table(prefix='polaris_demo',
namespace=format_namespace(['COLLADO_TEST', 'PUBLIC']), table='TEST_TABLE',
x_iceberg_access_delegation='true')\n",
+ "tbl_meta = collado_client.load_table(prefix=catalog_name,
namespace=format_namespace(['COLLADO_TEST', 'PUBLIC']), table='TEST_TABLE',
x_iceberg_access_delegation='true')\n",
"display(JSON(tbl_meta.to_dict(), expanded=True))"
]
},
@@ -604,7 +604,7 @@
},
"outputs": [],
"source": [
- "tbl_meta = pm_client.load_table(prefix='polaris_demo',
namespace=format_namespace(['COLLADO_TEST', 'PUBLIC']), table='TEST_TABLE',
x_iceberg_access_delegation='true')\n",
+ "tbl_meta = pm_client.load_table(prefix=catalog_name,
namespace=format_namespace(['COLLADO_TEST', 'PUBLIC']), table='TEST_TABLE',
x_iceberg_access_delegation='true')\n",
"display(JSON(tbl_meta.to_dict(), expanded=True))"
]
},
@@ -632,7 +632,7 @@
},
"outputs": [],
"source": [
- "pm_client.drop_table(prefix='polaris_demo',
namespace=format_namespace(['COLLADO_TEST', 'PUBLIC']), table='TEST_TABLE')"
+ "pm_client.drop_table(prefix=catalog_name,
namespace=format_namespace(['COLLADO_TEST', 'PUBLIC']), table='TEST_TABLE')"
]
},
{
@@ -775,7 +775,7 @@
"# The ops_client fails to do any real damage even though the engineer
normally has DROP_TABLE privileges\n",
"ops_client =
IcebergCatalogAPI(CatalogApiClient(CatalogApiClientConfiguration(access_token=ops_token.access_token,\n",
" host='http://polaris:8181/api/catalog')))\n",
- "ops_client.drop_table(prefix='polaris_demo',
namespace=format_namespace(['COLLADO_TEST', 'PUBLIC']), table='TEST_TABLE')"
+ "ops_client.drop_table(prefix=catalog_name,
namespace=format_namespace(['COLLADO_TEST', 'PUBLIC']), table='TEST_TABLE')"
]
}
],