This is an automated email from the ASF dual-hosted git repository.

yufei pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/polaris.git


The following commit(s) were added to refs/heads/main by this push:
     new ac01e2de Add getting started docker compose for Jupyter with Spark(#295)
ac01e2de is described below

commit ac01e2de5d38685f47621691ed759e1a1c999f83
Author: Kevin Liu <[email protected]>
AuthorDate: Tue Oct 15 17:56:44 2024 -0400

    Add getting started docker compose for Jupyter with Spark(#295)
---
 .github/workflows/check-md-link.yml                |  2 +-
 .gitignore                                         |  4 +-
 getting-started/spark/README.md                    | 45 ++++++++++++++++++++++
 .../spark/docker-compose.yml                       |  8 ++--
 .../spark/notebooks}/Dockerfile                    |  0
 .../spark/notebooks}/SparkPolaris.ipynb            | 26 ++++++-------
 6 files changed, 65 insertions(+), 20 deletions(-)

diff --git a/.github/workflows/check-md-link.yml b/.github/workflows/check-md-link.yml
index 6cdb4195..00a16359 100644
--- a/.github/workflows/check-md-link.yml
+++ b/.github/workflows/check-md-link.yml
@@ -41,5 +41,5 @@ jobs:
       with:
         use-quiet-mode: 'yes'
         config-file: '.github/workflows/check-md-link-config.json'
-        folder-path: 'regtests, regtests/client/python/docs, regtests/client/python, .github, build-logic, polaris-core, polaris-service, extension, spec, k8, notebooks'
+        folder-path: 'regtests, regtests/client/python/docs, regtests/client/python, .github, build-logic, polaris-core, polaris-service, extension, spec, k8, getting-started'
         file-path: 'CHAT_BYLAWS.md, CODE_OF_CONDUCT.md, CONTRIBUTING.md, README.md SECURITY.md'
diff --git a/.gitignore b/.gitignore
index 3855dceb..62beb3bc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -26,8 +26,8 @@ regtests/output/
 /polaris-venv/
 /pyproject.toml
 
-# Notebooks
-notebooks/.ipynb_checkpoints/
+# Notebook Checkpoints
+**/.ipynb_checkpoints/
 
 # Metastore
 metastore_db/
diff --git a/getting-started/spark/README.md b/getting-started/spark/README.md
new file mode 100644
index 00000000..55e4f9d9
--- /dev/null
+++ b/getting-started/spark/README.md
@@ -0,0 +1,45 @@
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+ 
+   http://www.apache.org/licenses/LICENSE-2.0
+ 
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+
+# Getting Started with Apache Spark and Apache Polaris
+
+This getting started guide provides a `docker-compose` file to set up [Apache Spark](https://spark.apache.org/) with Apache Polaris. Apache Polaris is configured as an Iceberg REST Catalog in Spark.
+A Jupyter notebook is used to run PySpark.
+
+## Run the `docker-compose` file
+To start the services defined in the `docker-compose` file, run this command from the repo's root directory:
+```
+docker-compose -f getting-started/spark/docker-compose.yml up 
+```
+
+This will spin up two container services:
+* The `polaris` service for running Apache Polaris using an in-memory metastore
+* The `jupyter` service for running Jupyter notebook with PySpark
+
+## Access the Jupyter notebook interface
+In the Jupyter notebook container log, look for the URL to access the Jupyter notebook. The URL should be in the format `http://127.0.0.1:8888/lab?token=<token>`.
+
+Open the Jupyter notebook in a browser.
+Navigate to [`notebooks/SparkPolaris.ipynb`](http://127.0.0.1:8888/lab/tree/notebooks/SparkPolaris.ipynb) <!-- markdown-link-check-disable-line -->
+
+## Change the Polaris credential
+The Polaris service creates a new root credential on every startup. Find this credential in the Polaris service log and set the `polaris_credential` variable in the first cell of the Jupyter notebook.
+
+## Run the Jupyter notebook
+You can now run all cells in the notebook or write your own code!
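
As a quick sanity check that the stack is up and the root credential works, you can exchange the credential for an access token before opening the notebook. This is a minimal sketch, not part of this commit: it assumes Polaris exposes the standard Iceberg REST `v1/oauth/tokens` endpoint under `/api/catalog` (the same base path the notebook uses), and the credential value is a placeholder to copy from the service log.

```
# Sanity check (sketch): exchange the root credential from the Polaris
# service log for an access token via the Iceberg REST OAuth endpoint.
import requests

polaris_credential = "<client_id>:<client_secret>"  # placeholder, copy from the log
client_id, client_secret = polaris_credential.split(":")

resp = requests.post(
    "http://127.0.0.1:8181/api/catalog/v1/oauth/tokens",
    data={
        "grant_type": "client_credentials",
        "client_id": client_id,
        "client_secret": client_secret,
        "scope": "PRINCIPAL_ROLE:ALL",  # same scope the notebook's Spark config uses
    },
)
resp.raise_for_status()
print("token acquired:", resp.json()["access_token"][:12], "...")
```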
diff --git a/docker-compose-jupyter.yml b/getting-started/spark/docker-compose.yml
similarity index 88%
rename from docker-compose-jupyter.yml
rename to getting-started/spark/docker-compose.yml
index 97a6d1ce..4bda0320 100644
--- a/docker-compose-jupyter.yml
+++ b/getting-started/spark/docker-compose.yml
@@ -20,7 +20,7 @@
 services:
   polaris:
     build:
-      context: .
+      context: ../../
       network: host
     ports:
       - "8181:8181"
@@ -37,8 +37,8 @@ services:
       retries: 5
   jupyter:
     build:
-      context: .
-      dockerfile: ./notebooks/Dockerfile
+      context: ../../ # this is necessary to expose `regtests/` dir to notebooks/Dockerfile
+      dockerfile: ./getting-started/spark/notebooks/Dockerfile
       network: host
     ports:
       - "8888:8888"
@@ -57,4 +57,4 @@ volumes:
     driver_opts:
       o: bind
       type: none
-      device: ./notebooks
+      device: ./notebooks/
diff --git a/notebooks/Dockerfile b/getting-started/spark/notebooks/Dockerfile
similarity index 100%
rename from notebooks/Dockerfile
rename to getting-started/spark/notebooks/Dockerfile
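
In the notebook diff that follows, the hardcoded client id and secret are replaced by a single `polaris_credential` string that is split at runtime. Stripped of the JSON cell escaping, the new first cell amounts to roughly this (a sketch: the `polaris.catalog` client is the repo's regtests Python package, and the credential value is a placeholder):

```
from polaris.catalog.api_client import ApiClient as CatalogApiClient
from polaris.catalog.api_client import Configuration as CatalogApiClientConfiguration

# (CHANGE ME): copy the `root principal credentials` pair from the Polaris log
polaris_credential = "<client_id>:<client_secret>"  # pragma: allowlist secret
client_id, client_secret = polaris_credential.split(":")

# Authenticate against the catalog API inside the compose network
client = CatalogApiClient(CatalogApiClientConfiguration(
    username=client_id,
    password=client_secret,
    host="http://polaris:8181/api/catalog"))
```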
diff --git a/notebooks/SparkPolaris.ipynb b/getting-started/spark/notebooks/SparkPolaris.ipynb
similarity index 95%
rename from notebooks/SparkPolaris.ipynb
rename to getting-started/spark/notebooks/SparkPolaris.ipynb
index 6510c670..deb74e92 100644
--- a/notebooks/SparkPolaris.ipynb
+++ b/getting-started/spark/notebooks/SparkPolaris.ipynb
@@ -21,8 +21,11 @@
     "from polaris.catalog.api_client import ApiClient as CatalogApiClient\n",
     "from polaris.catalog.api_client import Configuration as 
CatalogApiClientConfiguration\n",
     "\n",
-    "client_id = 'b3b6497353b33ea7'\n",
-    "client_secret = '623a67ee71d75825238e3e269df5cdac' # pragma: allowlist 
secret\n",
+    "# (CHANGE ME): This credential changes on every Polaris service 
restart\n",
+    "# In the Polaris log, look for the `realm: default-realm root principal 
credentials:` string\n",
+    "polaris_credential = '35df9f8a34199df0:101b9d35700032416210ad2d39b1b4e3' 
# pragma: allowlist secret\n",
+    "\n",
+    "client_id, client_secret = polaris_credential.split(\":\")\n",
     "client = 
CatalogApiClient(CatalogApiClientConfiguration(username=client_id,\n",
     "                                 password=client_secret,\n",
     "                                 
host='http://polaris:8181/api/catalog'))\n",
@@ -42,8 +45,7 @@
    "source": [
     "# Create our first catalog\n",
     "\n",
-    "* Creates a catalog named `polaris_catalog` that writes to a specified 
location in S3.\n",
-    "* An AWS IAM role is specified - this role is assumed whenever we read or 
write data in the catalog"
+    "* Creates a catalog named `polaris_catalog` that writes to a specified 
location in the Local Filesystem."
    ]
   },
   {
@@ -59,11 +61,9 @@
     "                                   host='http://polaris:8181/api/management/v1'))\n",
     "root_client = PolarisDefaultApi(client)\n",
     "\n",
-    "storage_conf = AwsStorageConfigInfo(storage_type=\"S3\",\n",
-    "                                  allowed_locations=[\"s3://datalake-storage-team/polaris_test/\"],\n",
-    "                                  role_arn=\"arn:aws:iam::631484165566:role/datalake-storage-integration-role\")\n",
+    "storage_conf = FileStorageConfigInfo(storage_type=\"FILE\", allowed_locations=[\"file:///tmp\"])\n",
     "catalog_name = 'polaris_demo'\n",
-    "catalog = Catalog(name=catalog_name, type='INTERNAL', properties={\"default-base-location\": \"s3://datalake-storage-team/polaris_test/polaris_catalog\"},\n",
+    "catalog = Catalog(name=catalog_name, type='INTERNAL', properties={\"default-base-location\": \"file:///tmp/polaris/\"},\n",
     "                storage_config_info=storage_conf)\n",
     "catalog.storage_config_info = storage_conf\n",
     "root_client.create_catalog(create_catalog_request=CreateCatalogRequest(catalog=catalog))\n",
@@ -272,7 +272,7 @@
     "  .config(\"spark.sql.catalog.polaris.credential\", 
f\"{engineer_principal.credentials.client_id}:{engineer_principal.credentials.client_secret}\")\n",
     "\n",
     "  # Set the warehouse to the name of the catalog we created\n",
-    "  .config(\"spark.sql.catalog.polaris.warehouse\", 'polaris_demo')\n",
+    "  .config(\"spark.sql.catalog.polaris.warehouse\", catalog_name)\n",
     "\n",
     "  # Scope set to PRINCIPAL_ROLE:ALL\n",
     "  .config(\"spark.sql.catalog.polaris.scope\", 'PRINCIPAL_ROLE:ALL')\n",
@@ -454,7 +454,7 @@
     "  return codecs.decode(\"1F\", 
\"hex\").decode(\"UTF-8\").join(namespace)\n",
     "\n",
     "# Call loadTable\n",
-    "tbl_meta = collado_client.load_table(prefix='polaris_demo', 
namespace=format_namespace(['COLLADO_TEST', 'PUBLIC']), table='TEST_TABLE', 
x_iceberg_access_delegation='true')\n",
+    "tbl_meta = collado_client.load_table(prefix=catalog_name, 
namespace=format_namespace(['COLLADO_TEST', 'PUBLIC']), table='TEST_TABLE', 
x_iceberg_access_delegation='true')\n",
     "display(JSON(tbl_meta.to_dict(), expanded=True))"
    ]
   },
@@ -604,7 +604,7 @@
    },
    "outputs": [],
    "source": [
-    "tbl_meta = pm_client.load_table(prefix='polaris_demo', 
namespace=format_namespace(['COLLADO_TEST', 'PUBLIC']), table='TEST_TABLE', 
x_iceberg_access_delegation='true')\n",
+    "tbl_meta = pm_client.load_table(prefix=catalog_name, 
namespace=format_namespace(['COLLADO_TEST', 'PUBLIC']), table='TEST_TABLE', 
x_iceberg_access_delegation='true')\n",
     "display(JSON(tbl_meta.to_dict(), expanded=True))"
    ]
   },
@@ -632,7 +632,7 @@
    },
    "outputs": [],
    "source": [
-    "pm_client.drop_table(prefix='polaris_demo', 
namespace=format_namespace(['COLLADO_TEST', 'PUBLIC']), table='TEST_TABLE')"
+    "pm_client.drop_table(prefix=catalog_name, 
namespace=format_namespace(['COLLADO_TEST', 'PUBLIC']), table='TEST_TABLE')"
    ]
   },
   {
@@ -775,7 +775,7 @@
     "# The ops_client fails to do any real damage even though the engineer 
normally has DROP_TABLE privileges\n",
     "ops_client = 
IcebergCatalogAPI(CatalogApiClient(CatalogApiClientConfiguration(access_token=ops_token.access_token,\n",
     "              host='http://polaris:8181/api/catalog')))\n",
-    "ops_client.drop_table(prefix='polaris_demo', 
namespace=format_namespace(['COLLADO_TEST', 'PUBLIC']), table='TEST_TABLE')"
+    "ops_client.drop_table(prefix=catalog_name, 
namespace=format_namespace(['COLLADO_TEST', 'PUBLIC']), table='TEST_TABLE')"
    ]
   }
  ],
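
Put together, the reworked catalog-creation cell from the hunks above amounts to roughly the following (a sketch: the `polaris.management` import path is an assumption based on the regtests client layout, and `client_id`/`client_secret` come from the first cell shown earlier):

```
# Assumed import path for the management client generated under regtests/
from polaris.management import (ApiClient, Configuration, PolarisDefaultApi,
                                Catalog, CreateCatalogRequest, FileStorageConfigInfo)

# Management API client, authenticated with the same root credential as above
client = ApiClient(Configuration(username=client_id, password=client_secret,
                                 host="http://polaris:8181/api/management/v1"))
root_client = PolarisDefaultApi(client)

# FILE storage keeps the demo self-contained: no S3 bucket or IAM role needed
storage_conf = FileStorageConfigInfo(storage_type="FILE",
                                     allowed_locations=["file:///tmp"])
catalog_name = "polaris_demo"
catalog = Catalog(name=catalog_name, type="INTERNAL",
                  properties={"default-base-location": "file:///tmp/polaris/"},
                  storage_config_info=storage_conf)
root_client.create_catalog(create_catalog_request=CreateCatalogRequest(catalog=catalog))
```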
