This is an automated email from the ASF dual-hosted git repository.
liuxun pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/gravitino-playground.git
The following commit(s) were added to refs/heads/main by this push:
new a623b65 Add Gravitino LlamaIndex demo (#54)
a623b65 is described below
commit a623b651eb32361282b4c604cc22a150f678b4d7
Author: Jerry Shao <[email protected]>
AuthorDate: Thu Jul 11 19:34:46 2024 +0800
Add Gravitino LlamaIndex demo (#54)
This PR proposes to add Gravitino LlamaIndex demo.
---
.gitignore | 2 +-
README.md | 29 ++
init/jupyter/data/pdfs/Berlin.pdf | Bin 0 -> 2693316 bytes
init/jupyter/data/pdfs/Tokyo.pdf | Bin 0 -> 3274101 bytes
init/jupyter/data/pdfs/Toronto.pdf | Bin 0 -> 3332818 bytes
init/jupyter/gravitino_llamaIndex_demo.ipynb | 562 +++++++++++++++++++++++++++
init/mysql/init.sql | 9 +
init/trino/init.sh | 22 ++
8 files changed, 623 insertions(+), 1 deletion(-)
diff --git a/.gitignore b/.gitignore
index e9c3680..2d4292d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,2 @@
**/.idea
-
+**/.DS_Store
diff --git a/README.md b/README.md
index 863d035..41ec2ff 100644
--- a/README.md
+++ b/README.md
@@ -224,6 +224,35 @@ union
select * from catalog_iceberg.sales.customers;
```
+### Using Gravitino with LlamaIndex
+
+Gravitino playground also provides a simple RAG demo with LlamaIndex. This
demo will show you the
+ability of using Gravitino to manage both tabular and non-tabular dataset,
connecting to
+LlamaIndex as a unified data source, then use LlamaIndex and LLM to query both
tabular and
+non-tabular data with one natural language query.
+
+The demo is located in the `jupyter` folder, you can open the
`gravitino_llama_index_demo.ipynb`
+demo via Jupyter Notebook by [http://localhost:8888](http://localhost:8888).
+
+The scenario of this demo is that basic structured city statistics data is
stored in MySQL, and
+detailed city introductions are stored in PDF files. The user wants to know
the answers to the
+cities both in the structured data and the PDF files.
+
+In this demo, you will use Gravitino to manage the MySQL table using
relational catalog, pdf
+files using fileset catalog, treated Gravitino as a unified data source for
LlamaIndex to build
+indexes on both tabular and non-tabular data. Then you will use LLM to query
the data with natural
+language queries.
+
+Note: to run this demo, you need to set `OPENAI_API_KEY` in the
`gravitino_llama_index_demo.ipynb`,
+like below, `OPENAI_API_BASE` is optional.
+
+```python
+import os
+
+os.environ["OPENAI_API_KEY"] = ""
+os.environ["OPENAI_API_BASE"] = ""
+```
+
## ASF Incubator disclaimer
Apache Gravitino is an effort undergoing incubation at The Apache Software
Foundation (ASF), sponsored by the Apache Incubator. Incubation is required of
all newly accepted projects until a further review indicates that the
infrastructure, communications, and decision making process have stabilized in
a manner consistent with other successful ASF projects. While incubation status
is not necessarily a reflection of the completeness or stability of the code,
it does indicate that the proje [...]
diff --git a/init/jupyter/data/pdfs/Berlin.pdf
b/init/jupyter/data/pdfs/Berlin.pdf
new file mode 100644
index 0000000..9e95426
Binary files /dev/null and b/init/jupyter/data/pdfs/Berlin.pdf differ
diff --git a/init/jupyter/data/pdfs/Tokyo.pdf b/init/jupyter/data/pdfs/Tokyo.pdf
new file mode 100644
index 0000000..a6dbd0c
Binary files /dev/null and b/init/jupyter/data/pdfs/Tokyo.pdf differ
diff --git a/init/jupyter/data/pdfs/Toronto.pdf
b/init/jupyter/data/pdfs/Toronto.pdf
new file mode 100644
index 0000000..7915de3
Binary files /dev/null and b/init/jupyter/data/pdfs/Toronto.pdf differ
diff --git a/init/jupyter/gravitino_llamaIndex_demo.ipynb
b/init/jupyter/gravitino_llamaIndex_demo.ipynb
new file mode 100644
index 0000000..138a631
--- /dev/null
+++ b/init/jupyter/gravitino_llamaIndex_demo.ipynb
@@ -0,0 +1,562 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "B6If5ZEtpHRP"
+ },
+ "source": [
+ "**Demo Architecture**:\n",
+ "\n",
+ "\n",
+ "\n",
+ "catalog_name = \"catalog_fileset\"\n",
+ "catalog_ident = NameIdentifier.of_catalog(metalake_name, catalog_name)\n",
+ "\n",
+ "schema_name = \"countries\"\n",
+ "schema_ident = NameIdentifier.of_schema(metalake_name, catalog_name,
schema_name)\n",
+ "\n",
+ "fileset_name = \"cities\"\n",
+ "fileset_ident = NameIdentifier.of_fileset(metalake_name, catalog_name,
schema_name, fileset_name)\n",
+ "\n",
+ "gravitino_admin_client = GravitinoAdminClient(uri=gravitino_url)\n",
+ "gravitino_client = GravitinoClient(uri=gravitino_url,
metalake_name=metalake_name)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "whTwojVtFCFI"
+ },
+ "outputs": [],
+ "source": [
+ "# NOTE: This is ONLY necessary in jupyter notebook.\n",
+ "# Details: Jupyter runs an event-loop behind the scenes.\n",
+ "# This results in nested event-loops when we start an event-loop
to make async queries.\n",
+ "# This is normally not allowed, we use nest_asyncio to allow it
for convenience.\n",
+ "import nest_asyncio\n",
+ "\n",
+ "nest_asyncio.apply()\n",
+ "\n",
+ "import logging\n",
+ "import sys\n",
+ "\n",
+ "logging.basicConfig(stream=sys.stdout, level=logging.WARN)\n",
+
"logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))\n",
+ "logging.getLogger().setLevel(level=logging.WARN)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 1000
+ },
+ "id": "qYFs2EfBDnUe",
+ "outputId": "4233dc7f-ed38-4414-c3d2-77ff0d06264c"
+ },
+ "outputs": [],
+ "source": [
+ "# Create catalog\n",
+ "demo_catalog = None\n",
+ "try:\n",
+ " demo_catalog = gravitino_client.load_catalog(name=catalog_name)\n",
+ "except Exception as e:\n",
+ " demo_catalog = gravitino_client.create_catalog(name=catalog_name,\n",
+ "
catalog_type=Catalog.Type.FILESET,\n",
+ " comment=\"demo\",\n",
+ " provider=\"hadoop\",\n",
+ " properties={})\n",
+ "\n",
+ "print(demo_catalog)\n",
+ "\n",
+ "# Create schema and fileset\n",
+ "schema_countries = None\n",
+ "try:\n",
+ " schema_countries =
demo_catalog.as_schemas().load_schema(ident=schema_ident)\n",
+ "except Exception as e:\n",
+ " schema_countries =
demo_catalog.as_schemas().create_schema(ident=schema_ident,\n",
+ "
comment=\"countries\",\n",
+ "
properties={})\n",
+ "print(schema_countries)\n",
+ "\n",
+ "fileset_cities = None\n",
+ "try:\n",
+ " fileset_cities =
demo_catalog.as_fileset_catalog().load_fileset(ident=fileset_ident)\n",
+ "except Exception as e:\n",
+ " fileset_cities =
demo_catalog.as_fileset_catalog().create_fileset(ident=fileset_ident,\n",
+ "
fileset_type=Fileset.Type.EXTERNAL,\n",
+ "
comment=\"cities\",\n",
+ "
storage_location=\"/tmp/gravitino/data/pdfs\",\n",
+ "
properties={})\n",
+ "print(fileset_cities)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "rMCRxcGvmdcG"
+ },
+ "source": [
+ "Load the fileset from Gravitino"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "gd1C5EIFlFBn",
+ "outputId": "7c897933-60f0-4ee5-e1d1-4a3549c0b102"
+ },
+ "outputs": [],
+ "source": [
+ "# load fileset catalog and fileset\n",
+ "\n",
+ "loaded_catalog_demo = gravitino_client.load_catalog(name=catalog_name)\n",
+ "print(loaded_catalog_demo)\n",
+ "\n",
+ "loaded_schema_countries =
loaded_catalog_demo.as_schemas().load_schema(ident=schema_ident)\n",
+ "print(loaded_schema_countries)\n",
+ "\n",
+ "loaded_fileset_cities =
loaded_catalog_demo.as_fileset_catalog().load_fileset(ident=fileset_ident)\n",
+ "print(loaded_fileset_cities)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "duptP9gtFCFJ"
+ },
+ "source": [
+ "### Create Database Schema + Test Data\n",
+ "\n",
+ "Here we use Trino with our Gravitino connector to query the data from
table."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "YtB8sDFn_30u",
+ "outputId": "8339c681-b05b-41f3-ed81-9cfdf9d24020"
+ },
+ "outputs": [],
+ "source": [
+ "from sqlalchemy import create_engine\n",
+ "from trino.sqlalchemy import URL\n",
+ "from sqlalchemy.sql.expression import select, text\n",
+ "\n",
+ "trino_engine =
create_engine('trino://admin@trino:8080/catalog_mysql/demo_llamaindex')\n",
+ "\n",
+ "connection = trino_engine.connect();\n",
+ "\n",
+ "with trino_engine.connect() as connection:\n",
+ " cursor = connection.exec_driver_sql(\"SELECT * FROM
catalog_mysql.demo_llamaindex.city_stats\")\n",
+ " print(cursor.fetchall())"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "xFhZCGinFCFK"
+ },
+ "source": [
+ "### Load Data\n",
+ "\n",
+ "We first show how to convert a Document into a set of Nodes, and insert
into a DocumentStore."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "editable": true,
+ "id": "UG_bXR9klnki",
+ "scrolled": true,
+ "slideshow": {
+ "slide_type": ""
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "from llama_index.core import SimpleDirectoryReader\n",
+ "from gravitino import gvfs\n",
+ "\n",
+ "\n",
+ "fs = gvfs.GravitinoVirtualFileSystem(\n",
+ " server_uri=gravitino_url,\n",
+ " metalake_name=metalake_name\n",
+ " )\n",
+ "\n",
+ "fileset_virtual_location =
\"fileset/catalog_fileset/countries/cities\"\n",
+ "\n",
+ "reader = SimpleDirectoryReader(\n",
+ " input_dir=fileset_virtual_location,\n",
+ " fs=fs,\n",
+ " recursive=True)\n",
+ "wiki_docs = reader.load_data()\n",
+ "print(wiki_docs)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "cfehU_YNFCFL"
+ },
+ "source": [
+ "### Build SQL Index"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "zPikkpovFCFL"
+ },
+ "outputs": [],
+ "source": [
+ "from llama_index.core import SQLDatabase\n",
+ "\n",
+ "sql_database = SQLDatabase(trino_engine, include_tables=[\"city_stats\"])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "hWoGhdAwFCFL"
+ },
+ "source": [
+ "### Build Vector Index"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "FSpygguEFCFL"
+ },
+ "outputs": [],
+ "source": [
+ "from llama_index.core import VectorStoreIndex\n",
+ "from llama_index.core import Settings\n",
+ "from llama_index.llms.openai import OpenAI\n",
+ "\n",
+ "\n",
+ "# Insert documents into vector index\n",
+ "# Each document has metadata of the city attached\n",
+ "\n",
+ "vector_indices = {}\n",
+ "vector_query_engines = {}\n",
+ "cities = [\"Toronto\", \"Berlin\", \"Tokyo\"]\n",
+ "\n",
+ "for city, wiki_doc in zip(cities, wiki_docs):\n",
+ " vector_index = VectorStoreIndex.from_documents([wiki_doc])\n",
+ "\n",
+ " query_engine = vector_index.as_query_engine(\n",
+ " similarity_top_k=2, llm=OpenAI(model=\"gpt-3.5-turbo\")\n",
+ " )\n",
+ "\n",
+ " vector_indices[city] = vector_index\n",
+ " vector_query_engines[city] = query_engine"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "xOKVF1RBFCFL"
+ },
+ "source": [
+ "### Define Query Engines, Set as Tools"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "9crZ62XyFCFL"
+ },
+ "outputs": [],
+ "source": [
+ "from llama_index.core.query_engine import NLSQLTableQueryEngine\n",
+ "\n",
+ "sql_query_engine = NLSQLTableQueryEngine(\n",
+ " sql_database=sql_database,\n",
+ " tables=[\"city_stats\"],\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "BdW-muMJFCFL"
+ },
+ "outputs": [],
+ "source": [
+ "from llama_index.core.tools import QueryEngineTool\n",
+ "from llama_index.core.tools import ToolMetadata\n",
+ "from llama_index.core.query_engine import SubQuestionQueryEngine\n",
+ "\n",
+ "query_engine_tools = []\n",
+ "for city in cities:\n",
+ " query_engine = vector_query_engines[city]\n",
+ "\n",
+ " query_engine_tool = QueryEngineTool(\n",
+ " query_engine=query_engine,\n",
+ " metadata=ToolMetadata(\n",
+ " name=city, description=f\"Provides information about
{city}\"\n",
+ " ),\n",
+ " )\n",
+ " query_engine_tools.append(query_engine_tool)\n",
+ "\n",
+ "s_engine = SubQuestionQueryEngine.from_defaults(\n",
+ " query_engine_tools=query_engine_tools,
llm=OpenAI(model=\"gpt-3.5-turbo\")\n",
+ ")\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "hMHicmbSFCFL"
+ },
+ "outputs": [],
+ "source": [
+ "sql_tool = QueryEngineTool.from_defaults(\n",
+ " query_engine=sql_query_engine,\n",
+ " description=(\n",
+ " \"Useful for translating a natural language query into a SQL
query over\"\n",
+ " \" a table containing: city_stats, containing the
population/country of\"\n",
+ " \" each city\"\n",
+ " ),\n",
+ ")\n",
+ "s_engine_tool = QueryEngineTool.from_defaults(\n",
+ " query_engine=s_engine,\n",
+ " description=(\n",
+ " f\"Useful for answering semantic questions about different
cities\"\n",
+ " ),\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "jkRUpu_hFCFL"
+ },
+ "source": [
+ "### Define SQLJoinQueryEngine"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "mpldz-UDFCFM"
+ },
+ "outputs": [],
+ "source": [
+ "from llama_index.core.query_engine import SQLJoinQueryEngine\n",
+ "\n",
+ "query_engine = SQLJoinQueryEngine(\n",
+ " sql_tool, s_engine_tool, llm=OpenAI(model=\"gpt-4\")\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "1uSsikJaFCFM",
+ "outputId": "0b4915a1-460e-405e-b66d-b7cf3226d7dd"
+ },
+ "outputs": [],
+ "source": [
+ "response = query_engine.query(\n",
+ " \"Tell me about the arts and culture of the city with the
highest\"\n",
+ " \" population\"\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "FGNwYX0uFCFM",
+ "outputId": "2a73aee2-077d-4dab-e4be-8e215f462f4a"
+ },
+ "outputs": [],
+ "source": [
+ "print(str(response))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "o8oDWlrCFCFM",
+ "outputId": "f1917229-ae69-4950-89da-9a7ea7c1a35e"
+ },
+ "outputs": [],
+ "source": [
+ "response = query_engine.query(\n",
+ " \"Compare and contrast the demographics of Berlin and Toronto\"\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "l12E2YmaFCFM",
+ "outputId": "009fd446-80f7-4a01-863c-28d5eec7e778"
+ },
+ "outputs": [],
+ "source": [
+ "print(str(response))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "iH278W87W4Ey",
+ "outputId": "95e7e77e-dacd-484e-ae40-5bd3e7991ea1"
+ },
+ "outputs": [],
+ "source": [
+ "response = query_engine.query(\n",
+ " \"Show me the general history of the cities for countries of Japan
and Germany\"\n",
+ ")\n",
+ "\n",
+ "print(response)"
+ ]
+ }
+ ],
+ "metadata": {
+ "colab": {
+ "provenance": []
+ },
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.6"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/init/mysql/init.sql b/init/mysql/init.sql
index 40118c9..d0c09c9 100644
--- a/init/mysql/init.sql
+++ b/init/mysql/init.sql
@@ -18,3 +18,12 @@
*/
GRANT ALL PRIVILEGES on *.* to 'mysql'@'%';
FLUSH PRIVILEGES;
+CREATE DATABASE IF NOT EXISTS `demo_llamaindex`;
+CREATE TABLE IF NOT EXISTS `demo_llamaindex`.`city_stats` (
+ `city_name` text,
+ `population` int DEFAULT NULL,
+ `country` text
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci;
+INSERT INTO `demo_llamaindex`.`city_stats` (city_name, population, country)
VALUES ("Toronto", 2930000, "Canada");
+INSERT INTO `demo_llamaindex`.`city_stats` (city_name, population, country)
VALUES ("Tokyo", 13960000, "Japan");
+INSERT INTO `demo_llamaindex`.`city_stats` (city_name, population, country)
VALUES ("Berlin", 3645000, "Germany");
diff --git a/init/trino/init.sh b/init/trino/init.sh
index c985356..70d6199 100644
--- a/init/trino/init.sh
+++ b/init/trino/init.sh
@@ -72,6 +72,28 @@ else
exit 1
fi
+# Create Postgresql catalog for experience Gravitino service
+curl -X POST -H "Accept: application/vnd.gravitino.v1+json" -H "Content-Type:
application/json" -d '{
+ "name":"catalog_mysql",
+ "type":"RELATIONAL",
+ "provider":"jdbc-mysql",
+ "comment":"comment",
+ "properties":{
+ "jdbc-url":"jdbc:mysql://mysql:3306",
+ "jdbc-user":"mysql",
+ "jdbc-password":"mysql",
+ "jdbc-driver": "com.mysql.cj.jdbc.Driver"
+ }
+}' http://gravitino:8090/api/metalakes/metalake_demo/catalogs
+
+response=$(curl -X GET -H "Content-Type: application/json"
http://gravitino:8090/api/metalakes/metalake_demo/catalogs)
+if echo "$response" | grep -q "catalog_mysql"; then
+ echo "Catalog catalog_mysql successfully created"
+else
+ echo "Catalog catalog_mysql create failed"
+ exit 1
+fi
+
# Create Iceberg catalog for experience Gravitino service
curl -X POST -H "Accept: application/vnd.gravitino.v1+json" -H "Content-Type:
application/json" -d '{
"name":"catalog_iceberg",