This is an automated email from the ASF dual-hosted git repository.

pvillard pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/nifi.git


The following commit(s) were added to refs/heads/main by this push:
     new 2d82cdc0f5 NIFI-12366 Add HuggingFace support to Pinecone processors
2d82cdc0f5 is described below

commit 2d82cdc0f5ccdd7ee6e98ef1e90b9ceee84c1085
Author: krisztina-zsihovszki <zsikr...@gmail.com>
AuthorDate: Wed Nov 8 17:31:59 2023 +0100

    NIFI-12366 Add HuggingFace support to Pinecone processors
    
    Signed-off-by: Pierre Villard <pierre.villard...@gmail.com>
    
    This closes #8026.
---
 .../src/main/python/vectorstores/EmbeddingUtils.py | 29 ++++++++++++--
 .../src/main/python/vectorstores/PutPinecone.py    | 45 +++++++++++++++++-----
 .../src/main/python/vectorstores/QueryPinecone.py  | 40 +++++++++++++++----
 3 files changed, 94 insertions(+), 20 deletions(-)

diff --git 
a/nifi-python-extensions/nifi-text-embeddings-module/src/main/python/vectorstores/EmbeddingUtils.py
 
b/nifi-python-extensions/nifi-text-embeddings-module/src/main/python/vectorstores/EmbeddingUtils.py
index b305942da8..9b0218c9c0 100644
--- 
a/nifi-python-extensions/nifi-text-embeddings-module/src/main/python/vectorstores/EmbeddingUtils.py
+++ 
b/nifi-python-extensions/nifi-text-embeddings-module/src/main/python/vectorstores/EmbeddingUtils.py
@@ -14,6 +14,9 @@
 # limitations under the License.
 
 from nifiapi.properties import PropertyDescriptor, StandardValidators, 
PropertyDependency, ExpressionLanguageScope
+from langchain.embeddings.openai import OpenAIEmbeddings
+from langchain.embeddings.huggingface import HuggingFaceInferenceAPIEmbeddings
+
 
 # Embedding Functions
 ONNX_ALL_MINI_LM_L6_V2 = "ONNX all-MiniLM-L6-v2 Model"
@@ -99,13 +102,19 @@ SENTENCE_TRANSFORMER_MODEL_NAME = PropertyDescriptor(
 )
 SENTENCE_TRANSFORMER_DEVICE = PropertyDescriptor(
     name="Sentence Transformer Device Type",
-    description="The type of device to use for performing the embeddings using 
the Sentence Transformer, such as 'cpu', 'cuda', 'mps', 'cuda:0', etc. If not 
specified, a GPU will be used if "
-                + "possible, otherwise a CPU.",
+    description="""The type of device to use for performing the embeddings 
using the Sentence Transformer, such as 'cpu', 'cuda', 'mps', 'cuda:0', etc. 
+                   If not specified, a GPU will be used if possible, otherwise 
a CPU.""",
     validators=[StandardValidators.NON_EMPTY_VALIDATOR],
     required=False,
     dependencies=[PropertyDependency(EMBEDDING_FUNCTION, 
SENTENCE_TRANSFORMERS)]
 )
-
+EMBEDDING_MODEL = PropertyDescriptor(
+    name="Embedding Model",
+    description="Specifies which embedding model should be used in order to 
create embeddings from incoming Documents. Default model is OpenAI.",
+    allowable_values=[HUGGING_FACE, OPENAI],
+    default_value=OPENAI,
+    required=True
+)
 PROPERTIES = [
     EMBEDDING_FUNCTION,
     HUGGING_FACE_MODEL_NAME,
@@ -117,7 +126,8 @@ PROPERTIES = [
     OPENAI_API_TYPE,
     OPENAI_API_VERSION,
     SENTENCE_TRANSFORMER_MODEL_NAME,
-    SENTENCE_TRANSFORMER_DEVICE
+    SENTENCE_TRANSFORMER_DEVICE,
+    EMBEDDING_MODEL
 ]
 
 
@@ -145,3 +155,14 @@ def create_embedding_function(context):
     model_name = 
context.getProperty(SENTENCE_TRANSFORMER_MODEL_NAME).getValue()
     device = context.getProperty(SENTENCE_TRANSFORMER_DEVICE).getValue()
     return SentenceTransformerEmbeddingFunction(model_name=model_name, 
device=device)
+
+
+def create_embedding_service(context):
+    embedding_service = context.getProperty(EMBEDDING_MODEL).getValue()
+
+    if embedding_service == OPENAI:
+        openai_api_key = context.getProperty(OPENAI_API_KEY).getValue()
+        return OpenAIEmbeddings(openai_api_key=openai_api_key)
+    else:
+        huggingface_api_key = 
context.getProperty(HUGGING_FACE_API_KEY).getValue()
+        return HuggingFaceInferenceAPIEmbeddings(api_key=huggingface_api_key)
diff --git 
a/nifi-python-extensions/nifi-text-embeddings-module/src/main/python/vectorstores/PutPinecone.py
 
b/nifi-python-extensions/nifi-text-embeddings-module/src/main/python/vectorstores/PutPinecone.py
index 42f51e0102..409f2b9279 100644
--- 
a/nifi-python-extensions/nifi-text-embeddings-module/src/main/python/vectorstores/PutPinecone.py
+++ 
b/nifi-python-extensions/nifi-text-embeddings-module/src/main/python/vectorstores/PutPinecone.py
@@ -14,11 +14,11 @@
 # limitations under the License.
 
 from langchain.vectorstores import Pinecone
-from langchain.embeddings.openai import OpenAIEmbeddings
 from nifiapi.flowfiletransform import FlowFileTransform, 
FlowFileTransformResult
-from nifiapi.properties import PropertyDescriptor, StandardValidators, 
ExpressionLanguageScope
+from nifiapi.properties import PropertyDescriptor, StandardValidators, 
ExpressionLanguageScope, PropertyDependency
 import pinecone
 import json
+from EmbeddingUtils import OPENAI, HUGGING_FACE, EMBEDDING_MODEL, 
create_embedding_service
 
 
 class PutPinecone(FlowFileTransform):
@@ -31,7 +31,6 @@ class PutPinecone(FlowFileTransform):
                        The text must be a string, while metadata must be a map 
with strings for values. Any additional fields will be ignored."""
         tags = ["pinecone", "vector", "vectordb", "vectorstore", "embeddings", 
"ai", "artificial intelligence", "ml", "machine learning", "text", "LLM"]
 
-
     PINECONE_API_KEY = PropertyDescriptor(
         name="Pinecone API Key",
         description="The API Key to use in order to authentication with 
Pinecone",
@@ -39,12 +38,37 @@ class PutPinecone(FlowFileTransform):
         required=True,
         validators=[StandardValidators.NON_EMPTY_VALIDATOR]
     )
+    HUGGING_FACE_API_KEY = PropertyDescriptor(
+        name="HuggingFace API Key",
+        description="The API Key for interacting with HuggingFace",
+        validators=[StandardValidators.NON_EMPTY_VALIDATOR],
+        required=True,
+        sensitive=True,
+        dependencies=[PropertyDependency(EMBEDDING_MODEL, HUGGING_FACE)]
+    )
+    HUGGING_FACE_MODEL = PropertyDescriptor(
+        name="HuggingFace Model",
+        description="The name of the HuggingFace model to use",
+        validators=[StandardValidators.NON_EMPTY_VALIDATOR],
+        required=True,
+        default_value="sentence-transformers/all-MiniLM-L6-v2",
+        dependencies=[PropertyDependency(EMBEDDING_MODEL, HUGGING_FACE)]
+    )
     OPENAI_API_KEY = PropertyDescriptor(
         name="OpenAI API Key",
         description="The API Key for OpenAI in order to create embeddings",
         sensitive=True,
         required=True,
-        validators=[StandardValidators.NON_EMPTY_VALIDATOR]
+        validators=[StandardValidators.NON_EMPTY_VALIDATOR],
+        dependencies=[PropertyDependency(EMBEDDING_MODEL, OPENAI)]
+    )
+    OPENAI_API_MODEL = PropertyDescriptor(
+        name="OpenAI Model",
+        description="The name of the OpenAI model to use in order to create embeddings",
+        required=True,
+        validators=[StandardValidators.NON_EMPTY_VALIDATOR],
+        default_value="text-embedding-ada-002",
+        dependencies=[PropertyDependency(EMBEDDING_MODEL, OPENAI)]
     )
     PINECONE_ENV = PropertyDescriptor(
         name="Pinecone Environment",
@@ -78,15 +102,19 @@ class PutPinecone(FlowFileTransform):
     )
     DOC_ID_FIELD_NAME = PropertyDescriptor(
         name="Document ID Field Name",
-        description="Specifies the name of the field in the 'metadata' element 
of each document where the document's ID can be found. " +
-                    "If not specified, an ID will be generated based on the 
FlowFile's filename and a one-up number.",
+        description="""Specifies the name of the field in the 'metadata' 
element of each document where the document's ID can be found.  
+                    If not specified, an ID will be generated based on the 
FlowFile's filename and a one-up number.""",
         required=False,
         validators=[StandardValidators.NON_EMPTY_VALIDATOR],
         expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES
     )
 
     properties = [PINECONE_API_KEY,
+                  EMBEDDING_MODEL,
                   OPENAI_API_KEY,
+                  OPENAI_API_MODEL,
+                  HUGGING_FACE_API_KEY,
+                  HUGGING_FACE_MODEL,
                   PINECONE_ENV,
                   INDEX_NAME,
                   TEXT_KEY,
@@ -110,9 +138,8 @@ class PutPinecone(FlowFileTransform):
             api_key=api_key,
             environment=pinecone_env,
         )
-        openai_api_key = context.getProperty(self.OPENAI_API_KEY).getValue()
-        self.embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
 
+        self.embeddings = create_embedding_service(context)
 
     def transform(self, context, flowfile):
         # First, check if our index already exists. If it doesn't, we create it
@@ -158,4 +185,4 @@ class PutPinecone(FlowFileTransform):
         text_key = 
context.getProperty(self.TEXT_KEY).evaluateAttributeExpressions().getValue()
         vectorstore = Pinecone(index, self.embeddings.embed_query, text_key)
         vectorstore.add_texts(texts=texts, metadatas=metadatas, ids=ids, 
namespace=namespace)
-        return FlowFileTransformResult(relationship = "success")
+        return FlowFileTransformResult(relationship="success")
diff --git 
a/nifi-python-extensions/nifi-text-embeddings-module/src/main/python/vectorstores/QueryPinecone.py
 
b/nifi-python-extensions/nifi-text-embeddings-module/src/main/python/vectorstores/QueryPinecone.py
index c0521d1bc9..e12b7a6e77 100644
--- 
a/nifi-python-extensions/nifi-text-embeddings-module/src/main/python/vectorstores/QueryPinecone.py
+++ 
b/nifi-python-extensions/nifi-text-embeddings-module/src/main/python/vectorstores/QueryPinecone.py
@@ -14,11 +14,11 @@
 # limitations under the License.
 
 from langchain.vectorstores import Pinecone
-from langchain.embeddings.openai import OpenAIEmbeddings
 from nifiapi.flowfiletransform import FlowFileTransform, 
FlowFileTransformResult
-from nifiapi.properties import PropertyDescriptor, StandardValidators, 
ExpressionLanguageScope
+from nifiapi.properties import PropertyDescriptor, StandardValidators, 
ExpressionLanguageScope, PropertyDependency
 import QueryUtils
 import pinecone
+from EmbeddingUtils import OPENAI, HUGGING_FACE, EMBEDDING_MODEL, 
create_embedding_service
 
 
 class QueryPinecone(FlowFileTransform):
@@ -30,7 +30,6 @@ class QueryPinecone(FlowFileTransform):
         description = "Queries Pinecone in order to gather a specified number 
of documents that are most closely related to the given query."
         tags = ["pinecone", "vector", "vectordb", "vectorstore", "embeddings", 
"ai", "artificial intelligence", "ml", "machine learning", "text", "LLM"]
 
-
     PINECONE_API_KEY = PropertyDescriptor(
         name="Pinecone API Key",
         description="The API Key to use in order to authentication with 
Pinecone",
@@ -43,7 +42,32 @@ class QueryPinecone(FlowFileTransform):
         description="The API Key for OpenAI in order to create embeddings",
         sensitive=True,
         required=True,
-        validators=[StandardValidators.NON_EMPTY_VALIDATOR]
+        validators=[StandardValidators.NON_EMPTY_VALIDATOR],
+        dependencies=[PropertyDependency(EMBEDDING_MODEL, OPENAI)]
+    )
+    HUGGING_FACE_API_KEY = PropertyDescriptor(
+        name="HuggingFace API Key",
+        description="The API Key for interacting with HuggingFace",
+        validators=[StandardValidators.NON_EMPTY_VALIDATOR],
+        required=True,
+        sensitive=True,
+        dependencies=[PropertyDependency(EMBEDDING_MODEL, HUGGING_FACE)]
+    )
+    OPENAI_MODEL = PropertyDescriptor(
+        name="OpenAI Model",
+        description="The name of the OpenAI model to use in order to create embeddings",
+        required=True,
+        validators=[StandardValidators.NON_EMPTY_VALIDATOR],
+        default_value="text-embedding-ada-002",
+        dependencies=[PropertyDependency(EMBEDDING_MODEL, OPENAI)]
+    )
+    HUGGING_FACE_MODEL = PropertyDescriptor(
+        name="HuggingFace Model",
+        description="The name of the HuggingFace model to use",
+        validators=[StandardValidators.NON_EMPTY_VALIDATOR],
+        required=True,
+        default_value="sentence-transformers/all-MiniLM-L6-v2",
+        dependencies=[PropertyDependency(EMBEDDING_MODEL, HUGGING_FACE)]
     )
     PINECONE_ENV = PropertyDescriptor(
         name="Pinecone Environment",
@@ -91,9 +115,12 @@ class QueryPinecone(FlowFileTransform):
         expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES
     )
 
-
     properties = [PINECONE_API_KEY,
+                  EMBEDDING_MODEL,
                   OPENAI_API_KEY,
+                  OPENAI_MODEL,
+                  HUGGING_FACE_API_KEY,
+                  HUGGING_FACE_MODEL,
                   PINECONE_ENV,
                   INDEX_NAME,
                   QUERY,
@@ -123,8 +150,7 @@ class QueryPinecone(FlowFileTransform):
             api_key=api_key,
             environment=pinecone_env,
         )
-        openai_api_key = context.getProperty(self.OPENAI_API_KEY).getValue()
-        self.embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
+        self.embeddings =  create_embedding_service(context)
         self.query_utils = QueryUtils.QueryUtils(context)
 
 

Reply via email to