This is an automated email from the ASF dual-hosted git repository.

fgerlits pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/nifi-minifi-cpp.git

commit 8602b76d27d73fe46f6d177468d7dff400c0a182
Author: Gabor Gyimesi <[email protected]>
AuthorDate: Tue Nov 26 08:13:45 2024 +0100

    MINIFICPP-2556 Create llama.cpp processor for LLM inference
    
    Co-authored-by: Adam Debreceni <[email protected]>
    
    Signed-off-by: Ferenc Gerlits <[email protected]>
    Closes #1903
---
 .github/workflows/ci.yml                           |   7 +-
 LICENSE                                            |  25 ++
 METRICS.md                                         |  16 +
 NOTICE                                             |   1 +
 PROCESSORS.md                                      |  43 +++
 README.md                                          |   3 +-
 bootstrap.sh                                       |   2 +
 bootstrap/minifi_option.py                         |   2 +-
 bstrp_functions.sh                                 |   2 +
 cmake/CppVersion.cmake                             |   4 +-
 cmake/LlamaCpp.cmake                               |  49 +++
 cmake/MiNiFiOptions.cmake                          |   1 +
 docker/test/integration/cluster/ContainerStore.py  |   3 +
 .../test/integration/cluster/DockerTestCluster.py  |   3 +
 docker/test/integration/cluster/ImageStore.py      |  10 +
 .../cluster/containers/MinifiContainer.py          |   3 +
 .../features/MiNiFi_integration_test_driver.py     |   3 +
 docker/test/integration/features/llamacpp.feature  |  32 ++
 docker/test/integration/features/steps/steps.py    |   5 +
 .../minifi/processors/RunLlamaCppInference.py      |  26 ++
 .../include/utils/ProcessorConfigUtils.h           |  10 +
 extensions/llamacpp/CMakeLists.txt                 |  38 +++
 .../llamacpp/processors/DefaultLlamaContext.cpp    | 157 ++++++++++
 .../llamacpp/processors/DefaultLlamaContext.h      |  44 +++
 .../processors/LlamaBackendInitializer.cpp         |  30 ++
 .../llamacpp/processors/LlamaBackendInitializer.h  |  38 +++
 extensions/llamacpp/processors/LlamaContext.h      |  66 ++++
 .../llamacpp/processors/RunLlamaCppInference.cpp   | 155 +++++++++
 .../llamacpp/processors/RunLlamaCppInference.h     | 198 ++++++++++++
 extensions/llamacpp/tests/CMakeLists.txt           |  37 +++
 .../llamacpp/tests/RunLlamaCppInferenceTests.cpp   | 345 +++++++++++++++++++++
 thirdparty/llamacpp/lu8_macro_fix.patch            |  17 +
 utils/include/core/ProcessorMetrics.h              |   1 +
 utils/include/utils/ParsingUtils.h                 |   3 +-
 utils/src/utils/ParsingUtils.cpp                   |   8 +
 35 files changed, 1380 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 2d1ce025d..b8eb83667 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -3,7 +3,7 @@ on: [push, pull_request, workflow_dispatch]
 env:
  DOCKER_CMAKE_FLAGS: -DDOCKER_VERIFY_THREAD=3 -DUSE_SHARED_LIBS= -DSTRICT_GSL_CHECKS=AUDIT -DCI_BUILD=ON -DENABLE_AWS=ON -DENABLE_KAFKA=ON -DENABLE_MQTT=ON -DENABLE_AZURE=ON -DENABLE_SQL=ON \
    -DENABLE_SPLUNK=ON -DENABLE_GCP=ON -DENABLE_OPC=ON -DENABLE_PYTHON_SCRIPTING=ON -DENABLE_LUA_SCRIPTING=ON -DENABLE_KUBERNETES=ON -DENABLE_TEST_PROCESSORS=ON -DENABLE_PROMETHEUS=ON \
-    -DENABLE_ELASTICSEARCH=ON -DENABLE_GRAFANA_LOKI=ON -DENABLE_COUCHBASE=ON -DDOCKER_BUILD_ONLY=ON -DMINIFI_PERFORMANCE_TESTS=ON
+    -DENABLE_ELASTICSEARCH=ON -DENABLE_GRAFANA_LOKI=ON -DENABLE_COUCHBASE=ON -DENABLE_LLAMACPP=ON -DDOCKER_BUILD_ONLY=ON -DMINIFI_PERFORMANCE_TESTS=ON
   SCCACHE_GHA_ENABLE: true
   CCACHE_DIR: ${{ GITHUB.WORKSPACE }}/.ccache
 jobs:
@@ -33,6 +33,7 @@ jobs:
         -DENABLE_GCP=ON
         -DENABLE_KUBERNETES=ON
         -DENABLE_LIBARCHIVE=ON
+        -DENABLE_LLAMACPP=ON
         -DENABLE_KAFKA=ON
         -DENABLE_LUA_SCRIPTING=ON
         -DENABLE_LZMA=ON
@@ -140,6 +141,7 @@ jobs:
         -DENABLE_KUBERNETES=ON
         -DENABLE_LIBARCHIVE=ON
         -DENABLE_KAFKA=ON
+        -DENABLE_LLAMACPP=ON
         -DENABLE_LUA_SCRIPTING=ON
         -DENABLE_LZMA=ON
         -DENABLE_MQTT=ON
@@ -243,6 +245,7 @@ jobs:
         -DENABLE_GRAFANA_LOKI=ON
         -DENABLE_KUBERNETES=ON
         -DENABLE_LIBARCHIVE=ON
+        -DENABLE_LLAMACPP=ON
         -DENABLE_KAFKA=ON
         -DENABLE_LUA_SCRIPTING=ON
         -DENABLE_LZMA=ON
@@ -394,7 +397,7 @@ jobs:
          mkdir build && cd build && cmake -DUSE_SHARED_LIBS=ON -DCI_BUILD=ON -DCMAKE_BUILD_TYPE=Release -DSTRICT_GSL_CHECKS=AUDIT -DMINIFI_FAIL_ON_WARNINGS=OFF -DENABLE_AWS=ON -DENABLE_AZURE=ON \
              -DENABLE_ENCRYPT_CONFIG=ON -DENABLE_KAFKA=ON -DENABLE_MQTT=ON -DENABLE_OPC=ON -DENABLE_OPENCV=ON -DENABLE_OPS=ON -DENABLE_SQL=ON -DENABLE_SYSTEMD=ON \
              -DENABLE_PYTHON_SCRIPTING=ON -DENABLE_LUA_SCRIPTING=ON -DENABLE_KUBERNETES=ON -DENABLE_GCP=ON -DENABLE_PROCFS=ON -DENABLE_PROMETHEUS=ON \
-              -DENABLE_ELASTICSEARCH=ON -DENABLE_GRAFANA_LOKI=ON -DDOCKER_SKIP_TESTS=OFF -DDOCKER_BUILD_ONLY=ON -DDOCKER_CCACHE_DUMP_LOCATION=${{ env.CCACHE_DIR }} .. && make rocky-test
+              -DENABLE_ELASTICSEARCH=ON -DENABLE_GRAFANA_LOKI=ON -DENABLE_LLAMACPP=ON -DDOCKER_SKIP_TESTS=OFF -DDOCKER_BUILD_ONLY=ON -DDOCKER_CCACHE_DUMP_LOCATION=${{ env.CCACHE_DIR }} .. && make rocky-test
       - name: cache save
         uses: actions/cache/save@v4
         if: always()
diff --git a/LICENSE b/LICENSE
index fa4c6a70d..2c144ea31 100644
--- a/LICENSE
+++ b/LICENSE
@@ -3460,3 +3460,28 @@ NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
 DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+
+This product bundles 'llama.cpp' which is available under The MIT License.
+
+MIT License
+
+Copyright (c) 2023-2024 The ggml authors
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/METRICS.md b/METRICS.md
index e28f77a40..e387a74bb 100644
--- a/METRICS.md
+++ b/METRICS.md
@@ -32,6 +32,7 @@ This readme defines the metrics published by Apache NiFi. All options defined ar
   - [Processor Metrics](#processor-metrics)
     - [General Metrics](#general-metrics)
     - [GetFileMetrics](#getfilemetrics)
+    - [RunLlamaCppInferenceMetrics](#runllamacppinferencemetrics)
 
 ## Description
 
@@ -288,3 +289,18 @@ Processor level metric that reports metrics for the GetFile processor if defined
 | metric_class   | Class name to filter for this metric, set to GetFileMetrics |
 | processor_name | Name of the processor                                        |
 | processor_uuid | UUID of the processor                                        |
+
+### RunLlamaCppInferenceMetrics
+
+Processor level metric that reports metrics for the RunLlamaCppInference processor if defined in the flow configuration.
+
+| Metric name | Labels                                        | Description                                                                 |
+|-------------|-----------------------------------------------|-----------------------------------------------------------------------------|
+| tokens_in   | metric_class, processor_name, processor_uuid  | Number of tokens parsed from the input prompts in the processor's lifetime  |
+| tokens_out  | metric_class, processor_name, processor_uuid  | Number of tokens generated in the completion in the processor's lifetime    |
+
+| Label          | Description                                                                |
+|----------------|----------------------------------------------------------------------------|
+| metric_class   | Class name to filter for this metric, set to RunLlamaCppInferenceMetrics  |
+| processor_name | Name of the processor                                                      |
+| processor_uuid | UUID of the processor                                                      |
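For illustration only (this sketch is not part of the diff), the lifetime tokens_in and tokens_out metrics described above can be pictured as two counters that each completed generation adds to; the struct and member names below are hypothetical and are not MiNiFi's ProcessorMetrics API:

#include <atomic>
#include <cstdint>

// Hypothetical sketch of lifetime token counters; not the actual ProcessorMetrics code.
struct LlamaInferenceCounters {
  std::atomic<uint64_t> tokens_in{0};   // tokens parsed from the input prompts
  std::atomic<uint64_t> tokens_out{0};  // tokens generated in completions

  // Called once per completed generation with that run's token counts.
  void record(uint64_t num_tokens_in, uint64_t num_tokens_out) {
    tokens_in += num_tokens_in;
    tokens_out += num_tokens_out;
  }
};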
diff --git a/NOTICE b/NOTICE
index cf9247053..e32c36117 100644
--- a/NOTICE
+++ b/NOTICE
@@ -77,6 +77,7 @@ This software includes third party software subject to the following copyrights:
 - snappy - Copyright 2011, Google Inc.
 - llhttp - Copyright Fedor Indutny, 2018.
 - benchmark - Copyright 2015 Google Inc.
+- llama.cpp - Copyright (c) 2023-2024 The ggml authors
 
 The licenses for these third party components are included in LICENSE.txt
 
diff --git a/PROCESSORS.md b/PROCESSORS.md
index b4a4f104e..862a1182e 100644
--- a/PROCESSORS.md
+++ b/PROCESSORS.md
@@ -65,6 +65,7 @@ limitations under the License.
 - [ListS3](#ListS3)
 - [ListSFTP](#ListSFTP)
 - [ListSmb](#ListSmb)
+- [RunLlamaCppInference](#RunLlamaCppInference)
 - [LogAttribute](#LogAttribute)
 - [ManipulateArchive](#ManipulateArchive)
 - [MergeContent](#MergeContent)
@@ -1745,6 +1746,48 @@ In the list below, the names of required properties appear in bold. Any other pr
 | size             | success      | The size of the file in bytes. |
 
+## RunLlamaCppInference
+
+### Description
+
+LlamaCpp processor to use llama.cpp library for running language model inference. The inference will be based on the System Prompt and the Prompt property values, together with the content of the incoming flow file. In the Prompt, the content of the incoming flow file can be referred to as 'the input data' or 'the flow file content'.
+
+### Properties
+
+In the list below, the names of required properties appear in bold. Any other properties (not in bold) are considered optional. The table also indicates any default values, and whether a property supports the NiFi Expression Language.
+| Name                             | Default Value | Allowable Values | Description                                                                                                              |
+|----------------------------------|---------------|------------------|--------------------------------------------------------------------------------------------------------------------------|
+| **Model Path**                   |               |                  | The filesystem path of the model file in gguf format.                                                                     |
+| Temperature                      | 0.8           |                  | The temperature to use for sampling.                                                                                      |
+| Top K                            | 40            |                  | Limit the next token selection to the K most probable tokens. Set <= 0 value to use vocab size.                           |
+| Top P                            | 0.9           |                  | Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P. 1.0 = disabled.   |
+| Min P                            |               |                  | Sets a minimum base probability threshold for token selection. 0.0 = disabled.                                            |
+| **Min Keep**                     | 0             |                  | If greater than 0, force samplers to return N possible tokens at minimum.                                                 |
+| **Text Context Size**            | 4096          |                  | Size of the text context, use 0 to use size set in model.                                                                 |
+| **Logical Maximum Batch Size**   | 2048          |                  | Logical maximum batch size that can be submitted to the llama.cpp decode function.                                        |
+| **Physical Maximum Batch Size**  | 512           |                  | Physical maximum batch size.                                                                                              |
+| **Max Number Of Sequences**      | 1             |                  | Maximum number of sequences (i.e. distinct states for recurrent models).                                                  |
+| **Threads For Generation**       | 4             |                  | Number of threads to use for generation.                                                                                  |
+| **Threads For Batch Processing** | 4             |                  | Number of threads to use for batch processing.                                                                            |
+| Prompt                           |               |                  | The user prompt for the inference.<br/>**Supports Expression Language: true**                                             |
+| System Prompt                    | You are a helpful assistant. You are given a question with some possible input data otherwise called flow file content. You are expected to generate a response based on the question and the input data. |                  | The system prompt for the inference. |
+
+### Relationships
+
+| Name    | Description                      |
+|---------|----------------------------------|
+| success | Generated results from the model |
+| failure | Generation failed                |
+
+### Output Attributes
+
+| Attribute                    | Relationship | Description                                     |
+|------------------------------|--------------|-------------------------------------------------|
+| llamacpp.time.to.first.token | success      | Time to first token generated in milliseconds.  |
+| llamacpp.tokens.per.second   | success      | Tokens generated per second.                    |
+
+
 ## LogAttribute
 
 ### Description
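As an illustrative aside (not part of the patch), the documented behaviour above — combining the System Prompt, the Prompt and the incoming flow file content into one inference request — can be sketched against the LlamaChatMessage type introduced later in this commit; the exact way the processor splices the flow file content into the user message is an assumption here:

#include <string>
#include <vector>

// Mirrors the struct from extensions/llamacpp/processors/LlamaContext.h in this commit.
struct LlamaChatMessage {
  std::string role;
  std::string content;
};

// Hypothetical helper: build the chat messages fed to LlamaContext::applyTemplate().
std::vector<LlamaChatMessage> buildMessages(const std::string& system_prompt,
                                            const std::string& user_prompt,
                                            const std::string& flow_file_content) {
  std::vector<LlamaChatMessage> messages;
  messages.push_back({"system", system_prompt});
  // Assumption: the flow file content is appended to the user prompt, so the prompt can
  // refer to it as 'the input data' or 'the flow file content' as the description states.
  messages.push_back({"user", user_prompt + "\n\nInput data:\n" + flow_file_content});
  return messages;
}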
diff --git a/README.md b/README.md
index 380eeeaff..790238afa 100644
--- a/README.md
+++ b/README.md
@@ -81,8 +81,9 @@ The next table outlines CMAKE flags that correspond with MiNiFi extensions. Exte
 | ExecuteProcess (Linux and macOS) | [ExecuteProcess](PROCESSORS.md#executeprocess) [...]
 | Google Cloud Platform            | [DeleteGCSObject](PROCESSORS.md#deletegcsobject)<br>[FetchGCSObject](PROCESSORS.md#fetchgcsobject)<br>[GCPCredentialsControllerService](CONTROLLERS.md#gcpcredentialscontrollerservice)<br>[ListGCSBucket](PROCESSORS.md#listgcsbucket)<br>[PutGCSObject](PROCESSORS.md#putgcsobject) [...]
 | Grafana Loki                     | [PushGrafanaLokiREST](PROCESSORS.md#pushgrafanalokirest)<br>[PushGrafanaLokiGrpc](PROCESSORS.md#pushgrafanalokigrpc) [...]
-| Kafka                            | [PublishKafka](PROCESSORS.md#publishkafka)<br>[ConsumeKafka](PROCESSORS.md#consumekafka) [...]
+| Kafka                            | [PublishKafka](PROCESSORS.md#publishkafka)<br>[ConsumeKafka](PROCESSORS.md#consumekafka) [...]
 | Kubernetes (Linux)               | [KubernetesControllerService](CONTROLLERS.md#kubernetescontrollerservice) [...]
+| LlamaCpp                         | [RunLlamaCppInference](PROCESSORS.md#runllamacppinference) [...]
 | Lua Scripting                    | [ExecuteScript](PROCESSORS.md#executescript) [...]
 | MQTT                             | [ConsumeMQTT](PROCESSORS.md#consumemqtt)<br/>[PublishMQTT](PROCESSORS.md#publishmqtt) [...]
 | OPC                              | [FetchOPCProcessor](PROCESSORS.md#fetchopcprocessor)<br/>[PutOPCProcessor](PROCESSORS.md#putopcprocessor) [...]
diff --git a/bootstrap.sh b/bootstrap.sh
index c723327fd..e0113df11 100755
--- a/bootstrap.sh
+++ b/bootstrap.sh
@@ -318,6 +318,8 @@ add_option PROMETHEUS_ENABLED ${TRUE} "ENABLE_PROMETHEUS"
 
 add_option COUCHBASE_ENABLED ${FALSE} "ENABLE_COUCHBASE"
 
+add_option LLAMACPP_ENABLED ${FALSE} "ENABLE_LLAMACPP"
+
 USE_SHARED_LIBS=${TRUE}
 ASAN_ENABLED=${FALSE}
 MINIFI_FAIL_ON_WARNINGS=${FALSE}
diff --git a/bootstrap/minifi_option.py b/bootstrap/minifi_option.py
index 5f96b97fc..853acd86f 100644
--- a/bootstrap/minifi_option.py
+++ b/bootstrap/minifi_option.py
@@ -31,7 +31,7 @@ class MinifiOptions:
         self.build_type = CMakeCacheValue("Specifies the build type on single-configuration generators",
                                           "CMAKE_BUILD_TYPE", "STRING", "Release")
         self.build_type.possible_values = ["Release", "Debug", "RelWithDebInfo", "MinSizeRel"]
-        additional_build_options = ["DOCKER_BUILD_ONLY", "DOCKER_SKIP_TESTS", "SKIP_TESTS"]
+        additional_build_options = ["DOCKER_BUILD_ONLY", "DOCKER_SKIP_TESTS", "SKIP_TESTS", "PORTABLE"]
         self.use_ninja = CMakeCacheValue("Specifies if CMake should use the Ninja generator or the system default", "USE_NINJA", "BOOL", "ON")
         self.bool_options = {name: cache_value for name, cache_value in cache_values.items() if
                              cache_value.value_type == "BOOL" and ("ENABLE" in name or "MINIFI" in name or name in additional_build_options)}
diff --git a/bstrp_functions.sh b/bstrp_functions.sh
index 7311eb166..f78220a9d 100755
--- a/bstrp_functions.sh
+++ b/bstrp_functions.sh
@@ -402,6 +402,7 @@ show_supported_features() {
   echo "AF. Elasticsearch Support ......$(print_feature_status 
ELASTIC_ENABLED)"
   echo "AG. Grafana Loki Support .......$(print_feature_status 
GRAFANA_LOKI_ENABLED)"
   echo "AH. Couchbase Support ..........$(print_feature_status 
COUCHBASE_ENABLED)"
+  echo "AI. llama.cpp Support ..........$(print_feature_status 
LLAMACPP_ENABLED)"
   echo "****************************************"
   echo "            Build Options."
   echo "****************************************"
@@ -450,6 +451,7 @@ read_feature_options(){
     af) ToggleFeature ELASTIC_ENABLED ;;
     ag) ToggleFeature GRAFANA_LOKI_ENABLED ;;
     ah) ToggleFeature COUCHBASE_ENABLED ;;
+    ai) ToggleFeature LLAMACPP_ENABLED ;;
     1) ToggleFeature TESTS_ENABLED ;;
     2) EnableAllFeatures ;;
     4) ToggleFeature USE_SHARED_LIBS;;
diff --git a/cmake/CppVersion.cmake b/cmake/CppVersion.cmake
index 22772d26d..c356e6b40 100644
--- a/cmake/CppVersion.cmake
+++ b/cmake/CppVersion.cmake
@@ -18,8 +18,8 @@
 function(set_cpp_version)
     if (MSVC)
         if ((MSVC_VERSION GREATER "1930") OR (MSVC_VERSION EQUAL "1930"))
-            add_compile_options("/std:c++latest")
-            add_compile_options("/permissive-")
+            add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/std:c++latest>)
+            add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/permissive->)
         else()
             message(STATUS "The Visual Studio C++ compiler 
${CMAKE_CXX_COMPILER} is not supported. Please use Visual Studio 2022 or 
newer.")
         endif()
diff --git a/cmake/LlamaCpp.cmake b/cmake/LlamaCpp.cmake
new file mode 100644
index 000000000..af4e94994
--- /dev/null
+++ b/cmake/LlamaCpp.cmake
@@ -0,0 +1,49 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+include(FetchContent)
+
+set(BUILD_SHARED_LIBS "OFF" CACHE STRING "" FORCE)
+set(LLAMA_BUILD_TESTS "OFF" CACHE STRING "" FORCE)
+set(LLAMA_BUILD_EXAMPLES "OFF" CACHE STRING "" FORCE)
+set(LLAMA_BUILD_SERVER "OFF" CACHE STRING "" FORCE)
+set(GGML_OPENMP "OFF" CACHE STRING "" FORCE)
+set(GGML_METAL "OFF" CACHE STRING "" FORCE)
+set(GGML_BLAS "OFF" CACHE STRING "" FORCE)
+if (PORTABLE)
+    set(GGML_NATIVE "OFF" CACHE STRING "" FORCE)
+else()
+    set(GGML_NATIVE "ON" CACHE STRING "" FORCE)
+endif()
+
+set(PATCH_FILE_1 "${CMAKE_SOURCE_DIR}/thirdparty/llamacpp/lu8_macro_fix.patch")  # https://github.com/ggml-org/llama.cpp/issues/12740
+set(PC ${Bash_EXECUTABLE}  -c "set -x &&\
+        (\\\"${Patch_EXECUTABLE}\\\" -p1 -R -s -f --dry-run -i \\\"${PATCH_FILE_1}\\\" || \\\"${Patch_EXECUTABLE}\\\" -p1 -N -i \\\"${PATCH_FILE_1}\\\")")
+
+FetchContent_Declare(llamacpp
+        URL https://github.com/ggerganov/llama.cpp/archive/refs/tags/b5038.tar.gz
+        URL_HASH SHA256=5e81c3badc181ed3b7a6ab6bda2abedc80c52527e3c079c7afff4c09f4843564
+        PATCH_COMMAND "${PC}"
+)
+
+FetchContent_MakeAvailable(llamacpp)
+
+set(LLAMACPP_INCLUDE_DIRS
+    "${llamacpp_SOURCE_DIR}/include"
+    "${llamacpp_SOURCE_DIR}/ggml/include"
+    CACHE STRING "" FORCE
+)
diff --git a/cmake/MiNiFiOptions.cmake b/cmake/MiNiFiOptions.cmake
index 5dd4d0920..8fb152b33 100644
--- a/cmake/MiNiFiOptions.cmake
+++ b/cmake/MiNiFiOptions.cmake
@@ -115,6 +115,7 @@ add_minifi_option(ENABLE_GRPC_FOR_LOKI "Enable gRPC for Grafana Loki extension"
 add_minifi_option(ENABLE_COUCHBASE "Enable Couchbase support" OFF)
 add_minifi_option(ENABLE_EXECUTE_PROCESS "Enable ExecuteProcess processor" OFF)
 add_minifi_option(ENABLE_CONTROLLER "Enables the build of MiNiFi controller binary." ON)
+add_minifi_option(ENABLE_LLAMACPP "Enables llama.cpp support." OFF)
 
 set_minifi_cache_variable(CUSTOM_MALLOC OFF "Overwrite malloc implementation.")
 set_property(CACHE CUSTOM_MALLOC PROPERTY STRINGS "jemalloc" "mimalloc" "rpmalloc" OFF)
diff --git a/docker/test/integration/cluster/ContainerStore.py b/docker/test/integration/cluster/ContainerStore.py
index 4939ad098..79f88fb89 100644
--- a/docker/test/integration/cluster/ContainerStore.py
+++ b/docker/test/integration/cluster/ContainerStore.py
@@ -406,6 +406,9 @@ class ContainerStore:
     def disable_openssl_fips_mode_in_minifi(self):
         self.minifi_options.enable_openssl_fips_mode = False
 
+    def llama_model_is_downloaded_in_minifi(self):
+        self.minifi_options.download_llama_model = True
+
     def get_startup_finished_log_entry(self, container_name):
         container_name = self.get_container_name_with_postfix(container_name)
         return self.containers[container_name].get_startup_finished_log_entry()
diff --git a/docker/test/integration/cluster/DockerTestCluster.py b/docker/test/integration/cluster/DockerTestCluster.py
index 7d7e7cb93..faa42b727 100644
--- a/docker/test/integration/cluster/DockerTestCluster.py
+++ b/docker/test/integration/cluster/DockerTestCluster.py
@@ -147,6 +147,9 @@ class DockerTestCluster:
     def enable_example_minifi_python_processors(self):
         self.container_store.enable_example_minifi_python_processors()
 
+    def llama_model_is_downloaded_in_minifi(self):
+        self.container_store.llama_model_is_downloaded_in_minifi()
+
     def get_app_log(self, container_name):
         container_name = self.container_store.get_container_name_with_postfix(container_name)
         log_source = self.container_store.log_source(container_name)
diff --git a/docker/test/integration/cluster/ImageStore.py b/docker/test/integration/cluster/ImageStore.py
index 6f3218452..100b1c22b 100644
--- a/docker/test/integration/cluster/ImageStore.py
+++ b/docker/test/integration/cluster/ImageStore.py
@@ -57,6 +57,8 @@ class ImageStore:
             image = self.__build_minifi_cpp_image_with_nifi_python_processors_using_dependencies(PythonWithDependenciesOptions.INLINE_DEFINED_PACKAGES)
         elif container_engine == "minifi-cpp-nifi-with-python-without-dependencies":
             image = self.__build_minifi_cpp_image_with_nifi_python_processors()
+        elif container_engine == "minifi-cpp-with-llamacpp-model":
+            image = self.__build_minifi_cpp_image_with_llamacpp_model()
         elif container_engine == "http-proxy":
             image = self.__build_http_proxy_image()
         elif container_engine == "postgresql-server":
@@ -225,6 +227,14 @@ class ImageStore:
             build_full_python_resource_path("TestStateManager.py"),
         ])
 
+    def __build_minifi_cpp_image_with_llamacpp_model(self):
+        dockerfile = dedent("""\
+                FROM {base_image}
+                RUN mkdir /opt/minifi/minifi-current/models && wget https://huggingface.co/bartowski/Qwen2-0.5B-Instruct-GGUF/resolve/main/Qwen2-0.5B-Instruct-IQ3_M.gguf --directory-prefix=/opt/minifi/minifi-current/models
+                """.format(base_image='apacheminificpp:' + MinifiContainer.MINIFI_TAG_PREFIX + MinifiContainer.MINIFI_VERSION))
+
+        return self.__build_image(dockerfile)
+
     def __build_http_proxy_image(self):
         dockerfile = dedent("""\
                 FROM {base_image}
diff --git a/docker/test/integration/cluster/containers/MinifiContainer.py b/docker/test/integration/cluster/containers/MinifiContainer.py
index 381388633..e6b3cf68a 100644
--- a/docker/test/integration/cluster/containers/MinifiContainer.py
+++ b/docker/test/integration/cluster/containers/MinifiContainer.py
@@ -48,6 +48,7 @@ class MinifiOptions:
             self.enable_openssl_fips_mode = True
         else:
             self.enable_openssl_fips_mode = False
+        self.download_llama_model = False
 
 
 class MinifiContainer(FlowContainer):
@@ -196,6 +197,8 @@ class MinifiContainer(FlowContainer):
             image = self.image_store.get_image('minifi-cpp-nifi-with-inline-python-dependencies')
         elif self.options.use_nifi_python_processors_without_dependencies:
             image = self.image_store.get_image('minifi-cpp-nifi-with-python-without-dependencies')
+        elif self.options.download_llama_model:
+            image = self.image_store.get_image('minifi-cpp-with-llamacpp-model')
         else:
             image = 'apacheminificpp:' + MinifiContainer.MINIFI_TAG_PREFIX + MinifiContainer.MINIFI_VERSION
 
diff --git a/docker/test/integration/features/MiNiFi_integration_test_driver.py b/docker/test/integration/features/MiNiFi_integration_test_driver.py
index bb3a4090a..71a280f7e 100644
--- a/docker/test/integration/features/MiNiFi_integration_test_driver.py
+++ b/docker/test/integration/features/MiNiFi_integration_test_driver.py
@@ -404,6 +404,9 @@ class MiNiFi_integration_test:
     def set_controller_socket_properties_in_minifi(self):
         self.cluster.set_controller_socket_properties_in_minifi()
 
+    def llama_model_is_downloaded_in_minifi(self):
+        self.cluster.llama_model_is_downloaded_in_minifi()
+
     def update_flow_config_through_controller(self, container_name: str):
         self.cluster.update_flow_config_through_controller(container_name)
 
diff --git a/docker/test/integration/features/llamacpp.feature b/docker/test/integration/features/llamacpp.feature
new file mode 100644
index 000000000..de8072d39
--- /dev/null
+++ b/docker/test/integration/features/llamacpp.feature
@@ -0,0 +1,32 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+@ENABLE_LLAMACPP
+Feature: Run language model inference using LlamaCpp processor
+
+  Background:
+    Given the content of "/tmp/output" is monitored
+
+  Scenario: Test inference with a small model
+    Given a LlamaCpp model is present on the MiNiFi host
+    And a GenerateFlowFile processor with the "File Size" property set to "0B"
+    And a RunLlamaCppInference processor with the "Model Path" property set to "/opt/minifi/minifi-current/models/Qwen2-0.5B-Instruct-IQ3_M.gguf"
+    And the "Prompt" property of the RunLlamaCppInference processor is set to "Repeat after me: banana banana banana"
+    And a LogAttribute processor with the "Log Payload" property set to "true"
+    And the "success" relationship of the GenerateFlowFile processor is connected to the RunLlamaCppInference
+    And the "success" relationship of the RunLlamaCppInference processor is connected to the LogAttribute
+
+    When all instances start up
+    Then the Minifi logs contain the following message: "banana" in less than 120 seconds
diff --git a/docker/test/integration/features/steps/steps.py b/docker/test/integration/features/steps/steps.py
index 76e91d59e..46e4e81ca 100644
--- a/docker/test/integration/features/steps/steps.py
+++ b/docker/test/integration/features/steps/steps.py
@@ -1430,3 +1430,8 @@ def step_impl(context, service_name):
         connection_string="couchbases://{server_hostname}".format(server_hostname=context.test.get_container_name_with_postfix("couchbase-server")),
         ssl_context_service=ssl_context_service)
     container.add_controller(couchbase_cluster_controller_service)
+
+
+@given("a LlamaCpp model is present on the MiNiFi host")
+def step_impl(context):
+    context.test.llama_model_is_downloaded_in_minifi()
diff --git a/docker/test/integration/minifi/processors/RunLlamaCppInference.py b/docker/test/integration/minifi/processors/RunLlamaCppInference.py
new file mode 100644
index 000000000..fd69d7d43
--- /dev/null
+++ b/docker/test/integration/minifi/processors/RunLlamaCppInference.py
@@ -0,0 +1,26 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from ..core.Processor import Processor
+
+
+class RunLlamaCppInference(Processor):
+    def __init__(self, context, schedule={'scheduling strategy': 'EVENT_DRIVEN'}):
+        super(RunLlamaCppInference, self).__init__(
+            context=context,
+            clazz='RunLlamaCppInference',
+            auto_terminate=['success', 'failure'],
+            schedule=schedule)
diff --git a/extension-utils/include/utils/ProcessorConfigUtils.h b/extension-utils/include/utils/ProcessorConfigUtils.h
index a9ec60770..321f4a9da 100644
--- a/extension-utils/include/utils/ProcessorConfigUtils.h
+++ b/extension-utils/include/utils/ProcessorConfigUtils.h
@@ -107,6 +107,16 @@ inline std::optional<uint64_t> parseOptionalDataSizeProperty(const core::Process
   return std::nullopt;
 }
 
+inline std::optional<float> parseOptionalFloatProperty(const core::ProcessContext& ctx, const core::PropertyReference& property, const core::FlowFile* flow_file = nullptr) {
+  if (const auto property_str = ctx.getProperty(property.name, flow_file)) {
+    if (property_str->empty()) {
+      return std::nullopt;
+    }
+    return parsing::parseFloat(*property_str) | utils::orThrow(fmt::format("Expected parsable float from {}::{}", ctx.getProcessor().getName(), property.name));
+  }
+  return std::nullopt;
+}
+
 template<typename T>
 T parseEnumProperty(const core::ProcessContext& context, const core::PropertyReference& prop, const core::FlowFile* flow_file = nullptr) {
   const auto enum_str = context.getProperty(prop.name, flow_file);
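A standalone analogue (not from the diff) of how the new parseOptionalFloatProperty helper behaves: a missing or empty property value yields std::nullopt, a parsable value is returned, and anything else throws, mirroring utils::orThrow. The function below is a hypothetical stand-in that parses a plain string instead of reading a processor property:

#include <charconv>
#include <optional>
#include <stdexcept>
#include <string>

std::optional<float> parse_optional_float(const std::optional<std::string>& property_str) {
  if (!property_str || property_str->empty()) {
    return std::nullopt;  // property not set, or set to an empty string
  }
  float value{};
  const char* begin = property_str->data();
  const char* end = begin + property_str->size();
  auto [ptr, ec] = std::from_chars(begin, end, value);
  if (ec != std::errc{} || ptr != end) {
    throw std::invalid_argument("Expected parsable float");  // the real helper throws via utils::orThrow
  }
  return value;
}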
diff --git a/extensions/llamacpp/CMakeLists.txt b/extensions/llamacpp/CMakeLists.txt
new file mode 100644
index 000000000..5a25cbc05
--- /dev/null
+++ b/extensions/llamacpp/CMakeLists.txt
@@ -0,0 +1,38 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+if (NOT (ENABLE_ALL OR ENABLE_LLAMACPP))
+    return()
+endif()
+
+include(LlamaCpp)
+
+include(${CMAKE_SOURCE_DIR}/extensions/ExtensionHeader.txt)
+
+file(GLOB SOURCES "processors/*.cpp")
+
+add_minifi_library(minifi-llamacpp SHARED ${SOURCES})
+target_include_directories(minifi-llamacpp PUBLIC "${CMAKE_SOURCE_DIR}/extensions/llamacpp")
+target_include_directories(minifi-llamacpp PUBLIC "${LLAMACPP_INCLUDE_DIRS}")
+
+target_link_libraries(minifi-llamacpp ${LIBMINIFI} llama)
+
+register_extension(minifi-llamacpp "LLAMACPP EXTENSION" LLAMACPP-EXTENSION "Provides llama.cpp support" "extensions/llamacpp/tests")
+
+register_extension_linter(minifi-llamacpp-linter)
diff --git a/extensions/llamacpp/processors/DefaultLlamaContext.cpp b/extensions/llamacpp/processors/DefaultLlamaContext.cpp
new file mode 100644
index 000000000..119f5a63b
--- /dev/null
+++ b/extensions/llamacpp/processors/DefaultLlamaContext.cpp
@@ -0,0 +1,157 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "DefaultLlamaContext.h"
+#include "Exception.h"
+#include "fmt/format.h"
+#include "utils/ConfigurationUtils.h"
+
+namespace org::apache::nifi::minifi::extensions::llamacpp::processors {
+
+namespace {
+std::vector<llama_token> tokenizeInput(const llama_vocab* vocab, const std::string& input) {
+  int32_t number_of_tokens = gsl::narrow<int32_t>(input.length()) + 2;
+  std::vector<llama_token> tokenized_input(number_of_tokens);
+  number_of_tokens = llama_tokenize(vocab, input.data(), gsl::narrow<int32_t>(input.length()), tokenized_input.data(), gsl::narrow<int32_t>(tokenized_input.size()), true, true);
+  if (number_of_tokens < 0) {
+    tokenized_input.resize(-number_of_tokens);
+    [[maybe_unused]] int32_t check = llama_tokenize(vocab, input.data(), gsl::narrow<int32_t>(input.length()), tokenized_input.data(), gsl::narrow<int32_t>(tokenized_input.size()), true, true);
+    gsl_Assert(check == -number_of_tokens);
+  } else {
+    tokenized_input.resize(number_of_tokens);
+  }
+  return tokenized_input;
+}
+}  // namespace
+
+
+DefaultLlamaContext::DefaultLlamaContext(const std::filesystem::path& model_path, const LlamaSamplerParams& llama_sampler_params, const LlamaContextParams& llama_ctx_params) {
+  llama_model_ = llama_model_load_from_file(model_path.string().c_str(), llama_model_default_params());  // NOLINT(cppcoreguidelines-prefer-member-initializer)
+  if (!llama_model_) {
+    throw Exception(ExceptionType::PROCESS_SCHEDULE_EXCEPTION, fmt::format("Failed to load model from '{}'", model_path.string()));
+  }
+
+  llama_context_params ctx_params = llama_context_default_params();
+  ctx_params.n_ctx = llama_ctx_params.n_ctx;
+  ctx_params.n_batch = llama_ctx_params.n_batch;
+  ctx_params.n_ubatch = llama_ctx_params.n_ubatch;
+  ctx_params.n_seq_max = llama_ctx_params.n_seq_max;
+  ctx_params.n_threads = llama_ctx_params.n_threads;
+  ctx_params.n_threads_batch = llama_ctx_params.n_threads_batch;
+  ctx_params.flash_attn = false;
+  llama_ctx_ = llama_init_from_model(llama_model_, ctx_params);
+
+  auto sparams = llama_sampler_chain_default_params();
+  llama_sampler_ = llama_sampler_chain_init(sparams);
+
+  if (llama_sampler_params.min_p) {
+    llama_sampler_chain_add(llama_sampler_, llama_sampler_init_min_p(*llama_sampler_params.min_p, llama_sampler_params.min_keep));
+  }
+  if (llama_sampler_params.top_k) {
+    llama_sampler_chain_add(llama_sampler_, llama_sampler_init_top_k(*llama_sampler_params.top_k));
+  }
+  if (llama_sampler_params.top_p) {
+    llama_sampler_chain_add(llama_sampler_, llama_sampler_init_top_p(*llama_sampler_params.top_p, llama_sampler_params.min_keep));
+  }
+  if (llama_sampler_params.temperature) {
+    llama_sampler_chain_add(llama_sampler_, llama_sampler_init_temp(*llama_sampler_params.temperature));
+  }
+  llama_sampler_chain_add(llama_sampler_, llama_sampler_init_dist(LLAMA_DEFAULT_SEED));
+}
+
+DefaultLlamaContext::~DefaultLlamaContext() {
+  llama_sampler_free(llama_sampler_);
+  llama_sampler_ = nullptr;
+  llama_free(llama_ctx_);
+  llama_ctx_ = nullptr;
+  llama_model_free(llama_model_);
+  llama_model_ = nullptr;
+}
+
+std::optional<std::string> DefaultLlamaContext::applyTemplate(const std::vector<LlamaChatMessage>& messages) {
+  std::vector<llama_chat_message> llama_messages;
+  llama_messages.reserve(messages.size());
+  std::transform(messages.begin(), messages.end(), std::back_inserter(llama_messages),
+                 [](const LlamaChatMessage& msg) { return llama_chat_message{.role = msg.role.c_str(), .content = msg.content.c_str()}; });
+  std::string text;
+  text.resize(utils::configuration::DEFAULT_BUFFER_SIZE);
+  const char * chat_template = llama_model_chat_template(llama_model_, nullptr);
+  int32_t res_size = llama_chat_apply_template(chat_template, llama_messages.data(), llama_messages.size(), true, text.data(), gsl::narrow<int32_t>(text.size()));
+  if (res_size < 0) {
+    return std::nullopt;
+  }
+  if (res_size > gsl::narrow<int32_t>(text.size())) {
+    text.resize(res_size);
+    res_size = llama_chat_apply_template(chat_template, llama_messages.data(), llama_messages.size(), true, text.data(), gsl::narrow<int32_t>(text.size()));
+    if (res_size < 0) {
+      return std::nullopt;
+    }
+  }
+  text.resize(res_size);
+
+  return text;
+}
+
+nonstd::expected<GenerationResult, std::string> DefaultLlamaContext::generate(const std::string& input, std::function<void(std::string_view/*token*/)> token_handler) {
+  GenerationResult result{};
+  auto start_time = std::chrono::steady_clock::now();
+  const llama_vocab * vocab = llama_model_get_vocab(llama_model_);
+  std::vector<llama_token> tokenized_input = tokenizeInput(vocab, input);
+  result.num_tokens_in = gsl::narrow<uint64_t>(tokenized_input.size());
+
+  llama_batch batch = llama_batch_get_one(tokenized_input.data(), gsl::narrow<int32_t>(tokenized_input.size()));
+  llama_token new_token_id = 0;
+  bool first_token_generated = false;
+  while (true) {
+    int32_t res = llama_decode(llama_ctx_, batch);
+    if (res == 1) {
+      return nonstd::make_unexpected("Could not find a KV slot for the batch (try reducing the size of the batch or increase the context)");
+    } else if (res < 0) {
+      return nonstd::make_unexpected("Error occurred while executing llama decode");
+    }
+
+    new_token_id = llama_sampler_sample(llama_sampler_, llama_ctx_, -1);
+    if (!first_token_generated) {
+      result.time_to_first_token = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::steady_clock::now() - start_time);
+      first_token_generated = true;
+    }
+
+    if (llama_vocab_is_eog(vocab, new_token_id)) {
+      break;
+    }
+
+    ++result.num_tokens_out;
+    llama_sampler_accept(llama_sampler_, new_token_id);
+
+    std::array<char, 128> buf{};
+    int32_t len = llama_token_to_piece(vocab, new_token_id, buf.data(), gsl::narrow<int32_t>(buf.size()), 0, true);
+    if (len < 0) {
+      return nonstd::make_unexpected("Failed to convert token to text");
+    }
+    gsl_Assert(len < 128);
+
+    std::string_view token_str{buf.data(), gsl::narrow<std::string_view::size_type>(len)};
+    batch = llama_batch_get_one(&new_token_id, 1);
+    token_handler(token_str);
+  }
+
+  result.tokens_per_second =
+    gsl::narrow<double>(result.num_tokens_out) / (gsl::narrow<double>(std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::steady_clock::now() - start_time).count()) / 1000.0);
+  return result;
+}
+
+}  // namespace org::apache::nifi::minifi::extensions::llamacpp::processors
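For orientation (not part of the patch), a sketch of driving the class above end to end: construct it with a gguf model, render the chat template, then stream tokens through the callback. The include path and the model location below are assumptions; the model file matches the one pulled by the integration test:

#include <iostream>
#include <string>
#include <string_view>
#include <vector>

#include "DefaultLlamaContext.h"  // assumed to be on the include path

using namespace org::apache::nifi::minifi::extensions::llamacpp::processors;

int main() {
  // Placeholder model path; the integration test downloads this Qwen2 0.5B gguf model.
  DefaultLlamaContext ctx("/opt/minifi/minifi-current/models/Qwen2-0.5B-Instruct-IQ3_M.gguf",
                          LlamaSamplerParams{.temperature = 0.8F, .top_k = 40, .top_p = 0.9F, .min_keep = 0},
                          LlamaContextParams{.n_ctx = 4096, .n_batch = 2048, .n_ubatch = 512, .n_seq_max = 1, .n_threads = 4, .n_threads_batch = 4});

  std::vector<LlamaChatMessage> messages{{"system", "You are a helpful assistant."},
                                         {"user", "Repeat after me: banana banana banana"}};
  auto templated_input = ctx.applyTemplate(messages);
  if (!templated_input) { return 1; }

  std::string output;
  auto result = ctx.generate(*templated_input, [&](std::string_view token) { output += token; });
  if (!result) { std::cerr << result.error() << "\n"; return 1; }
  std::cout << output << "\n" << result->tokens_per_second << " tokens/s\n";
  return 0;
}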
diff --git a/extensions/llamacpp/processors/DefaultLlamaContext.h b/extensions/llamacpp/processors/DefaultLlamaContext.h
new file mode 100644
index 000000000..2d2bdf562
--- /dev/null
+++ b/extensions/llamacpp/processors/DefaultLlamaContext.h
@@ -0,0 +1,44 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "LlamaContext.h"
+#include "llama.h"
+#include "LlamaBackendInitializer.h"
+
+namespace org::apache::nifi::minifi::extensions::llamacpp::processors {
+
+class DefaultLlamaContext : public LlamaContext {
+ public:
+  DefaultLlamaContext(const std::filesystem::path& model_path, const LlamaSamplerParams& llama_sampler_params, const LlamaContextParams& llama_ctx_params);
+  DefaultLlamaContext(const DefaultLlamaContext&) = delete;
+  DefaultLlamaContext(DefaultLlamaContext&&) = delete;
+  DefaultLlamaContext& operator=(const DefaultLlamaContext&) = delete;
+  DefaultLlamaContext& operator=(DefaultLlamaContext&&) = delete;
+  ~DefaultLlamaContext() override;
+
+  std::optional<std::string> applyTemplate(const std::vector<LlamaChatMessage>& messages) override;
+  nonstd::expected<GenerationResult, std::string> generate(const std::string& input, std::function<void(std::string_view/*token*/)> token_handler) override;
+
+ private:
+  const LlamaBackendInitializer& llama_context_initializer_ = LlamaBackendInitializer::get();
+  llama_model* llama_model_{};
+  llama_context* llama_ctx_{};
+  llama_sampler* llama_sampler_{};
+};
+
+}  // namespace org::apache::nifi::minifi::extensions::llamacpp::processors
diff --git a/extensions/llamacpp/processors/LlamaBackendInitializer.cpp b/extensions/llamacpp/processors/LlamaBackendInitializer.cpp
new file mode 100644
index 000000000..9d4f94b79
--- /dev/null
+++ b/extensions/llamacpp/processors/LlamaBackendInitializer.cpp
@@ -0,0 +1,30 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "LlamaBackendInitializer.h"
+#include "llama.h"
+
+namespace org::apache::nifi::minifi::extensions::llamacpp::processors {
+
+LlamaBackendInitializer::LlamaBackendInitializer() {
+  llama_backend_init();
+}
+
+LlamaBackendInitializer::~LlamaBackendInitializer() {
+  llama_backend_free();
+}
+
+}  // namespace org::apache::nifi::minifi::extensions::llamacpp::processors
diff --git a/extensions/llamacpp/processors/LlamaBackendInitializer.h b/extensions/llamacpp/processors/LlamaBackendInitializer.h
new file mode 100644
index 000000000..9981e9bb0
--- /dev/null
+++ b/extensions/llamacpp/processors/LlamaBackendInitializer.h
@@ -0,0 +1,38 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+namespace org::apache::nifi::minifi::extensions::llamacpp::processors {
+
+class LlamaBackendInitializer {
+ public:
+  static LlamaBackendInitializer& get() {
+    static LlamaBackendInitializer instance;
+    return instance;
+  }
+
+  ~LlamaBackendInitializer();
+  LlamaBackendInitializer(const LlamaBackendInitializer&) = delete;
+  LlamaBackendInitializer& operator=(const LlamaBackendInitializer&) = delete;
+  LlamaBackendInitializer(LlamaBackendInitializer&&) = delete;
+  LlamaBackendInitializer& operator=(LlamaBackendInitializer&&) = delete;
+
+ private:
+  LlamaBackendInitializer();
+};
+
+}  // namespace org::apache::nifi::minifi::extensions::llamacpp::processors
diff --git a/extensions/llamacpp/processors/LlamaContext.h b/extensions/llamacpp/processors/LlamaContext.h
new file mode 100644
index 000000000..3c107c2c5
--- /dev/null
+++ b/extensions/llamacpp/processors/LlamaContext.h
@@ -0,0 +1,66 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <memory>
+#include <filesystem>
+#include <vector>
+#include <string_view>
+#include <string>
+#include <functional>
+#include <optional>
+#include "utils/expected.h"
+
+namespace org::apache::nifi::minifi::extensions::llamacpp::processors {
+
+struct LlamaChatMessage {
+  std::string role;
+  std::string content;
+};
+
+struct LlamaSamplerParams {
+  std::optional<float> temperature;
+  std::optional<int32_t> top_k;
+  std::optional<float> top_p;
+  std::optional<float> min_p;
+  uint64_t min_keep{};
+};
+
+struct LlamaContextParams {
+  uint32_t n_ctx{};
+  uint32_t n_batch{};
+  uint32_t n_ubatch{};
+  uint32_t n_seq_max{};
+  int32_t n_threads{};
+  int32_t n_threads_batch{};
+};
+
+struct GenerationResult {
+  std::chrono::milliseconds time_to_first_token{};
+  uint64_t num_tokens_in{};
+  uint64_t num_tokens_out{};
+  double tokens_per_second{};
+};
+
+class LlamaContext {
+ public:
+  virtual std::optional<std::string> applyTemplate(const std::vector<LlamaChatMessage>& messages) = 0;
+  virtual nonstd::expected<GenerationResult, std::string> generate(const std::string& input, std::function<void(std::string_view/*token*/)> token_handler) = 0;
+  virtual ~LlamaContext() = default;
+};
+
+}  // namespace org::apache::nifi::minifi::extensions::llamacpp::processors
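
To make the contract of this interface concrete, here is a small, hypothetical implementation sketch (EchoLlamaContext is not part of this commit): applyTemplate flattens the chat messages into a single prompt, and generate streams tokens through the callback and reports counts and timings in GenerationResult.

    #include <chrono>
    #include "LlamaContext.h"

    namespace llamacpp = org::apache::nifi::minifi::extensions::llamacpp::processors;

    // Hypothetical example implementation of the LlamaContext interface.
    class EchoLlamaContext : public llamacpp::LlamaContext {
     public:
      std::optional<std::string> applyTemplate(const std::vector<llamacpp::LlamaChatMessage>& messages) override {
        std::string prompt;
        for (const auto& message : messages) {
          prompt += message.role + ": " + message.content + "\n";  // trivial stand-in for a real chat template
        }
        return prompt;
      }

      nonstd::expected<llamacpp::GenerationResult, std::string> generate(
          const std::string& input, std::function<void(std::string_view)> token_handler) override {
        const auto start = std::chrono::steady_clock::now();
        token_handler(input);  // "generates" a single token by echoing the templated input
        llamacpp::GenerationResult result;
        result.time_to_first_token = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::steady_clock::now() - start);
        result.num_tokens_in = 1;
        result.num_tokens_out = 1;
        result.tokens_per_second = 1.0;
        return result;
      }
    };
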
diff --git a/extensions/llamacpp/processors/RunLlamaCppInference.cpp 
b/extensions/llamacpp/processors/RunLlamaCppInference.cpp
new file mode 100644
index 000000000..89de32eb3
--- /dev/null
+++ b/extensions/llamacpp/processors/RunLlamaCppInference.cpp
@@ -0,0 +1,155 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "RunLlamaCppInference.h"
+#include "core/ProcessContext.h"
+#include "core/ProcessSession.h"
+#include "core/Resource.h"
+#include "Exception.h"
+
+#include "rapidjson/document.h"
+#include "rapidjson/error/en.h"
+#include "LlamaContext.h"
+#include "utils/ProcessorConfigUtils.h"
+#include "DefaultLlamaContext.h"
+
+namespace org::apache::nifi::minifi::extensions::llamacpp::processors {
+
+void RunLlamaCppInference::initialize() {
+  setSupportedProperties(Properties);
+  setSupportedRelationships(Relationships);
+}
+
+void RunLlamaCppInference::onSchedule(core::ProcessContext& context, 
core::ProcessSessionFactory&) {
+  model_path_.clear();
+  model_path_ = utils::parseProperty(context, ModelPath);
+  system_prompt_ = context.getProperty(SystemPrompt).value_or("");
+
+  LlamaSamplerParams llama_sampler_params;
+  llama_sampler_params.temperature = 
utils::parseOptionalFloatProperty(context, Temperature);
+  if (auto top_k = utils::parseOptionalI64Property(context, TopK)) {
+    llama_sampler_params.top_k = gsl::narrow<int32_t>(*top_k);
+  }
+  llama_sampler_params.top_p = utils::parseOptionalFloatProperty(context, 
TopP);
+  llama_sampler_params.min_p = utils::parseOptionalFloatProperty(context, 
MinP);
+  llama_sampler_params.min_keep = utils::parseU64Property(context, MinKeep);
+
+  LlamaContextParams llama_ctx_params;
+  llama_ctx_params.n_ctx = 
gsl::narrow<uint32_t>(utils::parseU64Property(context, TextContextSize));
+  llama_ctx_params.n_batch = 
gsl::narrow<uint32_t>(utils::parseU64Property(context, 
LogicalMaximumBatchSize));
+  llama_ctx_params.n_ubatch = 
gsl::narrow<uint32_t>(utils::parseU64Property(context, 
PhysicalMaximumBatchSize));
+  llama_ctx_params.n_seq_max = 
gsl::narrow<uint32_t>(utils::parseU64Property(context, MaxNumberOfSequences));
+  llama_ctx_params.n_threads = 
gsl::narrow<int32_t>(utils::parseI64Property(context, ThreadsForGeneration));
+  llama_ctx_params.n_threads_batch = 
gsl::narrow<int32_t>(utils::parseI64Property(context, 
ThreadsForBatchProcessing));
+
+  if (llama_context_provider_) {
+    llama_ctx_ = llama_context_provider_(model_path_, llama_sampler_params, 
llama_ctx_params);
+  } else {
+    llama_ctx_ = std::make_unique<DefaultLlamaContext>(model_path_, 
llama_sampler_params, llama_ctx_params);
+  }
+}
+
+void RunLlamaCppInference::increaseTokensIn(uint64_t token_count) {
+  auto* const llamacpp_metrics = 
dynamic_cast<RunLlamaCppInferenceMetrics*>(metrics_.get());
+  gsl_Assert(llamacpp_metrics);
+  llamacpp_metrics->tokens_in += token_count;
+}
+
+void RunLlamaCppInference::increaseTokensOut(uint64_t token_count) {
+  auto* const llamacpp_metrics = 
dynamic_cast<RunLlamaCppInferenceMetrics*>(metrics_.get());
+  gsl_Assert(llamacpp_metrics);
+  llamacpp_metrics->tokens_out += token_count;
+}
+
+void RunLlamaCppInference::onTrigger(core::ProcessContext& context, 
core::ProcessSession& session) {
+  auto flow_file = session.get();
+  if (!flow_file) {
+    context.yield();
+    return;
+  }
+
+  auto prompt = context.getProperty(Prompt, flow_file.get()).value_or("");
+
+  auto read_result = session.readBuffer(flow_file);
+  std::string input_data_and_prompt;
+  if (!read_result.buffer.empty()) {
+    input_data_and_prompt.append("Input data (or flow file content):\n");
+    input_data_and_prompt.append({reinterpret_cast<const 
char*>(read_result.buffer.data()), read_result.buffer.size()});
+    input_data_and_prompt.append("\n\n");
+  }
+  input_data_and_prompt.append(prompt);
+
+  if (input_data_and_prompt.empty()) {
+    logger_->log_error("Input data and prompt are empty");
+    session.transfer(flow_file, Failure);
+    return;
+  }
+
+  auto input = [&] {
+    std::vector<LlamaChatMessage> messages;
+    if (!system_prompt_.empty()) {
+      messages.push_back({.role = "system", .content = system_prompt_});
+    }
+    messages.push_back({.role = "user", .content = input_data_and_prompt});
+
+    return llama_ctx_->applyTemplate(messages);
+  }();
+
+  if (!input) {
+    logger_->log_error("Inference failed with while applying template");
+    session.transfer(flow_file, Failure);
+    return;
+  }
+
+  logger_->log_debug("AI model input: {}", *input);
+
+  auto start_time = std::chrono::steady_clock::now();
+
+  std::string text;
+  auto generation_result = llama_ctx_->generate(*input, [&] (std::string_view 
token) {
+    text += token;
+  });
+
+  auto elapsed_time = 
std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::steady_clock::now()
 - start_time).count();
+
+  if (!generation_result) {
+    logger_->log_error("Inference failed with generation error: '{}'", 
generation_result.error());
+    session.transfer(flow_file, Failure);
+    return;
+  }
+
+  increaseTokensIn(generation_result->num_tokens_in);
+  increaseTokensOut(generation_result->num_tokens_out);
+
+  logger_->log_debug("Number of tokens generated: {}", 
generation_result->num_tokens_out);
+  logger_->log_debug("AI model inference time: {} ms", elapsed_time);
+  logger_->log_debug("AI model output: {}", text);
+
+  flow_file->setAttribute(LlamaCppTimeToFirstToken.name, 
std::to_string(generation_result->time_to_first_token.count()) + " ms");
+  flow_file->setAttribute(LlamaCppTokensPerSecond.name, fmt::format("{:.2f}", 
generation_result->tokens_per_second));
+
+  session.writeBuffer(flow_file, text);
+  session.transfer(flow_file, Success);
+}
+
+void RunLlamaCppInference::notifyStop() {
+  llama_ctx_.reset();
+}
+
+REGISTER_RESOURCE(RunLlamaCppInference, Processor);
+
+}  // namespace org::apache::nifi::minifi::extensions::llamacpp::processors
diff --git a/extensions/llamacpp/processors/RunLlamaCppInference.h 
b/extensions/llamacpp/processors/RunLlamaCppInference.h
new file mode 100644
index 000000000..a9fd90a02
--- /dev/null
+++ b/extensions/llamacpp/processors/RunLlamaCppInference.h
@@ -0,0 +1,198 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <mutex>
+#include <atomic>
+
+#include "core/Processor.h"
+#include "core/logging/LoggerFactory.h"
+#include "core/PropertyDefinitionBuilder.h"
+#include "LlamaContext.h"
+#include "core/ProcessorMetrics.h"
+
+namespace org::apache::nifi::minifi::extensions::llamacpp::processors {
+
+using LlamaContextProvider =
+  std::function<std::unique_ptr<LlamaContext>(const std::filesystem::path& 
model_path, const LlamaSamplerParams& llama_sampler_params, const 
LlamaContextParams& llama_ctx_params)>;
+
+class RunLlamaCppInferenceMetrics : public core::ProcessorMetricsImpl {
+ public:
+  explicit RunLlamaCppInferenceMetrics(const core::Processor& source_processor)
+  : core::ProcessorMetricsImpl(source_processor) {
+  }
+
+  std::vector<state::response::SerializedResponseNode> serialize() override {
+    auto resp = core::ProcessorMetricsImpl::serialize();
+    auto& root_node = resp[0];
+
+    state::response::SerializedResponseNode tokens_in_node{"TokensIn", 
tokens_in.load()};
+    root_node.children.push_back(tokens_in_node);
+
+    state::response::SerializedResponseNode tokens_out_node{"TokensOut", 
tokens_out.load()};
+    root_node.children.push_back(tokens_out_node);
+
+    return resp;
+  }
+
+  std::vector<state::PublishedMetric> calculateMetrics() override {
+    auto metrics = core::ProcessorMetricsImpl::calculateMetrics();
+    metrics.push_back({"tokens_in", static_cast<double>(tokens_in.load()), 
getCommonLabels()});
+    metrics.push_back({"tokens_out", static_cast<double>(tokens_out.load()), 
getCommonLabels()});
+    return metrics;
+  }
+
+  std::atomic<uint64_t> tokens_in{0};
+  std::atomic<uint64_t> tokens_out{0};
+};
+
+class RunLlamaCppInference : public core::ProcessorImpl {
+ public:
+  explicit RunLlamaCppInference(std::string_view name, const 
utils::Identifier& uuid = {}, LlamaContextProvider llama_context_provider = {})
+      : core::ProcessorImpl(name, uuid),
+        llama_context_provider_(std::move(llama_context_provider)) {
+    metrics_ = 
gsl::make_not_null(std::make_shared<RunLlamaCppInferenceMetrics>(*this));
+  }
+  ~RunLlamaCppInference() override = default;
+
+  EXTENSIONAPI static constexpr const char* Description = "LlamaCpp processor that uses the llama.cpp library to run language model inference. "
+      "The inference will be based on the System Prompt and the Prompt 
property values, together with the content of the incoming flow file. "
+      "In the Prompt, the content of the incoming flow file can be referred to 
as 'the input data' or 'the flow file content'.";
+
+  EXTENSIONAPI static constexpr auto ModelPath = 
core::PropertyDefinitionBuilder<>::createProperty("Model Path")
+      .withDescription("The filesystem path of the model file in gguf format.")
+      .isRequired(true)
+      .build();
+  EXTENSIONAPI static constexpr auto Temperature = 
core::PropertyDefinitionBuilder<>::createProperty("Temperature")
+      .withDescription("The temperature to use for sampling.")
+      .withDefaultValue("0.8")
+      .build();
+  EXTENSIONAPI static constexpr auto TopK = 
core::PropertyDefinitionBuilder<>::createProperty("Top K")
+      .withDescription("Limit the next token selection to the K most probable 
tokens. Set <= 0 value to use vocab size.")
+      .withDefaultValue("40")
+      .build();
+  EXTENSIONAPI static constexpr auto TopP = 
core::PropertyDefinitionBuilder<>::createProperty("Top P")
+      .withDescription("Limit the next token selection to a subset of tokens 
with a cumulative probability above a threshold P. 1.0 = disabled.")
+      .withDefaultValue("0.9")
+      .build();
+  EXTENSIONAPI static constexpr auto MinP = 
core::PropertyDefinitionBuilder<>::createProperty("Min P")
+      .withDescription("Sets a minimum base probability threshold for token 
selection. 0.0 = disabled.")
+      .build();
+  EXTENSIONAPI static constexpr auto MinKeep = 
core::PropertyDefinitionBuilder<>::createProperty("Min Keep")
+      .withDescription("If greater than 0, force samplers to return N possible 
tokens at minimum.")
+      .isRequired(true)
+      
.withValidator(core::StandardPropertyValidators::UNSIGNED_INTEGER_VALIDATOR)
+      .withDefaultValue("0")
+      .build();
+  EXTENSIONAPI static constexpr auto TextContextSize = 
core::PropertyDefinitionBuilder<>::createProperty("Text Context Size")
+      .withDescription("Size of the text context, use 0 to use size set in 
model.")
+      .isRequired(true)
+      
.withValidator(core::StandardPropertyValidators::UNSIGNED_INTEGER_VALIDATOR)
+      .withDefaultValue("4096")
+      .build();
+  EXTENSIONAPI static constexpr auto LogicalMaximumBatchSize = 
core::PropertyDefinitionBuilder<>::createProperty("Logical Maximum Batch Size")
+      .withDescription("Logical maximum batch size that can be submitted to 
the llama.cpp decode function.")
+      .isRequired(true)
+      
.withValidator(core::StandardPropertyValidators::UNSIGNED_INTEGER_VALIDATOR)
+      .withDefaultValue("2048")
+      .build();
+  EXTENSIONAPI static constexpr auto PhysicalMaximumBatchSize = 
core::PropertyDefinitionBuilder<>::createProperty("Physical Maximum Batch Size")
+      .withDescription("Physical maximum batch size.")
+      .isRequired(true)
+      
.withValidator(core::StandardPropertyValidators::UNSIGNED_INTEGER_VALIDATOR)
+      .withDefaultValue("512")
+      .build();
+  EXTENSIONAPI static constexpr auto MaxNumberOfSequences = 
core::PropertyDefinitionBuilder<>::createProperty("Max Number Of Sequences")
+      .withDescription("Maximum number of sequences (i.e. distinct states for 
recurrent models).")
+      .isRequired(true)
+      
.withValidator(core::StandardPropertyValidators::UNSIGNED_INTEGER_VALIDATOR)
+      .withDefaultValue("1")
+      .build();
+  EXTENSIONAPI static constexpr auto ThreadsForGeneration = 
core::PropertyDefinitionBuilder<>::createProperty("Threads For Generation")
+      .withDescription("Number of threads to use for generation.")
+      .isRequired(true)
+      .withValidator(core::StandardPropertyValidators::INTEGER_VALIDATOR)
+      .withDefaultValue("4")
+      .build();
+  EXTENSIONAPI static constexpr auto ThreadsForBatchProcessing = 
core::PropertyDefinitionBuilder<>::createProperty("Threads For Batch 
Processing")
+      .withDescription("Number of threads to use for batch processing.")
+      .isRequired(true)
+      .withValidator(core::StandardPropertyValidators::INTEGER_VALIDATOR)
+      .withDefaultValue("4")
+      .build();
+  EXTENSIONAPI static constexpr auto Prompt = 
core::PropertyDefinitionBuilder<>::createProperty("Prompt")
+      .withDescription("The user prompt for the inference.")
+      .supportsExpressionLanguage(true)
+      .build();
+  EXTENSIONAPI static constexpr auto SystemPrompt = 
core::PropertyDefinitionBuilder<>::createProperty("System Prompt")
+      .withDescription("The system prompt for the inference.")
+      .withDefaultValue("You are a helpful assistant. You are given a question 
with some possible input data otherwise called flow file content. "
+                        "You are expected to generate a response based on the 
question and the input data.")
+      .build();
+
+  EXTENSIONAPI static constexpr auto Properties = 
std::to_array<core::PropertyReference>({
+    ModelPath,
+    Temperature,
+    TopK,
+    TopP,
+    MinP,
+    MinKeep,
+    TextContextSize,
+    LogicalMaximumBatchSize,
+    PhysicalMaximumBatchSize,
+    MaxNumberOfSequences,
+    ThreadsForGeneration,
+    ThreadsForBatchProcessing,
+    Prompt,
+    SystemPrompt
+  });
+
+
+  EXTENSIONAPI static constexpr auto Success = 
core::RelationshipDefinition{"success", "Generated results from the model"};
+  EXTENSIONAPI static constexpr auto Failure = 
core::RelationshipDefinition{"failure", "Generation failed"};
+  EXTENSIONAPI static constexpr auto Relationships = std::array{Success, 
Failure};
+
+  EXTENSIONAPI static constexpr auto LlamaCppTimeToFirstToken = 
core::OutputAttributeDefinition<>{"llamacpp.time.to.first.token", {Success}, 
"Time to first token generated in milliseconds."};
+  EXTENSIONAPI static constexpr auto LlamaCppTokensPerSecond = 
core::OutputAttributeDefinition<>{"llamacpp.tokens.per.second", {Success}, 
"Tokens generated per second."};
+  EXTENSIONAPI static constexpr auto OutputAttributes = 
std::to_array<core::OutputAttributeReference>({LlamaCppTimeToFirstToken, 
LlamaCppTokensPerSecond});
+
+  EXTENSIONAPI static constexpr bool SupportsDynamicProperties = false;
+  EXTENSIONAPI static constexpr bool SupportsDynamicRelationships = false;
+  EXTENSIONAPI static constexpr core::annotation::Input InputRequirement = 
core::annotation::Input::INPUT_REQUIRED;
+  EXTENSIONAPI static constexpr bool IsSingleThreaded = true;
+
+  ADD_COMMON_VIRTUAL_FUNCTIONS_FOR_PROCESSORS
+
+  void onSchedule(core::ProcessContext& context, core::ProcessSessionFactory& 
session_factory) override;
+  void onTrigger(core::ProcessContext& context, core::ProcessSession& session) 
override;
+  void initialize() override;
+  void notifyStop() override;
+
+ private:
+  void increaseTokensIn(uint64_t token_count);
+  void increaseTokensOut(uint64_t token_count);
+  std::shared_ptr<core::logging::Logger> logger_ = 
core::logging::LoggerFactory<RunLlamaCppInference>::getLogger(uuid_);
+
+  std::string model_path_;
+  std::string system_prompt_;
+
+  LlamaContextProvider llama_context_provider_;
+  std::unique_ptr<LlamaContext> llama_ctx_;
+};
+
+}  // namespace org::apache::nifi::minifi::extensions::llamacpp::processors
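
As a usage note, the trailing LlamaContextProvider constructor argument is the seam that lets callers bypass DefaultLlamaContext; the unit tests below use exactly this hook to inject a mock. A hedged sketch, reusing the hypothetical EchoLlamaContext from the earlier example:

    #include <memory>
    #include "RunLlamaCppInference.h"

    namespace llamacpp = org::apache::nifi::minifi::extensions::llamacpp::processors;

    // Hypothetical sketch: construct the processor with a custom context provider
    // instead of letting onSchedule() create a DefaultLlamaContext.
    std::unique_ptr<llamacpp::RunLlamaCppInference> makeProcessorWithEchoContext() {
      return std::make_unique<llamacpp::RunLlamaCppInference>(
          "RunLlamaCppInference", org::apache::nifi::minifi::utils::Identifier(),
          [](const std::filesystem::path& /*model_path*/,
             const llamacpp::LlamaSamplerParams& /*sampler_params*/,
             const llamacpp::LlamaContextParams& /*context_params*/) -> std::unique_ptr<llamacpp::LlamaContext> {
            return std::make_unique<EchoLlamaContext>();  // EchoLlamaContext: placeholder implementation from the sketch above
          });
    }
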
diff --git a/extensions/llamacpp/tests/CMakeLists.txt 
b/extensions/llamacpp/tests/CMakeLists.txt
new file mode 100644
index 000000000..d1cd79e33
--- /dev/null
+++ b/extensions/llamacpp/tests/CMakeLists.txt
@@ -0,0 +1,37 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+file(GLOB LLAMACPP_TESTS  "*.cpp")
+
+SET(EXTENSIONS_TEST_COUNT 0)
+FOREACH(testfile ${LLAMACPP_TESTS})
+    get_filename_component(testfilename "${testfile}" NAME_WE)
+    add_minifi_executable(${testfilename} "${testfile}")
+    target_include_directories(${testfilename} BEFORE PRIVATE 
"${CMAKE_SOURCE_DIR}/libminifi/include")
+    target_include_directories(${testfilename} BEFORE PRIVATE 
"${CMAKE_SOURCE_DIR}/extensions/llamacpp/processors")
+    createTests(${testfilename})
+    target_link_libraries(${testfilename} Catch2WithMain)
+    target_link_libraries(${testfilename} minifi-llamacpp)
+    target_link_libraries(${testfilename} minifi-standard-processors)
+    target_compile_definitions("${testfilename}" PRIVATE 
TZ_DATA_DIR="${CMAKE_BINARY_DIR}/tzdata")
+
+    MATH(EXPR EXTENSIONS_TEST_COUNT "${EXTENSIONS_TEST_COUNT}+1")
+    add_test(NAME ${testfilename} COMMAND ${testfilename} WORKING_DIRECTORY 
${TEST_DIR})
+ENDFOREACH()
+message("-- Finished building ${EXTENSIONS_TEST_COUNT} llama.cpp related test 
file(s)...")
diff --git a/extensions/llamacpp/tests/RunLlamaCppInferenceTests.cpp 
b/extensions/llamacpp/tests/RunLlamaCppInferenceTests.cpp
new file mode 100644
index 000000000..fe78109af
--- /dev/null
+++ b/extensions/llamacpp/tests/RunLlamaCppInferenceTests.cpp
@@ -0,0 +1,345 @@
+/**
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "unit/TestBase.h"
+#include "unit/Catch.h"
+#include "RunLlamaCppInference.h"
+#include "unit/SingleProcessorTestController.h"
+#include "core/FlowFile.h"
+
+namespace org::apache::nifi::minifi::extensions::llamacpp::test {
+
+class MockLlamaContext : public processors::LlamaContext {
+ public:
+  std::optional<std::string> applyTemplate(const 
std::vector<processors::LlamaChatMessage>& messages) override {
+    if (fail_apply_template_) {
+      return std::nullopt;
+    }
+    messages_ = messages;
+    return "Test input";
+  }
+
+  nonstd::expected<processors::GenerationResult, std::string> generate(const 
std::string& input, std::function<void(std::string_view/*token*/)> 
token_handler) override {
+    if (fail_generation_) {
+      return nonstd::make_unexpected("Generation failed");
+    }
+    processors::GenerationResult result;
+    input_ = input;
+    token_handler("Test ");
+    token_handler("generated");
+    token_handler(" content");
+    result.time_to_first_token = std::chrono::milliseconds(100);
+    result.num_tokens_in = 10;
+    result.num_tokens_out = 3;
+    result.tokens_per_second = 2.0;
+    return result;
+  }
+
+  [[nodiscard]] const std::vector<processors::LlamaChatMessage>& getMessages() 
const {
+    return messages_;
+  }
+
+  [[nodiscard]] const std::string& getInput() const {
+    return input_;
+  }
+
+  void setGenerationFailure() {
+    fail_generation_ = true;
+  }
+
+  void setApplyTemplateFailure() {
+    fail_apply_template_ = true;
+  }
+
+ private:
+  bool fail_generation_{false};
+  bool fail_apply_template_{false};
+  std::vector<processors::LlamaChatMessage> messages_;
+  std::string input_;
+};
+
+TEST_CASE("Prompt is generated correctly with default parameters") {
+  auto mock_llama_context = std::make_unique<MockLlamaContext>();
+  auto mock_llama_context_ptr = mock_llama_context.get();
+  std::filesystem::path test_model_path;
+  processors::LlamaSamplerParams test_sampler_params;
+  processors::LlamaContextParams test_context_params;
+  minifi::test::SingleProcessorTestController 
controller(std::make_unique<processors::RunLlamaCppInference>("RunLlamaCppInference",
 utils::Identifier(),
+    [&](const std::filesystem::path& model_path, const 
processors::LlamaSamplerParams& sampler_params, const 
processors::LlamaContextParams& context_params) {
+      test_model_path = model_path;
+      test_sampler_params = sampler_params;
+      test_context_params = context_params;
+      return std::move(mock_llama_context);
+    }));
+  
LogTestController::getInstance().setTrace<processors::RunLlamaCppInference>();
+  
controller.getProcessor()->setProperty(processors::RunLlamaCppInference::ModelPath.name,
 "Dummy model");
+  
controller.getProcessor()->setProperty(processors::RunLlamaCppInference::Prompt.name,
 "Question: What is the answer to life, the universe and everything?");
+
+  auto results = controller.trigger(minifi::test::InputFlowFileData{.content = 
"42", .attributes = {}});
+  CHECK(test_model_path == "Dummy model");
+  CHECK(test_sampler_params.temperature == 0.8F);
+  CHECK(test_sampler_params.top_k == 40);
+  CHECK(test_sampler_params.top_p == 0.9F);
+  CHECK(test_sampler_params.min_p == std::nullopt);
+  CHECK(test_sampler_params.min_keep == 0);
+  CHECK(test_context_params.n_ctx == 4096);
+  CHECK(test_context_params.n_batch == 2048);
+  CHECK(test_context_params.n_ubatch == 512);
+  CHECK(test_context_params.n_seq_max == 1);
+  CHECK(test_context_params.n_threads == 4);
+  CHECK(test_context_params.n_threads_batch == 4);
+
+  REQUIRE(results.at(processors::RunLlamaCppInference::Success).size() == 1);
+  auto& output_flow_file = 
results.at(processors::RunLlamaCppInference::Success)[0];
+  
CHECK(*output_flow_file->getAttribute(processors::RunLlamaCppInference::LlamaCppTimeToFirstToken.name)
 == "100 ms");
+  
CHECK(*output_flow_file->getAttribute(processors::RunLlamaCppInference::LlamaCppTokensPerSecond.name)
 == "2.00");
+  CHECK(controller.plan->getContent(output_flow_file) == "Test generated 
content");
+  CHECK(mock_llama_context_ptr->getInput() == "Test input");
+  REQUIRE(mock_llama_context_ptr->getMessages().size() == 2);
+  CHECK(mock_llama_context_ptr->getMessages()[0].role == "system");
+  CHECK(mock_llama_context_ptr->getMessages()[0].content == "You are a helpful 
assistant. You are given a question with some possible input data otherwise 
called flow file content. "
+                                                            "You are expected 
to generate a response based on the question and the input data.");
+  CHECK(mock_llama_context_ptr->getMessages()[1].role == "user");
+  CHECK(mock_llama_context_ptr->getMessages()[1].content == "Input data (or 
flow file content):\n42\n\nQuestion: What is the answer to life, the universe 
and everything?");
+}
+
+TEST_CASE("Prompt is generated correctly with custom parameters") {
+  auto mock_llama_context = std::make_unique<MockLlamaContext>();
+  auto mock_llama_context_ptr = mock_llama_context.get();
+  std::filesystem::path test_model_path;
+  processors::LlamaSamplerParams test_sampler_params;
+  processors::LlamaContextParams test_context_params;
+  minifi::test::SingleProcessorTestController 
controller(std::make_unique<processors::RunLlamaCppInference>("RunLlamaCppInference",
 utils::Identifier(),
+    [&](const std::filesystem::path& model_path, const 
processors::LlamaSamplerParams& sampler_params, const 
processors::LlamaContextParams& context_params) {
+      test_model_path = model_path;
+      test_sampler_params = sampler_params;
+      test_context_params = context_params;
+      return std::move(mock_llama_context);
+    }));
+  
LogTestController::getInstance().setTrace<processors::RunLlamaCppInference>();
+  
controller.getProcessor()->setProperty(processors::RunLlamaCppInference::ModelPath.name,
 "/path/to/model");
+  
controller.getProcessor()->setProperty(processors::RunLlamaCppInference::Prompt.name,
 "Question: What is the answer to life, the universe and everything?");
+  
controller.getProcessor()->setProperty(processors::RunLlamaCppInference::Temperature.name,
 "0.4");
+  
controller.getProcessor()->setProperty(processors::RunLlamaCppInference::TopK.name,
 "20");
+  
controller.getProcessor()->setProperty(processors::RunLlamaCppInference::TopP.name,
 "");
+  
controller.getProcessor()->setProperty(processors::RunLlamaCppInference::MinP.name,
 "0.1");
+  
controller.getProcessor()->setProperty(processors::RunLlamaCppInference::MinKeep.name,
 "1");
+  
controller.getProcessor()->setProperty(processors::RunLlamaCppInference::TextContextSize.name,
 "4096");
+  
controller.getProcessor()->setProperty(processors::RunLlamaCppInference::LogicalMaximumBatchSize.name,
 "1024");
+  
controller.getProcessor()->setProperty(processors::RunLlamaCppInference::PhysicalMaximumBatchSize.name,
 "796");
+  
controller.getProcessor()->setProperty(processors::RunLlamaCppInference::MaxNumberOfSequences.name,
 "2");
+  
controller.getProcessor()->setProperty(processors::RunLlamaCppInference::ThreadsForGeneration.name,
 "12");
+  
controller.getProcessor()->setProperty(processors::RunLlamaCppInference::ThreadsForBatchProcessing.name,
 "8");
+  
controller.getProcessor()->setProperty(processors::RunLlamaCppInference::SystemPrompt.name,
 "Whatever");
+
+  auto results = controller.trigger(minifi::test::InputFlowFileData{.content = 
"42", .attributes = {}});
+  CHECK(test_model_path == "/path/to/model");
+  CHECK(test_sampler_params.temperature == 0.4F);
+  CHECK(test_sampler_params.top_k == 20);
+  CHECK(test_sampler_params.top_p == std::nullopt);
+  CHECK(test_sampler_params.min_p == 0.1F);
+  CHECK(test_sampler_params.min_keep == 1);
+  CHECK(test_context_params.n_ctx == 4096);
+  CHECK(test_context_params.n_batch == 1024);
+  CHECK(test_context_params.n_ubatch == 796);
+  CHECK(test_context_params.n_seq_max == 2);
+  CHECK(test_context_params.n_threads == 12);
+  CHECK(test_context_params.n_threads_batch == 8);
+
+  REQUIRE(results.at(processors::RunLlamaCppInference::Success).size() == 1);
+  auto& output_flow_file = 
results.at(processors::RunLlamaCppInference::Success)[0];
+  CHECK(controller.plan->getContent(output_flow_file) == "Test generated 
content");
+  CHECK(mock_llama_context_ptr->getInput() == "Test input");
+  REQUIRE(mock_llama_context_ptr->getMessages().size() == 2);
+  CHECK(mock_llama_context_ptr->getMessages()[0].role == "system");
+  CHECK(mock_llama_context_ptr->getMessages()[0].content == "Whatever");
+  CHECK(mock_llama_context_ptr->getMessages()[1].role == "user");
+  CHECK(mock_llama_context_ptr->getMessages()[1].content == "Input data (or 
flow file content):\n42\n\nQuestion: What is the answer to life, the universe 
and everything?");
+}
+
+TEST_CASE("Empty flow file does not include input data in prompt") {
+  auto mock_llama_context = std::make_unique<MockLlamaContext>();
+  auto mock_llama_context_ptr = mock_llama_context.get();
+  minifi::test::SingleProcessorTestController 
controller(std::make_unique<processors::RunLlamaCppInference>("RunLlamaCppInference",
 utils::Identifier(),
+    [&](const std::filesystem::path&, const processors::LlamaSamplerParams&, 
const processors::LlamaContextParams&) {
+      return std::move(mock_llama_context);
+    }));
+  
LogTestController::getInstance().setTrace<processors::RunLlamaCppInference>();
+  
controller.getProcessor()->setProperty(processors::RunLlamaCppInference::ModelPath.name,
 "Dummy model");
+  
controller.getProcessor()->setProperty(processors::RunLlamaCppInference::Prompt.name,
 "Question: What is the answer to life, the universe and everything?");
+
+  auto results = controller.trigger(minifi::test::InputFlowFileData{.content = 
"", .attributes = {}});
+
+  REQUIRE(results.at(processors::RunLlamaCppInference::Success).size() == 1);
+  auto& output_flow_file = 
results.at(processors::RunLlamaCppInference::Success)[0];
+  CHECK(controller.plan->getContent(output_flow_file) == "Test generated 
content");
+  CHECK(mock_llama_context_ptr->getInput() == "Test input");
+  REQUIRE(mock_llama_context_ptr->getMessages().size() == 2);
+  CHECK(mock_llama_context_ptr->getMessages()[0].role == "system");
+  CHECK(mock_llama_context_ptr->getMessages()[0].content == "You are a helpful 
assistant. You are given a question with some possible input data otherwise 
called flow file content. "
+                                                            "You are expected 
to generate a response based on the question and the input data.");
+  CHECK(mock_llama_context_ptr->getMessages()[1].role == "user");
+  CHECK(mock_llama_context_ptr->getMessages()[1].content == "Question: What is 
the answer to life, the universe and everything?");
+}
+
+TEST_CASE("Invalid values for optional double type properties throw 
exception") {
+  minifi::test::SingleProcessorTestController 
controller(std::make_unique<processors::RunLlamaCppInference>("RunLlamaCppInference",
 utils::Identifier(),
+    [&](const std::filesystem::path&, const processors::LlamaSamplerParams&, 
const processors::LlamaContextParams&) {
+      return std::make_unique<MockLlamaContext>();
+    }));
+  
LogTestController::getInstance().setTrace<processors::RunLlamaCppInference>();
+  
controller.getProcessor()->setProperty(processors::RunLlamaCppInference::ModelPath.name,
 "Dummy model");
+  
controller.getProcessor()->setProperty(processors::RunLlamaCppInference::Prompt.name,
 "Question: What is the answer to life, the universe and everything?");
+
+  std::string property_name;
+  SECTION("Invalid value for Temperature property") {
+    
controller.getProcessor()->setProperty(processors::RunLlamaCppInference::Temperature.name,
 "invalid_value");
+    property_name = processors::RunLlamaCppInference::Temperature.name;
+  }
+  SECTION("Invalid value for Top P property") {
+    
controller.getProcessor()->setProperty(processors::RunLlamaCppInference::TopP.name,
 "invalid_value");
+    property_name = processors::RunLlamaCppInference::TopP.name;
+  }
+  SECTION("Invalid value for Min P property") {
+    
controller.getProcessor()->setProperty(processors::RunLlamaCppInference::MinP.name,
 "invalid_value");
+    property_name = processors::RunLlamaCppInference::MinP.name;
+  }
+
+  
REQUIRE_THROWS_WITH(controller.trigger(minifi::test::InputFlowFileData{.content 
= "42", .attributes = {}}),
+                      fmt::format("Expected parsable float from 
RunLlamaCppInference::{}: parsing error: GeneralParsingError (0)", 
property_name));
+}
+
+TEST_CASE("Top K property empty and invalid values are handled properly") {
+  std::optional<int32_t> test_top_k = 0;
+  minifi::test::SingleProcessorTestController 
controller(std::make_unique<processors::RunLlamaCppInference>("RunLlamaCppInference",
 utils::Identifier(),
+    [&](const std::filesystem::path&, const processors::LlamaSamplerParams& 
sampler_params, const processors::LlamaContextParams&) {
+      test_top_k = sampler_params.top_k;
+      return std::make_unique<MockLlamaContext>();
+    }));
+  
LogTestController::getInstance().setTrace<processors::RunLlamaCppInference>();
+  
controller.getProcessor()->setProperty(processors::RunLlamaCppInference::ModelPath.name,
 "Dummy model");
+  
controller.getProcessor()->setProperty(processors::RunLlamaCppInference::Prompt.name,
 "Question: What is the answer to life, the universe and everything?");
+  SECTION("Empty value for Top K property") {
+    
controller.getProcessor()->setProperty(processors::RunLlamaCppInference::TopK.name,
 "");
+    auto results = controller.trigger(minifi::test::InputFlowFileData{.content 
= "42", .attributes = {}});
+    REQUIRE(test_top_k == std::nullopt);
+  }
+  SECTION("Invalid value for Top K property") {
+    
controller.getProcessor()->setProperty(processors::RunLlamaCppInference::TopK.name,
 "invalid_value");
+    
REQUIRE_THROWS_WITH(controller.trigger(minifi::test::InputFlowFileData{.content 
= "42", .attributes = {}}),
+                        "Expected parsable int64_t from 
RunLlamaCppInference::Top K: parsing error: GeneralParsingError (0)");
+  }
+}
+
+TEST_CASE("Error handling during generation and applying template") {
+  auto mock_llama_context = std::make_unique<MockLlamaContext>();
+
+  SECTION("Generation fails with error") {
+    mock_llama_context->setGenerationFailure();
+  }
+
+  SECTION("Applying template fails with error") {
+    mock_llama_context->setApplyTemplateFailure();
+  }
+
+  minifi::test::SingleProcessorTestController 
controller(std::make_unique<processors::RunLlamaCppInference>("RunLlamaCppInference",
 utils::Identifier(),
+    [&](const std::filesystem::path&, const processors::LlamaSamplerParams&, 
const processors::LlamaContextParams&) {
+      return std::move(mock_llama_context);
+    }));
+  
LogTestController::getInstance().setTrace<processors::RunLlamaCppInference>();
+  
controller.getProcessor()->setProperty(processors::RunLlamaCppInference::ModelPath.name,
 "/path/to/model");
+  
controller.getProcessor()->setProperty(processors::RunLlamaCppInference::Prompt.name,
 "Question: What is the answer to life, the universe and everything?");
+
+  auto results = controller.trigger(minifi::test::InputFlowFileData{.content = 
"42", .attributes = {}});
+
+  REQUIRE(results.at(processors::RunLlamaCppInference::Success).empty());
+  REQUIRE(results.at(processors::RunLlamaCppInference::Failure).size() == 1);
+  auto& output_flow_file = 
results.at(processors::RunLlamaCppInference::Failure)[0];
+  CHECK(controller.plan->getContent(output_flow_file) == "42");
+}
+
+TEST_CASE("Route flow file to failure when prompt and input data is empty") {
+  minifi::test::SingleProcessorTestController 
controller(std::make_unique<processors::RunLlamaCppInference>("RunLlamaCppInference",
 utils::Identifier(),
+    [&](const std::filesystem::path&, const processors::LlamaSamplerParams&, 
const processors::LlamaContextParams&) {
+      return std::make_unique<MockLlamaContext>();
+    }));
+  
LogTestController::getInstance().setTrace<processors::RunLlamaCppInference>();
+  
controller.getProcessor()->setProperty(processors::RunLlamaCppInference::ModelPath.name,
 "/path/to/model");
+  
controller.getProcessor()->setProperty(processors::RunLlamaCppInference::Prompt.name,
 "");
+
+  auto results = controller.trigger(minifi::test::InputFlowFileData{.content = 
"", .attributes = {}});
+
+  REQUIRE(results.at(processors::RunLlamaCppInference::Success).empty());
+  REQUIRE(results.at(processors::RunLlamaCppInference::Failure).size() == 1);
+  auto& output_flow_file = 
results.at(processors::RunLlamaCppInference::Failure)[0];
+  CHECK(controller.plan->getContent(output_flow_file).empty());
+}
+
+TEST_CASE("System prompt is optional") {
+  auto mock_llama_context = std::make_unique<MockLlamaContext>();
+  auto mock_llama_context_ptr = mock_llama_context.get();
+  minifi::test::SingleProcessorTestController 
controller(std::make_unique<processors::RunLlamaCppInference>("RunLlamaCppInference",
 utils::Identifier(),
+    [&](const std::filesystem::path&, const processors::LlamaSamplerParams&, 
const processors::LlamaContextParams&) {
+      return std::move(mock_llama_context);
+    }));
+  
LogTestController::getInstance().setTrace<processors::RunLlamaCppInference>();
+  
controller.getProcessor()->setProperty(processors::RunLlamaCppInference::ModelPath.name,
 "Dummy model");
+  
controller.getProcessor()->setProperty(processors::RunLlamaCppInference::Prompt.name,
 "Question: What is the answer to life, the universe and everything?");
+  
controller.getProcessor()->setProperty(processors::RunLlamaCppInference::SystemPrompt.name,
 "");
+
+  auto results = controller.trigger(minifi::test::InputFlowFileData{.content = 
"42", .attributes = {}});
+
+  REQUIRE(results.at(processors::RunLlamaCppInference::Success).size() == 1);
+  auto& output_flow_file = 
results.at(processors::RunLlamaCppInference::Success)[0];
+  CHECK(controller.plan->getContent(output_flow_file) == "Test generated 
content");
+  CHECK(mock_llama_context_ptr->getInput() == "Test input");
+  REQUIRE(mock_llama_context_ptr->getMessages().size() == 1);
+  CHECK(mock_llama_context_ptr->getMessages()[0].role == "user");
+  CHECK(mock_llama_context_ptr->getMessages()[0].content == "Input data (or 
flow file content):\n42\n\nQuestion: What is the answer to life, the universe 
and everything?");
+}
+
+TEST_CASE("Test output metrics") {
+  auto processor = 
std::make_unique<processors::RunLlamaCppInference>("RunLlamaCppInference", 
utils::Identifier(),
+    [&](const std::filesystem::path&, const processors::LlamaSamplerParams&, 
const processors::LlamaContextParams&) {
+      return std::make_unique<MockLlamaContext>();
+    });
+  auto processor_metrics = processor->getMetrics();
+  minifi::test::SingleProcessorTestController controller(std::move(processor));
+  
LogTestController::getInstance().setTrace<processors::RunLlamaCppInference>();
+  
controller.getProcessor()->setProperty(processors::RunLlamaCppInference::ModelPath.name,
 "Dummy model");
+  
controller.getProcessor()->setProperty(processors::RunLlamaCppInference::Prompt.name,
 "Question: What is the answer to life, the universe and everything?");
+
+  controller.trigger(minifi::test::InputFlowFileData{.content = "42", 
.attributes = {}});
+  auto results = controller.trigger(minifi::test::InputFlowFileData{.content = 
"42", .attributes = {}});
+
+  REQUIRE(results.at(processors::RunLlamaCppInference::Success).size() == 1);
+  auto prometheus_metrics = processor_metrics->calculateMetrics();
+  REQUIRE(prometheus_metrics.size() >= 2);
+  CHECK(prometheus_metrics[prometheus_metrics.size() - 2].name == "tokens_in");
+  CHECK(prometheus_metrics[prometheus_metrics.size() - 2].value == 20);
+  CHECK(prometheus_metrics[prometheus_metrics.size() - 1].name == 
"tokens_out");
+  CHECK(prometheus_metrics[prometheus_metrics.size() - 1].value == 6);
+  auto c2_metrics = processor_metrics->serialize();
+  REQUIRE_FALSE(c2_metrics.empty());
+  REQUIRE(c2_metrics[0].children.size() >= 2);
+  CHECK(c2_metrics[0].children[c2_metrics[0].children.size() - 2].name == 
"TokensIn");
+  CHECK(c2_metrics[0].children[c2_metrics[0].children.size() - 
2].value.to_string() == "20");
+  CHECK(c2_metrics[0].children[c2_metrics[0].children.size() - 1].name == 
"TokensOut");
+  CHECK(c2_metrics[0].children[c2_metrics[0].children.size() - 
1].value.to_string() == "6");
+}
+
+}  // namespace org::apache::nifi::minifi::extensions::llamacpp::test
diff --git a/thirdparty/llamacpp/lu8_macro_fix.patch 
b/thirdparty/llamacpp/lu8_macro_fix.patch
new file mode 100644
index 000000000..a1b92d28b
--- /dev/null
+++ b/thirdparty/llamacpp/lu8_macro_fix.patch
@@ -0,0 +1,17 @@
+diff --git a/src/llama-chat.cpp b/src/llama-chat.cpp
+index dd27a381..47550954 100644
+--- a/src/llama-chat.cpp
++++ b/src/llama-chat.cpp
+@@ -6,11 +6,7 @@
+ #include <sstream>
+ #include <algorithm>
+ 
+-#if __cplusplus >= 202000L
+-    #define LU8(x) (const char*)(u8##x)
+-#else
+-    #define LU8(x) u8##x
+-#endif
++#define LU8(x) reinterpret_cast<const char*>(u8##x)
+ 
+ // trim whitespace from the beginning and end of a string
+ static std::string trim(const std::string & str) {
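
For context on this patch (my reading, not stated in the commit message): in C++20 the type of a u8"..." literal changed from const char[] to const char8_t[], so it no longer converts to const char* without a cast. llama.cpp guards the cast behind __cplusplus >= 202000L, which some compilers do not report even in C++20/23 mode (MSVC keeps __cplusplus at 199711L unless /Zc:__cplusplus is passed), so the patch applies the reinterpret_cast unconditionally; the cast is also harmless under C++17. A small illustration:

    // Illustration of the char8_t change that the LU8 macro works around.
    #include <cstdio>

    int main() {
      // const char* bad = u8"λ";                             // OK in C++17, ill-formed in C++20 (const char8_t*)
      const char* ok = reinterpret_cast<const char*>(u8"λ");  // valid in both C++17 and C++20
      std::puts(ok);
      return 0;
    }
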
diff --git a/utils/include/core/ProcessorMetrics.h 
b/utils/include/core/ProcessorMetrics.h
index 030d2ae34..5de882d8c 100644
--- a/utils/include/core/ProcessorMetrics.h
+++ b/utils/include/core/ProcessorMetrics.h
@@ -25,6 +25,7 @@
 
 #include "core/state/nodes/ResponseNode.h"
 #include "minifi-cpp/core/ProcessorMetrics.h"
+#include "core/state/Value.h"
 
 namespace org::apache::nifi::minifi::core {
 
diff --git a/utils/include/utils/ParsingUtils.h 
b/utils/include/utils/ParsingUtils.h
index 6657afa7a..7aa8d99ec 100644
--- a/utils/include/utils/ParsingUtils.h
+++ b/utils/include/utils/ParsingUtils.h
@@ -42,6 +42,7 @@ nonstd::expected<uint64_t, std::error_code> 
parseDataSize(std::string_view input
 
 nonstd::expected<uint32_t, std::error_code> 
parseUnixOctalPermissions(std::string_view input);
 
+nonstd::expected<float, std::error_code> parseFloat(std::string_view input);
 
 template<std::integral T>
 nonstd::expected<T, std::error_code> parseIntegralMinMax(const 
std::string_view input, const T minimum, const T maximum) {
@@ -91,4 +92,4 @@ nonstd::expected<T, std::error_code> parseEnum(const 
std::string_view input) {
   return *result;
 }
 
-}  // namespace org::apache::nifi::minifi::parsing
\ No newline at end of file
+}  // namespace org::apache::nifi::minifi::parsing
diff --git a/utils/src/utils/ParsingUtils.cpp b/utils/src/utils/ParsingUtils.cpp
index cf907d0e7..1e3fd67e7 100644
--- a/utils/src/utils/ParsingUtils.cpp
+++ b/utils/src/utils/ParsingUtils.cpp
@@ -126,4 +126,12 @@ nonstd::expected<uint32_t, std::error_code> 
parseUnixOctalPermissions(const std:
   return result;
 }
 
+nonstd::expected<float, std::error_code> parseFloat(std::string_view input) {
+  try {
+    return std::stof(std::string{input});
+  } catch (const std::exception&) {
+    return 
nonstd::make_unexpected(core::ParsingErrorCode::GeneralParsingError);
+  }
+}
+
 }  // namespace org::apache::nifi::minifi::parsing
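
A quick usage sketch for the new helper (the surrounding function is illustrative only): parseFloat returns nonstd::expected<float, std::error_code>, so callers branch on success instead of catching exceptions; note that the std::stof-based implementation also accepts inputs with trailing characters (e.g. "0.8abc" parses as 0.8f).

    #include "utils/ParsingUtils.h"

    // Hypothetical usage sketch of parsing::parseFloat().
    void example() {
      if (const auto value = org::apache::nifi::minifi::parsing::parseFloat("0.8")) {
        // *value == 0.8f; use the parsed float
      } else {
        // value.error() holds the std::error_code describing the parsing failure
      }
    }
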
