This is an automated email from the ASF dual-hosted git repository.
wanghailin pushed a commit to branch dev
in repository https://gitbox.apache.org/repos/asf/seatunnel.git
The following commit(s) were added to refs/heads/dev by this push:
new 6da7491efa [Feature][Transforms-V2] LLM transforms Support custom
field name (#7640)
6da7491efa is described below
commit 6da7491efa3959ba6a5aa0ccf6f4ea9c66e80d0b
Author: zhangdonghao <[email protected]>
AuthorDate: Sat Sep 14 13:52:00 2024 +0800
[Feature][Transforms-V2] LLM transforms Support custom field name (#7640)
---
docs/en/transform-v2/llm.md | 9 ++-
docs/zh/transform-v2/llm.md | 31 +++++----
.../apache/seatunnel/e2e/transform/TestLLMIT.java | 8 +++
.../llm_openai_transform_custom_output_name.conf | 76 ++++++++++++++++++++++
.../transform/nlpmodel/llm/LLMTransform.java | 10 ++-
.../transform/nlpmodel/llm/LLMTransformConfig.java | 6 ++
6 files changed, 124 insertions(+), 16 deletions(-)
diff --git a/docs/en/transform-v2/llm.md b/docs/en/transform-v2/llm.md
index 6d036064de..8ee5a36a9a 100644
--- a/docs/en/transform-v2/llm.md
+++ b/docs/en/transform-v2/llm.md
@@ -11,11 +11,12 @@ more.
## Options
| name | type | required | default value |
-| ---------------------- | ------ | -------- | ------------- |
+|------------------------| ------ | -------- |---------------|
| model_provider | enum | yes | |
| output_data_type | enum | no | String |
+| output_column_name | string | no | llm_output |
| prompt | string | yes | |
-| inference_columns | list | no | |
+| inference_columns | list | no | |
| model | string | yes | |
| api_key | string | yes | |
| api_path | string | no | |
@@ -35,6 +36,10 @@ The data type of the output data. The available options are:
STRING,INT,BIGINT,DOUBLE,BOOLEAN.
Default value is STRING.
+### output_column_name
+
+Custom output data field name. If the custom field name is the same as an
existing field name, an exception is thrown — the output column name must be unique.
+
### prompt
The prompt to send to the LLM. This parameter defines how LLM will process and
return data, eg:
diff --git a/docs/zh/transform-v2/llm.md b/docs/zh/transform-v2/llm.md
index 3ce53b78a6..c6f7aeefea 100644
--- a/docs/zh/transform-v2/llm.md
+++ b/docs/zh/transform-v2/llm.md
@@ -8,19 +8,20 @@
## 属性
-| 名称 | 类型 | 是否必须 | 默认值 |
-| ---------------------- | ------ | -------- | ------ |
-| model_provider | enum | yes | |
-| output_data_type | enum | no | String |
-| prompt | string | yes | |
-| inference_columns | list | no | |
-| model | string | yes | |
-| api_key | string | yes | |
-| api_path | string | no | |
-| custom_config | map | no | |
-| custom_response_parse | string | no | |
-| custom_request_headers | map | no | |
-| custom_request_body | map | no | |
+| 名称 | 类型 | 是否必须 | 默认值 |
+|------------------------| ------ | -------- |-------------|
+| model_provider | enum | yes | |
+| output_data_type | enum | no | String |
+| output_column_name | string | no | llm_output |
+| prompt | string | yes | |
+| inference_columns | list | no | |
+| model | string | yes | |
+| api_key | string | yes | |
+| api_path | string | no | |
+| custom_config | map | no | |
+| custom_response_parse | string | no | |
+| custom_request_headers | map | no | |
+| custom_request_body | map | no | |
### model_provider
@@ -33,6 +34,10 @@ OPENAI、DOUBAO、KIMIAI、CUSTOM
STRING,INT,BIGINT,DOUBLE,BOOLEAN.
默认值为 STRING。
+### output_column_name
+
+自定义输出数据字段名称。如果自定义字段名称与现有字段名称相同,将抛出异常——输出列名必须唯一。
+
### prompt
发送到 LLM 的提示。此参数定义 LLM 将如何处理和返回数据,例如:
diff --git
a/seatunnel-e2e/seatunnel-transforms-v2-e2e/seatunnel-transforms-v2-e2e-part-1/src/test/java/org/apache/seatunnel/e2e/transform/TestLLMIT.java
b/seatunnel-e2e/seatunnel-transforms-v2-e2e/seatunnel-transforms-v2-e2e-part-1/src/test/java/org/apache/seatunnel/e2e/transform/TestLLMIT.java
index b97d7182e1..d98a5e7e33 100644
---
a/seatunnel-e2e/seatunnel-transforms-v2-e2e/seatunnel-transforms-v2-e2e-part-1/src/test/java/org/apache/seatunnel/e2e/transform/TestLLMIT.java
+++
b/seatunnel-e2e/seatunnel-transforms-v2-e2e/seatunnel-transforms-v2-e2e-part-1/src/test/java/org/apache/seatunnel/e2e/transform/TestLLMIT.java
@@ -104,6 +104,14 @@ public class TestLLMIT extends TestSuiteBase implements
TestResource {
Assertions.assertEquals(0, execResult.getExitCode());
}
+ @TestTemplate
+ public void testLLMWithOpenAIOutputColumnName(TestContainer container)
+ throws IOException, InterruptedException {
+ Container.ExecResult execResult =
+
container.executeJob("/llm_openai_transform_custom_output_name.conf");
+ Assertions.assertEquals(0, execResult.getExitCode());
+ }
+
@TestTemplate
public void testLLMWithCustomModel(TestContainer container)
throws IOException, InterruptedException {
diff --git
a/seatunnel-e2e/seatunnel-transforms-v2-e2e/seatunnel-transforms-v2-e2e-part-1/src/test/resources/llm_openai_transform_custom_output_name.conf
b/seatunnel-e2e/seatunnel-transforms-v2-e2e/seatunnel-transforms-v2-e2e-part-1/src/test/resources/llm_openai_transform_custom_output_name.conf
new file mode 100644
index 0000000000..c3d17dc423
--- /dev/null
+++
b/seatunnel-e2e/seatunnel-transforms-v2-e2e/seatunnel-transforms-v2-e2e-part-1/src/test/resources/llm_openai_transform_custom_output_name.conf
@@ -0,0 +1,76 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+######
+###### This config file is a demonstration of batch processing in
SeaTunnel config
+######
+
+env {
+ job.mode = "BATCH"
+}
+
+source {
+ FakeSource {
+ row.num = 5
+ schema = {
+ fields {
+ id = "int"
+ name = "string"
+ }
+ }
+ rows = [
+ {fields = [1, "Jia Fan"], kind = INSERT}
+ {fields = [2, "Hailin Wang"], kind = INSERT}
+ {fields = [3, "Tomas"], kind = INSERT}
+ {fields = [4, "Eric"], kind = INSERT}
+ {fields = [5, "Guangdong Liu"], kind = INSERT}
+ ]
+ result_table_name = "fake"
+ }
+}
+
+transform {
+ LLM {
+ source_table_name = "fake"
+ model_provider = OPENAI
+ model = gpt-4o-mini
+ api_key = sk-xxx
+ output_column_name = "nationality"
+ prompt = "Determine whether someone is Chinese or American by their name"
+ openai.api_path = "http://mockserver:1080/v1/chat/completions"
+ result_table_name = "llm_output"
+ }
+}
+
+sink {
+ Assert {
+ source_table_name = "llm_output"
+ rules =
+ {
+ field_rules = [
+ {
+ field_name = "nationality"
+ field_type = string
+ field_value = [
+ {
+ rule_type = NOT_NULL
+ }
+ ]
+ }
+ ]
+ }
+ }
+}
diff --git
a/seatunnel-transforms-v2/src/main/java/org/apache/seatunnel/transform/nlpmodel/llm/LLMTransform.java
b/seatunnel-transforms-v2/src/main/java/org/apache/seatunnel/transform/nlpmodel/llm/LLMTransform.java
index a29fd677ca..08ae42e443 100644
---
a/seatunnel-transforms-v2/src/main/java/org/apache/seatunnel/transform/nlpmodel/llm/LLMTransform.java
+++
b/seatunnel-transforms-v2/src/main/java/org/apache/seatunnel/transform/nlpmodel/llm/LLMTransform.java
@@ -36,6 +36,7 @@ import
org.apache.seatunnel.transform.nlpmodel.llm.remote.openai.OpenAIModel;
import lombok.NonNull;
import lombok.SneakyThrows;
+import java.util.Arrays;
import java.util.Collections;
import java.util.List;
@@ -150,8 +151,15 @@ public class LLMTransform extends
SingleFieldOutputTransform {
@Override
protected Column getOutputColumn() {
+ String customFieldName =
config.get(LLMTransformConfig.OUTPUT_COLUMN_NAME);
+ String[] fieldNames =
inputCatalogTable.getTableSchema().getFieldNames();
+ boolean isExist = Arrays.asList(fieldNames).contains(customFieldName);
+ if (isExist) {
+ throw new IllegalArgumentException(
+ String.format("llm inference field name %s already
exists", customFieldName));
+ }
return PhysicalColumn.of(
- "llm_output", outputDataType, (Long) null, true, null, "Output
column of LLM");
+ customFieldName, outputDataType, (Long) null, true, null,
"Output column of LLM");
}
@SneakyThrows
diff --git
a/seatunnel-transforms-v2/src/main/java/org/apache/seatunnel/transform/nlpmodel/llm/LLMTransformConfig.java
b/seatunnel-transforms-v2/src/main/java/org/apache/seatunnel/transform/nlpmodel/llm/LLMTransformConfig.java
index c45bfb8f39..b26e4791ce 100644
---
a/seatunnel-transforms-v2/src/main/java/org/apache/seatunnel/transform/nlpmodel/llm/LLMTransformConfig.java
+++
b/seatunnel-transforms-v2/src/main/java/org/apache/seatunnel/transform/nlpmodel/llm/LLMTransformConfig.java
@@ -37,6 +37,12 @@ public class LLMTransformConfig extends ModelTransformConfig
{
.noDefaultValue()
.withDescription("The row projection field of each
inference");
+ public static final Option<String> OUTPUT_COLUMN_NAME =
+ Options.key("output_column_name")
+ .stringType()
+ .defaultValue("llm_output")
+ .withDescription("custom field name for the llm output
data");
+
public static final Option<Integer> INFERENCE_BATCH_SIZE =
Options.key("inference_batch_size")
.intType()