This is an automated email from the ASF dual-hosted git repository.
wanghailin pushed a commit to branch dev
in repository https://gitbox.apache.org/repos/asf/seatunnel.git
The following commit(s) were added to refs/heads/dev by this push:
new 6da7491efa [Feature][Transforms-V2] LLM transforms Support custom
field name (#7640)
6da7491efa is described below
commit 6da7491efa3959ba6a5aa0ccf6f4ea9c66e80d0b
Author: zhangdonghao <[email protected]>
AuthorDate: Sat Sep 14 13:52:00 2024 +0800
[Feature][Transforms-V2] LLM transforms Support custom field name (#7640)
---
docs/en/transform-v2/llm.md | 9 ++-
docs/zh/transform-v2/llm.md | 31 +++++----
.../apache/seatunnel/e2e/transform/TestLLMIT.java | 8 +++
.../llm_openai_transform_custom_output_name.conf | 76 ++++++++++++++++++++++
.../transform/nlpmodel/llm/LLMTransform.java | 10 ++-
.../transform/nlpmodel/llm/LLMTransformConfig.java | 6 ++
6 files changed, 124 insertions(+), 16 deletions(-)
diff --git a/docs/en/transform-v2/llm.md b/docs/en/transform-v2/llm.md
index 6d036064de..8ee5a36a9a 100644
--- a/docs/en/transform-v2/llm.md
+++ b/docs/en/transform-v2/llm.md
@@ -11,11 +11,12 @@ more.
## Options
| name | type | required | default value |
-| ---------------------- | ------ | -------- | ------------- |
+|------------------------| ------ | -------- |---------------|
| model_provider | enum | yes | |
| output_data_type | enum | no | String |
+| output_column_name | string | no | llm_output |
| prompt | string | yes | |
-| inference_columns | list | no | |
+| inference_columns | list | no | |
| model | string | yes | |
| api_key | string | yes | |
| api_path | string | no | |
@@ -35,6 +36,10 @@ The data type of the output data. The available options are:
STRING,INT,BIGINT,DOUBLE,BOOLEAN.
Default value is STRING.
+### output_column_name
+
+Custom output data field name. If the custom field name is the same as an
existing field name, an exception is thrown — the output column name must be unique.
+
### prompt
The prompt to send to the LLM. This parameter defines how LLM will process and
return data, eg:
diff --git a/docs/zh/transform-v2/llm.md b/docs/zh/transform-v2/llm.md
index 3ce53b78a6..c6f7aeefea 100644
--- a/docs/zh/transform-v2/llm.md
+++ b/docs/zh/transform-v2/llm.md
@@ -8,19 +8,20 @@
## 属性
-| 名称 | 类型 | 是否必须 | 默认值 |
-| ---------------------- | ------ | -------- | ------ |
-| model_provider | enum | yes | |
-| output_data_type | enum | no | String |
-| prompt | string | yes | |
-| inference_columns | list | no | |
-| model | string | yes | |
-| api_key | string | yes | |
-| api_path | string | no | |
-| custom_config | map | no | |
-| custom_response_parse | string | no | |
-| custom_request_headers | map | no | |
-| custom_request_body | map | no | |
+| 名称 | 类型 | 是否必须 | 默认值 |
+|------------------------| ------ | -------- |-------------|
+| model_provider | enum | yes | |
+| output_data_type | enum | no | String |
+| output_column_name | string | no | llm_output |
+| prompt | string | yes | |
+| inference_columns | list | no | |
+| model | string | yes | |
+| api_key | string | yes | |
+| api_path | string | no | |
+| custom_config | map | no | |
+| custom_response_parse | string | no | |
+| custom_request_headers | map | no | |
+| custom_request_body | map | no | |
### model_provider
@@ -33,6 +34,10 @@ OPENAI、DOUBAO、KIMIAI、CUSTOM
STRING,INT,BIGINT,DOUBLE,BOOLEAN.
默认值为 STRING。
+### output_column_name
+
+自定义输出数据字段名称。如果自定义字段名称与现有字段名称相同,将抛出异常——输出列名必须唯一。
+
### prompt
发送到 LLM 的提示。此参数定义 LLM 将如何处理和返回数据,例如:
diff --git
a/seatunnel-e2e/seatunnel-transforms-v2-e2e/seatunnel-transforms-v2-e2e-part-1/src/test/java/org/apache/seatunnel/e2e/transform/TestLLMIT.java
b/seatunnel-e2e/seatunnel-transforms-v2-e2e/seatunnel-transforms-v2-e2e-part-1/src/test/java/org/apache/seatunnel/e2e/transform/TestLLMIT.java
index b97d7182e1..d98a5e7e33 100644
---
a/seatunnel-e2e/seatunnel-transforms-v2-e2e/seatunnel-transforms-v2-e2e-part-1/src/test/java/org/apache/seatunnel/e2e/transform/TestLLMIT.java
+++
b/seatunnel-e2e/seatunnel-transforms-v2-e2e/seatunnel-transforms-v2-e2e-part-1/src/test/java/org/apache/seatunnel/e2e/transform/TestLLMIT.java
@@ -104,6 +104,14 @@ public class TestLLMIT extends TestSuiteBase implements
TestResource {
Assertions.assertEquals(0, execResult.getExitCode());
}
+ @TestTemplate
+ public void testLLMWithOpenAIOutputColumnName(TestContainer container)
+ throws IOException, InterruptedException {
+ Container.ExecResult execResult =
+
container.executeJob("/llm_openai_transform_custom_output_name.conf");
+ Assertions.assertEquals(0, execResult.getExitCode());
+ }
+
@TestTemplate
public void testLLMWithCustomModel(TestContainer container)
throws IOException, InterruptedException {
diff --git
a/seatunnel-e2e/seatunnel-transforms-v2-e2e/seatunnel-transforms-v2-e2e-part-1/src/test/resources/llm_openai_transform_custom_output_name.conf
b/seatunnel-e2e/seatunnel-transforms-v2-e2e/seatunnel-transforms-v2-e2e-part-1/src/test/resources/llm_openai_transform_custom_output_name.conf
new file mode 100644
index 0000000000..c3d17dc423
--- /dev/null
+++
b/seatunnel-e2e/seatunnel-transforms-v2-e2e/seatunnel-transforms-v2-e2e-part-1/src/test/resources/llm_openai_transform_custom_output_name.conf
@@ -0,0 +1,76 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+######
+###### This config file is a demonstration of batch processing in
SeaTunnel config
+######
+
+env {
+ job.mode = "BATCH"
+}
+
+source {
+ FakeSource {
+ row.num = 5
+ schema = {
+ fields {
+ id = "int"
+ name = "string"
+ }
+ }
+ rows = [
+ {fields = [1, "Jia Fan"], kind = INSERT}
+ {fields = [2, "Hailin Wang"], kind = INSERT}
+ {fields = [3, "Tomas"], kind = INSERT}
+ {fields = [4, "Eric"], kind = INSERT}
+ {fields = [5, "Guangdong Liu"], kind = INSERT}
+ ]
+ result_table_name = "fake"
+ }
+}
+
+transform {
+ LLM {
+ source_table_name = "fake"
+ model_provider = OPENAI
+ model = gpt-4o-mini
+ api_key = sk-xxx
+ output_column_name = "nationality"
+ prompt = "Determine whether someone is Chinese or American by their name"
+ openai.api_path = "http://mockserver:1080/v1/chat/completions"
+ result_table_name = "llm_output"
+ }
+}
+
+sink {
+ Assert {
+ source_table_name = "llm_output"
+ rules =
+ {
+ field_rules = [
+ {
+ field_name = "nationality"
+ field_type = string
+ field_value = [
+ {
+ rule_type = NOT_NULL
+ }
+ ]
+ }
+ ]
+ }
+ }
+}
diff --git
a/seatunnel-transforms-v2/src/main/java/org/apache/seatunnel/transform/nlpmodel/llm/LLMTransform.java
b/seatunnel-transforms-v2/src/main/java/org/apache/seatunnel/transform/nlpmodel/llm/LLMTransform.java
index a29fd677ca..08ae42e443 100644
---
a/seatunnel-transforms-v2/src/main/java/org/apache/seatunnel/transform/nlpmodel/llm/LLMTransform.java
+++
b/seatunnel-transforms-v2/src/main/java/org/apache/seatunnel/transform/nlpmodel/llm/LLMTransform.java
@@ -36,6 +36,7 @@ import
org.apache.seatunnel.transform.nlpmodel.llm.remote.openai.OpenAIModel;
import lombok.NonNull;
import lombok.SneakyThrows;
+import java.util.Arrays;
import java.util.Collections;
import java.util.List;
@@ -150,8 +151,15 @@ public class LLMTransform extends
SingleFieldOutputTransform {
@Override
protected Column getOutputColumn() {
+ String customFieldName =
config.get(LLMTransformConfig.OUTPUT_COLUMN_NAME);
+ String[] fieldNames =
inputCatalogTable.getTableSchema().getFieldNames();
+ boolean isExist = Arrays.asList(fieldNames).contains(customFieldName);
+ if (isExist) {
+ throw new IllegalArgumentException(
+ String.format("llm inference field name %s already
exists", customFieldName));
+ }
return PhysicalColumn.of(
- "llm_output", outputDataType, (Long) null, true, null, "Output
column of LLM");
+ customFieldName, outputDataType, (Long) null, true, null,
"Output column of LLM");
}
@SneakyThrows
diff --git
a/seatunnel-transforms-v2/src/main/java/org/apache/seatunnel/transform/nlpmodel/llm/LLMTransformConfig.java
b/seatunnel-transforms-v2/src/main/java/org/apache/seatunnel/transform/nlpmodel/llm/LLMTransformConfig.java
index c45bfb8f39..b26e4791ce 100644
---
a/seatunnel-transforms-v2/src/main/java/org/apache/seatunnel/transform/nlpmodel/llm/LLMTransformConfig.java
+++
b/seatunnel-transforms-v2/src/main/java/org/apache/seatunnel/transform/nlpmodel/llm/LLMTransformConfig.java
@@ -37,6 +37,12 @@ public class LLMTransformConfig extends ModelTransformConfig
{
.noDefaultValue()
.withDescription("The row projection field of each
inference");
+ public static final Option<String> OUTPUT_COLUMN_NAME =
+ Options.key("output_column_name")
+ .stringType()
+ .defaultValue("llm_output")
+ .withDescription("custom field name for the llm output
data");
+
public static final Option<Integer> INFERENCE_BATCH_SIZE =
Options.key("inference_batch_size")
.intType()