hi,YangZe,Leonard,
我增加了一个可以复现问题的测试类,可以执行一下看看。可以明显观察到:两个sink中,定义了PK的写入正常,没有定义PK的只有一条记录(文档id是索引名)。
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import
org.apache.flink.streaming.api.functions.source.RichParallelSourceFunction;
import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.StatementSet;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import org.apache.flink.types.Row;
import static org.apache.flink.table.api.Expressions.$;
public class ESNewJobTest {
// Build the StreamExecutionEnvironment shared by the whole test.
public static final StreamExecutionEnvironment env =
StreamExecutionEnvironment.getExecutionEnvironment();
// Build the EnvironmentSettings, selecting the Blink planner in streaming mode.
private static final EnvironmentSettings bsSettings =
EnvironmentSettings.newInstance().useBlinkPlanner().inStreamingMode().build();
// Build the StreamTableEnvironment from env and bsSettings.
// It is declared after both because static initializers run in textual order.
public static final StreamTableEnvironment tEnv =
StreamTableEnvironment.create(env, bsSettings);
//DDL语句
public static final String ES_SINK_DDL_NO_PK = "CREATE TABLE
es_sink_test_no_pk (\n" +
" idx integer,\n" +
" firstx varchar\n" +
") WITH (\n" +
"'connector' = 'elasticsearch-6',\n" +
"'hosts' = '168.61.113.171:9200',\n" +
"'index' = 'es_sink_test_no_pk',\n" +
"'document-type' = 'default',\n" +
"'document-id.key-delimiter' = '$',\n" +
"'sink.bulk-flush.interval' = '1000',\n" +
"'failure-handler' = 'fail',\n" +
"'format' = 'json'\n" +
")";
public static final String ES_SINK_DDL_WITH_PK = "CREATE TABLE
es_sink_test_with_pk (\n" +
" idx integer,\n" +
" firstx varchar,\n" +
" primary key (idx, firstx) not enforced\n" +
") WITH (\n" +
"'connector' = 'elasticsearch-6',\n" +
"'hosts' = '168.61.113.171:9200',\n" +
"'index' = 'es_sink_test_with_pk',\n" +
"'document-type' = 'default',\n" +
"'document-id.key-delimiter' = '$',\n" +
"'sink.bulk-flush.interval' = '1000',\n" +
"'failure-handler' = 'fail',\n" +
"'format' = 'json'\n" +
")";
public static String getCharAndNumr(int length) {
StringBuffer valSb = new StringBuffer();
for (int i = 0; i < length; i++) {
String charOrNum = Math.round(Math.random()) % 2 == 0 ? "char" :
"num"; // 输出字母还是数字
if ("char".equalsIgnoreCase(charOrNum)) {
// 字符串
int choice = Math.round(Math.random()) % 2 == 0 ? 65 : 97; //
取得大写字母还是小写字母
valSb.append((char) (choice + Math.round(Math.random()*25)));
} else if ("num".equalsIgnoreCase(charOrNum)) {
// 数字
valSb.append(String.valueOf(Math.round(Math.random()*9)));
}
}
return valSb.toString();
}
public static void main(String[] args) throws Exception {
DataStream<Row> ds = env.addSource(new
RichParallelSourceFunction<Row>() {
volatile boolean flag = true;
@Override
public void run(SourceContext<Row> ctx) throws Exception {
while (flag) {
Row row = new Row(2);
row.setField(0, 2207);
row.setField(1, getCharAndNumr(4));
ctx.collect(row);
Thread.sleep(1000);
}
}
@Override
public void cancel() {
flag = false;
}
}).setParallelism(1).returns(Types.ROW(Types.INT, Types.STRING));
//ES sink测试ddl
tEnv.executeSql(ES_SINK_DDL_NO_PK);
tEnv.executeSql(ES_SINK_DDL_WITH_PK);
//source注册成表
tEnv.createTemporaryView("test", ds, $("f0").as("idx"),
$("f1").as("firstx"), $("p").proctime());
//sink写入
StatementSet ss = tEnv.createStatementSet();
ss.addInsertSql("insert into es_sink_test_no_pk select idx, firstx from
test");
ss.addInsertSql("insert into es_sink_test_with_pk select idx, firstx
from test");
ss.execute();
}
}
在 2020-07-13 14:03:21,"Yangze Guo" <[email protected]> 写道:
>INSERT走的就是processUpsert这个方法,当不指定PK时,生成的key会是null,然后创建一个IndexRequest。
>
>Best,
>Yangze Guo
>
>On Mon, Jul 13, 2020 at 2:00 PM sunfulin <[email protected]> wrote:
>>
>>
>> hi, Leonard
>> 我定义了一个ddl和一个dml,sql如下。ddl中没有定义PK,我观察到的现象是:这样在sink到es结果中,结果生成的id是index名,导致只有一条记录。
>> 我将DDL更换为之前版本的with参数(声明使用update-mode =
>> ‘upsert’),不使用1.11最新的with参数,观察到sink结果就正常了。不确定是不是我哪边配置的方式不太对,还是说使用方式有问题。
>>
>> @Yangze Guo
>> 我看了下给的源代码,貌似这个是处理upsert的情况,如果不声明pk的话,是不是会是processInsert?
>>
>> CREATE TABLE ES6_SENSORDATA_SERVER_API (
>> event varchar,
>> user_id varchar,
>> distinct_id varchar,
>> _date varchar,
>> _event_time varchar,
>> recv_time varchar,
>> code varchar,
>> _current_project varchar,
>> api varchar,
>> elapsed int ,
>> `start` bigint,
>> is_err int
>> ) WITH (
>> 'connector' = 'elasticsearch-6',
>> 'hosts' = '<ES_YUNTU.SERVERS>',
>> 'index' = 'flink_sensordata_server_api',
>> 'document-type' = 'default',
>> 'document-id.key-delimiter' = '$',
>> 'sink.bulk-flush.interval' = '1000',
>> 'failure-handler' = 'fail',
>> 'format' = 'json'
>> )
>>
>>
>>
>> INSERT INTO ES6_SENSORDATA_SERVER_API
>>
>> SELECT event,
>>
>> user_id,
>>
>> distinct_id,
>>
>> ts2Date(`time`, 'yyyy-MM-dd') as _date,
>>
>> ts2Date(`time`, 'yyyy-MM-dd HH:mm:ss.SSS') as _event_time,
>>
>> ts2Date(recv_time, false, false) as recv_time,
>>
>> properties.code as code,
>>
>> properties.`project` as _current_project,
>>
>> properties.api as api,
>>
>> properties.elapsed as elapsed,
>>
>> properties.`start` as `start`,
>>
>> case when properties.code = '0' then 0 else 1 end as is_err
>>
>> FROM KafkaEventTopic
>>
>> where `type` in ('track') and event in ('serverApiReqEvt')
>>
>>
>> 在 2020-07-13 13:44:29,"Leonard Xu" <[email protected]> 写道:
>> >Hello, fulin
>> >
>> >这个问题能提供段可以复现的代码吗?
>> >
>> >祝好,
>> >Leonard Xu
>> >
>> >
>> >> 在 2020年7月13日,09:50,Yangze Guo <[email protected]> 写道:
>> >>
>> >> Hi,
>> >>
>> >> 如果没有定义主键,ES connector 会把 _id设为null[1],这样ES的Java Client会将_id设为一个随机值[2].
>> >> 所以应该不会出现您说的这种情况。您那里的ES有没有请求日志之类的,看一下Flink发过来的请求是什么样的。
>> >>
>> >> [1]
>> >> https://github.com/apache/flink/blob/f0eeaec530e001ab02cb889dfe217e25913660c4/flink-connectors/flink-connector-elasticsearch-base/src/main/java/org/apache/flink/streaming/connectors/elasticsearch/table/RowElasticsearchSinkFunction.java#L102
>> >> [2]
>> >> https://github.com/elastic/elasticsearch/blob/977230a0ce89a55515dc6ef6452e9f059d9356a2/core/src/main/java/org/elasticsearch/action/index/IndexRequest.java#L509
>> >>
>> >> Best,
>> >> Yangze Guo
>> >>
>> >> On Sat, Jul 11, 2020 at 11:33 PM sunfulin <[email protected]> wrote:
>> >>>
>> >>> hi,
>> >>> 根据文档[1]的描述,1.11的es sql connector如果在ddl里没有声明primary
>> >>> key,将会使用append模式sink数据,并使用es本身生成的id作为document_id。但是我在测试时发现,如果我的ddl里没有定义primary
>> >>> key,写入时没有正确生成document_id,反而是将index作为id生成了。导致只有最新的一条记录。下面是我的ddl定义:
>> >>> 不确定是我配置使用的方式不对,还是确实存在bug。。
>> >>>
>> >>>
>> >>> CREATE TABLE ES6_SENSORDATA_OUTPUT (
>> >>> event varchar,
>> >>> user_id varchar,
>> >>> distinct_id varchar,
>> >>> _date varchar,
>> >>> _event_time varchar,
>> >>> recv_time varchar,
>> >>> _browser_version varchar,
>> >>> path_name varchar,
>> >>> _search varchar,
>> >>> event_type varchar,
>> >>> _current_project varchar,
>> >>> message varchar,
>> >>> stack varchar,
>> >>> component_stack varchar,
>> >>> _screen_width varchar,
>> >>> _screen_height varchar
>> >>> ) WITH (
>> >>> 'connector' = 'elasticsearch-6',
>> >>> 'hosts' = '<ES_YUNTU.SERVERS>',
>> >>> 'index' = 'flink_sensordata_target_event',
>> >>> 'document-type' = 'default',
>> >>> 'document-id.key-delimiter' = '$',
>> >>> 'sink.bulk-flush.interval' = '1000',
>> >>> 'failure-handler' = 'fail',
>> >>> 'format' = 'json'
>> >>> )
>> >>>
>> >>>
>> >>>
>> >>>
>> >>> [1]https://ci.apache.org/projects/flink/flink-docs-release-1.11/dev/table/connectors/elasticsearch.html#key-handling
>>
>>
>>
>>