This is an automated email from the ASF dual-hosted git repository.
kirs pushed a commit to branch dev
in repository https://gitbox.apache.org/repos/asf/incubator-seatunnel.git
The following commit(s) were added to refs/heads/dev by this push:
new 712b77744 [Feature][Connector-V2] Add oss source connector (#2467)
712b77744 is described below
commit 712b77744e04b53b0603295453e14c680235bb60
Author: TyrantLucifer <[email protected]>
AuthorDate: Tue Aug 23 19:43:35 2022 +0800
[Feature][Connector-V2] Add oss source connector (#2467)
* [Feature][Connector-V2] Update oss source connector doc
---
docs/en/connector-v2/source/OssFile.md | 128 +++++++++++++++++++++
plugin-mapping.properties | 1 +
pom.xml | 7 ++
seatunnel-connectors-v2-dist/pom.xml | 5 +
.../seatunnel/file/config/FileSystemType.java | 1 +
.../seatunnel/file/config/HadoopConf.java | 16 ++-
.../file/source/reader/AbstractReadStrategy.java | 1 +
.../{ => connector-file-oss}/pom.xml | 35 ++++--
.../seatunnel/file/oss/source/OssFileSource.java | 85 ++++++++++++++
.../seatunnel/file/oss/source/config/OssConf.java} | 21 ++--
.../file/oss/source/config/OssSourceConfig.java} | 21 ++--
.../services/org.apache.hadoop.fs.FileSystem | 16 +++
seatunnel-connectors-v2/connector-file/pom.xml | 1 +
13 files changed, 299 insertions(+), 39 deletions(-)
diff --git a/docs/en/connector-v2/source/OssFile.md
b/docs/en/connector-v2/source/OssFile.md
new file mode 100644
index 000000000..e81914f54
--- /dev/null
+++ b/docs/en/connector-v2/source/OssFile.md
@@ -0,0 +1,128 @@
+# OssFile
+
+> Oss file source connector
+
+## Description
+
+Read data from aliyun oss file system.
+
+> Tips: We made some trade-offs in order to support more file types, so we
used the HDFS protocol for internal access to OSS, and this connector needs some
Hadoop dependencies.
+> It only supports Hadoop version **2.9.X+**.
+
+## Options
+
+| name | type | required | default value |
+|--------------|--------|----------|---------------|
+| path | string | yes | - |
+| type | string | yes | - |
+| bucket | string | yes | - |
+| accessKey | string | yes | - |
+| accessSecret | string | yes | - |
+| endpoint | string | yes | - |
+| schema | config | no | - |
+
+### path [string]
+
+The source file path.
+
+### type [string]
+
+File type. The following file types are supported:
+
+`text` `csv` `parquet` `orc` `json`
+
+If you set the file type to `json`, you should also set the schema option to
tell the connector how to parse the data into the rows you want.
+
+For example:
+
+upstream data is the following:
+
+```json
+
+{"code": 200, "data": "get success", "success": true}
+
+```
+
+you should assign schema as the following:
+
+```hocon
+
+schema {
+ fields {
+ code = int
+ data = string
+ success = boolean
+ }
+}
+
+```
+
+connector will generate data as the following:
+
+| code | data | success |
+|------|-------------|---------|
+| 200 | get success | true |
+
+If you set the file type to `parquet` or `orc`, the schema option is not required;
the connector can find the schema of the upstream data automatically.
+
+If you set the file type to `text` or `csv`, the schema option is not supported
for now, but support is planned for a later release.
+
+For `text` and `csv` files, the connector currently treats the upstream data as follows:
+
+| lines |
+|-----------------------------------|
+| The content of every line in file |
+
+### bucket [string]
+
+The bucket address of oss file system, for example:
`oss://tyrantlucifer-image-bed`
+
+### accessKey [string]
+
+The access key of oss file system.
+
+### accessSecret [string]
+
+The access secret of oss file system.
+
+### endpoint [string]
+
+The endpoint of oss file system.
+
+### schema [config]
+
+The schema of upstream data.
+
+## Example
+
+```hocon
+
+ OssFile {
+ path = "/seatunnel/orc"
+ bucket = "oss://tyrantlucifer-image-bed"
+ accessKey = "xxxxxxxxxxxxxxxxx"
+ accessSecret = "xxxxxxxxxxxxxxxxxxxxxx"
+ endpoint = "oss-cn-beijing.aliyuncs.com"
+ type = "orc"
+ }
+
+```
+
+```hocon
+
+ OssFile {
+ path = "/seatunnel/json"
+ bucket = "oss://tyrantlucifer-image-bed"
+ accessKey = "xxxxxxxxxxxxxxxxx"
+ accessSecret = "xxxxxxxxxxxxxxxxxxxxxx"
+ endpoint = "oss-cn-beijing.aliyuncs.com"
+ type = "json"
+ schema {
+ fields {
+ id = int
+ name = string
+ }
+ }
+ }
+
+```
\ No newline at end of file
diff --git a/plugin-mapping.properties b/plugin-mapping.properties
index 64b008491..0efd44f1f 100644
--- a/plugin-mapping.properties
+++ b/plugin-mapping.properties
@@ -111,6 +111,7 @@ seatunnel.source.HdfsFile = connector-file-hadoop
seatunnel.sink.HdfsFile = connector-file-hadoop
seatunnel.source.LocalFile = connector-file-local
seatunnel.sink.LocalFile = connector-file-local
+seatunnel.source.OssFile = connector-file-oss
seatunnel.source.Pulsar = connector-pulsar
seatunnel.source.Hudi = connector-hudi
seatunnel.sink.DingTalk = connector-dingtalk
diff --git a/pom.xml b/pom.xml
index 6ee15ec06..1923e511b 100644
--- a/pom.xml
+++ b/pom.xml
@@ -177,6 +177,7 @@
<maven-assembly-plugin.version>3.3.0</maven-assembly-plugin.version>
<spark.scope>provided</spark.scope>
<flink.scope>provided</flink.scope>
+ <hadoop-aliyun.version>2.9.2</hadoop-aliyun.version>
<codec.version>1.13</codec.version>
<httpclient.version>4.5.13</httpclient.version>
<httpcore.version>4.4.4</httpcore.version>
@@ -536,6 +537,12 @@
<version>${flink.version}</version>
</dependency>
+ <dependency>
+ <groupId>org.apache.hadoop</groupId>
+ <artifactId>hadoop-aliyun</artifactId>
+ <version>${hadoop-aliyun.version}</version>
+ </dependency>
+
<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-hadoop-mr-bundle</artifactId>
diff --git a/seatunnel-connectors-v2-dist/pom.xml
b/seatunnel-connectors-v2-dist/pom.xml
index 9791af172..0b90e13cd 100644
--- a/seatunnel-connectors-v2-dist/pom.xml
+++ b/seatunnel-connectors-v2-dist/pom.xml
@@ -101,6 +101,11 @@
<artifactId>connector-file-local</artifactId>
<version>${project.version}</version>
</dependency>
+ <dependency>
+ <groupId>org.apache.seatunnel</groupId>
+ <artifactId>connector-file-oss</artifactId>
+ <version>${project.version}</version>
+ </dependency>
<dependency>
<groupId>org.apache.seatunnel</groupId>
<artifactId>connector-file-ftp</artifactId>
diff --git
a/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/config/FileSystemType.java
b/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/config/FileSystemType.java
index 3156c9796..f1d271f36 100644
---
a/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/config/FileSystemType.java
+++
b/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/config/FileSystemType.java
@@ -22,6 +22,7 @@ import java.io.Serializable;
public enum FileSystemType implements Serializable {
HDFS("HdfsFile"),
LOCAL("LocalFile"),
+ OSS("OssFile"),
FTP("FtpFile");
private final String fileSystemPluginName;
diff --git
a/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/config/HadoopConf.java
b/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/config/HadoopConf.java
index f3bbedf08..6da660421 100644
---
a/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/config/HadoopConf.java
+++
b/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/config/HadoopConf.java
@@ -18,17 +18,25 @@
package org.apache.seatunnel.connectors.seatunnel.file.config;
import lombok.Data;
+import org.apache.hadoop.conf.Configuration;
import java.io.Serializable;
+import java.util.HashMap;
+import java.util.Map;
@Data
public class HadoopConf implements Serializable {
-
- private String hdfsNameKey;
-
- private String fsHdfsImpl = "org.apache.hadoop.hdfs.DistributedFileSystem";
+ protected Map<String, String> extraOptions = new HashMap<>();
+ protected String hdfsNameKey;
+ protected String fsHdfsImpl =
"org.apache.hadoop.hdfs.DistributedFileSystem";
public HadoopConf(String hdfsNameKey) {
this.hdfsNameKey = hdfsNameKey;
}
+
+    /**
+     * Copies every entry of {@code extraOptions} into the given Hadoop
+     * configuration. Does nothing when there are no extra options.
+     *
+     * @param configuration the configuration that receives the extra options
+     */
+    public void setExtraOptionsForConfiguration(Configuration configuration) {
+        // The Lombok-generated setter lets callers replace extraOptions with null.
+        // forEach over an empty map is already a no-op, so only null needs a guard.
+        if (extraOptions != null) {
+            extraOptions.forEach(configuration::set);
+        }
+    }
}
diff --git
a/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/source/reader/AbstractReadStrategy.java
b/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/source/reader/AbstractReadStrategy.java
index 215c4667c..789ecb7cb 100644
---
a/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/source/reader/AbstractReadStrategy.java
+++
b/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/source/reader/AbstractReadStrategy.java
@@ -53,6 +53,7 @@ public abstract class AbstractReadStrategy implements
ReadStrategy {
if (hadoopConf != null) {
configuration.set(CommonConfigurationKeys.FS_DEFAULT_NAME_KEY,
hadoopConf.getHdfsNameKey());
configuration.set("fs.hdfs.impl", hadoopConf.getFsHdfsImpl());
+ hadoopConf.setExtraOptionsForConfiguration(configuration);
}
return configuration;
}
diff --git a/seatunnel-connectors-v2/connector-file/pom.xml
b/seatunnel-connectors-v2/connector-file/connector-file-oss/pom.xml
similarity index 64%
copy from seatunnel-connectors-v2/connector-file/pom.xml
copy to seatunnel-connectors-v2/connector-file/connector-file-oss/pom.xml
index 951ccdc33..f3fb964d1 100644
--- a/seatunnel-connectors-v2/connector-file/pom.xml
+++ b/seatunnel-connectors-v2/connector-file/connector-file-oss/pom.xml
@@ -21,18 +21,33 @@
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0
http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
- <artifactId>seatunnel-connectors-v2</artifactId>
+ <artifactId>connector-file</artifactId>
<groupId>org.apache.seatunnel</groupId>
<version>${revision}</version>
</parent>
<modelVersion>4.0.0</modelVersion>
- <artifactId>connector-file</artifactId>
- <packaging>pom</packaging>
-
- <modules>
- <module>connector-file-base</module>
- <module>connector-file-hadoop</module>
- <module>connector-file-local</module>
- <module>connector-file-ftp</module>
- </modules>
+
+ <artifactId>connector-file-oss</artifactId>
+
+ <dependencies>
+
+ <dependency>
+ <groupId>org.apache.seatunnel</groupId>
+ <artifactId>connector-file-base</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.flink</groupId>
+ <artifactId>flink-shaded-hadoop-2</artifactId>
+ <scope>provided</scope>
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.hadoop</groupId>
+ <artifactId>hadoop-aliyun</artifactId>
+ </dependency>
+
+ </dependencies>
+
</project>
\ No newline at end of file
diff --git
a/seatunnel-connectors-v2/connector-file/connector-file-oss/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/oss/source/OssFileSource.java
b/seatunnel-connectors-v2/connector-file/connector-file-oss/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/oss/source/OssFileSource.java
new file mode 100644
index 000000000..b12efa07d
--- /dev/null
+++
b/seatunnel-connectors-v2/connector-file/connector-file-oss/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/oss/source/OssFileSource.java
@@ -0,0 +1,85 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.seatunnel.connectors.seatunnel.file.oss.source;
+
+import org.apache.seatunnel.api.common.PrepareFailException;
+import org.apache.seatunnel.api.source.SeaTunnelSource;
+import org.apache.seatunnel.common.config.CheckConfigUtil;
+import org.apache.seatunnel.common.config.CheckResult;
+import org.apache.seatunnel.common.constants.PluginType;
+import org.apache.seatunnel.connectors.seatunnel.common.schema.SeatunnelSchema;
+import org.apache.seatunnel.connectors.seatunnel.file.config.FileSystemType;
+import
org.apache.seatunnel.connectors.seatunnel.file.exception.FilePluginException;
+import
org.apache.seatunnel.connectors.seatunnel.file.oss.source.config.OssConf;
+import
org.apache.seatunnel.connectors.seatunnel.file.oss.source.config.OssSourceConfig;
+import org.apache.seatunnel.connectors.seatunnel.file.source.BaseFileSource;
+import
org.apache.seatunnel.connectors.seatunnel.file.source.reader.ReadStrategyFactory;
+
+import org.apache.seatunnel.shade.com.typesafe.config.Config;
+
+import com.google.auto.service.AutoService;
+import org.apache.hadoop.fs.aliyun.oss.Constants;
+
+import java.io.IOException;
+import java.util.HashMap;
+
+@AutoService(SeaTunnelSource.class)
+public class OssFileSource extends BaseFileSource {
+ @Override
+ public String getPluginName() {
+ return FileSystemType.OSS.getFileSystemPluginName();
+ }
+
+ @Override
+ public void prepare(Config pluginConfig) throws PrepareFailException {
+ CheckResult result = CheckConfigUtil.checkAllExists(pluginConfig,
+ OssSourceConfig.FILE_PATH, OssSourceConfig.FILE_TYPE,
+ OssSourceConfig.BUCKET, OssSourceConfig.ACCESS_KEY,
+ OssSourceConfig.ACCESS_SECRET, OssSourceConfig.BUCKET);
+ if (!result.isSuccess()) {
+ throw new PrepareFailException(getPluginName(), PluginType.SOURCE,
result.getMsg());
+ }
+ readStrategy =
ReadStrategyFactory.of(pluginConfig.getString(OssSourceConfig.FILE_TYPE));
+ String path = pluginConfig.getString(OssSourceConfig.FILE_PATH);
+ hadoopConf = new
OssConf(pluginConfig.getString(OssSourceConfig.BUCKET));
+ HashMap<String, String> ossOptions = new HashMap<>();
+ ossOptions.put(Constants.ACCESS_KEY_ID,
pluginConfig.getString(OssSourceConfig.ACCESS_KEY));
+ ossOptions.put(Constants.ACCESS_KEY_SECRET,
pluginConfig.getString(OssSourceConfig.ACCESS_SECRET));
+ ossOptions.put(Constants.ENDPOINT_KEY,
pluginConfig.getString(OssSourceConfig.ENDPOINT));
+ hadoopConf.setExtraOptions(ossOptions);
+ try {
+ filePaths = readStrategy.getFileNamesByPath(hadoopConf, path);
+ } catch (IOException e) {
+ throw new PrepareFailException(getPluginName(), PluginType.SOURCE,
"Check file path fail.");
+ }
+ // support user-defined schema
+ if (pluginConfig.hasPath(OssSourceConfig.SCHEMA)) {
+ Config schemaConfig =
pluginConfig.getConfig(OssSourceConfig.SCHEMA);
+ rowType = SeatunnelSchema
+ .buildWithConfig(schemaConfig)
+ .getSeaTunnelRowType();
+ readStrategy.setSeaTunnelRowTypeInfo(rowType);
+ } else {
+ try {
+ rowType = readStrategy.getSeaTunnelRowTypeInfo(hadoopConf,
filePaths.get(0));
+ } catch (FilePluginException e) {
+ throw new PrepareFailException(getPluginName(),
PluginType.SOURCE, "Read file schema error.", e);
+ }
+ }
+ }
+}
diff --git
a/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/config/HadoopConf.java
b/seatunnel-connectors-v2/connector-file/connector-file-oss/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/oss/source/config/OssConf.java
similarity index 65%
copy from
seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/config/HadoopConf.java
copy to
seatunnel-connectors-v2/connector-file/connector-file-oss/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/oss/source/config/OssConf.java
index f3bbedf08..d197ed17f 100644
---
a/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/config/HadoopConf.java
+++
b/seatunnel-connectors-v2/connector-file/connector-file-oss/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/oss/source/config/OssConf.java
@@ -15,20 +15,19 @@
* limitations under the License.
*/
-package org.apache.seatunnel.connectors.seatunnel.file.config;
+package org.apache.seatunnel.connectors.seatunnel.file.oss.source.config;
-import lombok.Data;
+import org.apache.seatunnel.connectors.seatunnel.file.config.HadoopConf;
-import java.io.Serializable;
+public class OssConf extends HadoopConf {
+ private final String fsHdfsImpl =
"org.apache.hadoop.fs.aliyun.oss.AliyunOSSFileSystem";
-@Data
-public class HadoopConf implements Serializable {
-
- private String hdfsNameKey;
-
- private String fsHdfsImpl = "org.apache.hadoop.hdfs.DistributedFileSystem";
+ @Override
+ public String getFsHdfsImpl() {
+ return fsHdfsImpl;
+ }
- public HadoopConf(String hdfsNameKey) {
- this.hdfsNameKey = hdfsNameKey;
+ public OssConf(String hdfsNameKey) {
+ super(hdfsNameKey);
}
}
diff --git
a/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/config/HadoopConf.java
b/seatunnel-connectors-v2/connector-file/connector-file-oss/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/oss/source/config/OssSourceConfig.java
similarity index 65%
copy from
seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/config/HadoopConf.java
copy to
seatunnel-connectors-v2/connector-file/connector-file-oss/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/oss/source/config/OssSourceConfig.java
index f3bbedf08..fa6728757 100644
---
a/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/config/HadoopConf.java
+++
b/seatunnel-connectors-v2/connector-file/connector-file-oss/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/oss/source/config/OssSourceConfig.java
@@ -15,20 +15,13 @@
* limitations under the License.
*/
-package org.apache.seatunnel.connectors.seatunnel.file.config;
+package org.apache.seatunnel.connectors.seatunnel.file.oss.source.config;
-import lombok.Data;
+import org.apache.seatunnel.connectors.seatunnel.file.config.BaseSourceConfig;
-import java.io.Serializable;
-
-@Data
-public class HadoopConf implements Serializable {
-
- private String hdfsNameKey;
-
- private String fsHdfsImpl = "org.apache.hadoop.hdfs.DistributedFileSystem";
-
- public HadoopConf(String hdfsNameKey) {
- this.hdfsNameKey = hdfsNameKey;
- }
+/**
+ * Option keys used by the OSS file source in addition to those inherited
+ * from {@link BaseSourceConfig}.
+ */
+public class OssSourceConfig extends BaseSourceConfig {
+ // Option holding the OSS access key.
+ public static final String ACCESS_KEY = "accessKey";
+ // Option holding the OSS access secret.
+ public static final String ACCESS_SECRET = "accessSecret";
+ // Option holding the OSS endpoint.
+ public static final String ENDPOINT = "endpoint";
+ // Option holding the bucket address, e.g. "oss://my-bucket".
+ public static final String BUCKET = "bucket";
}
diff --git
a/seatunnel-connectors-v2/connector-file/connector-file-oss/src/main/resources/META-INF/services/org.apache.hadoop.fs.FileSystem
b/seatunnel-connectors-v2/connector-file/connector-file-oss/src/main/resources/META-INF/services/org.apache.hadoop.fs.FileSystem
new file mode 100644
index 000000000..9d056b306
--- /dev/null
+++
b/seatunnel-connectors-v2/connector-file/connector-file-oss/src/main/resources/META-INF/services/org.apache.hadoop.fs.FileSystem
@@ -0,0 +1,16 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+org.apache.hadoop.fs.aliyun.oss.AliyunOSSFileSystem
\ No newline at end of file
diff --git a/seatunnel-connectors-v2/connector-file/pom.xml
b/seatunnel-connectors-v2/connector-file/pom.xml
index 951ccdc33..00a3cc1b7 100644
--- a/seatunnel-connectors-v2/connector-file/pom.xml
+++ b/seatunnel-connectors-v2/connector-file/pom.xml
@@ -33,6 +33,7 @@
<module>connector-file-base</module>
<module>connector-file-hadoop</module>
<module>connector-file-local</module>
+ <module>connector-file-oss</module>
<module>connector-file-ftp</module>
</modules>
</project>
\ No newline at end of file