This is an automated email from the ASF dual-hosted git repository. tballison pushed a commit to branch docs/pipes-updates in repository https://gitbox.apache.org/repos/asf/tika.git
commit e6d9e53c1239aa23b6630a5bd3a5646275e45224 Author: tallison <[email protected]> AuthorDate: Mon May 11 16:07:05 2026 -0400 add s3 --- docs/modules/ROOT/examples/pipes-s3-emitter.json | 1 + docs/modules/ROOT/examples/pipes-s3-fetcher.json | 1 + docs/modules/ROOT/examples/pipes-s3-iterator.json | 1 + docs/modules/ROOT/examples/pipes-s3-pipeline.json | 1 + docs/modules/ROOT/nav.adoc | 1 + docs/modules/ROOT/pages/pipes/plugins/index.adoc | 2 +- docs/modules/ROOT/pages/pipes/plugins/s3.adoc | 242 +++++++++++++++++++++ .../apache/tika/pipes/s3/ConfigExamplesTest.java | 136 ++++++++++++ .../test/resources/config-examples/s3-emitter.json | 14 ++ .../test/resources/config-examples/s3-fetcher.json | 15 ++ .../resources/config-examples/s3-pipeline.json | 49 +++++ .../config-examples/s3-pipes-iterator.json | 13 ++ 12 files changed, 475 insertions(+), 1 deletion(-) diff --git a/docs/modules/ROOT/examples/pipes-s3-emitter.json b/docs/modules/ROOT/examples/pipes-s3-emitter.json new file mode 120000 index 0000000000..6f05a73ec2 --- /dev/null +++ b/docs/modules/ROOT/examples/pipes-s3-emitter.json @@ -0,0 +1 @@ +../../../../tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/test/resources/config-examples/s3-emitter.json \ No newline at end of file diff --git a/docs/modules/ROOT/examples/pipes-s3-fetcher.json b/docs/modules/ROOT/examples/pipes-s3-fetcher.json new file mode 120000 index 0000000000..b24bd4fa27 --- /dev/null +++ b/docs/modules/ROOT/examples/pipes-s3-fetcher.json @@ -0,0 +1 @@ +../../../../tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/test/resources/config-examples/s3-fetcher.json \ No newline at end of file diff --git a/docs/modules/ROOT/examples/pipes-s3-iterator.json b/docs/modules/ROOT/examples/pipes-s3-iterator.json new file mode 120000 index 0000000000..db1b210e82 --- /dev/null +++ b/docs/modules/ROOT/examples/pipes-s3-iterator.json @@ -0,0 +1 @@ +../../../../tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/test/resources/config-examples/s3-pipes-iterator.json \ 
No newline at end of file diff --git a/docs/modules/ROOT/examples/pipes-s3-pipeline.json b/docs/modules/ROOT/examples/pipes-s3-pipeline.json new file mode 120000 index 0000000000..cc6f573ec2 --- /dev/null +++ b/docs/modules/ROOT/examples/pipes-s3-pipeline.json @@ -0,0 +1 @@ +../../../../tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/test/resources/config-examples/s3-pipeline.json \ No newline at end of file diff --git a/docs/modules/ROOT/nav.adoc b/docs/modules/ROOT/nav.adoc index ef16b190dd..90fce8701d 100644 --- a/docs/modules/ROOT/nav.adoc +++ b/docs/modules/ROOT/nav.adoc @@ -33,6 +33,7 @@ ** xref:pipes/cpu-sizing.adoc[Forked-JVM CPU Sizing] ** xref:pipes/plugins/index.adoc[Plugins] *** xref:pipes/plugins/filesystem.adoc[File System] +*** xref:pipes/plugins/s3.adoc[Amazon S3] * xref:configuration/index.adoc[Configuration] ** xref:configuration/parsers/pdf-parser.adoc[PDF Parser] ** xref:configuration/parsers/tesseract-ocr-parser.adoc[Tesseract OCR] diff --git a/docs/modules/ROOT/pages/pipes/plugins/index.adoc b/docs/modules/ROOT/pages/pipes/plugins/index.adoc index 8542fa2034..d5173d2032 100644 --- a/docs/modules/ROOT/pages/pipes/plugins/index.adoc +++ b/docs/modules/ROOT/pages/pipes/plugins/index.adoc @@ -125,7 +125,7 @@ Many plugins implement more than one (e.g., the S3 plugin provides fetcher, emit == Interface Overviews -For descriptions of the interfaces themselves — their contracts, the shared concepts (`FetchKey`, `FetchEmitTuple`, `baseConfig`, etc.), and how they fit into a pipeline — see: +For descriptions of the interfaces themselves — their contracts, the shared concepts (`FetchKey`, `FetchEmitTuple`, `fetcherId`/`emitterId` wiring, etc.), and how they fit into a pipeline — see: * xref:pipes/fetchers.adoc[Fetchers] * xref:pipes/emitters.adoc[Emitters] diff --git a/docs/modules/ROOT/pages/pipes/plugins/s3.adoc b/docs/modules/ROOT/pages/pipes/plugins/s3.adoc new file mode 100644 index 0000000000..90d0960f06 --- /dev/null +++ 
b/docs/modules/ROOT/pages/pipes/plugins/s3.adoc @@ -0,0 +1,242 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + += Amazon S3 Plugin +:toc: +:toclevels: 3 + +The Amazon S3 plugin (`tika-pipes-s3`) provides fetcher, emitter, and iterator interfaces for objects in S3 (or any S3-compatible service such as MinIO). + +[cols="2,1,3"] +|=== +|Interface |Component name |Class + +|Fetcher +|`s3-fetcher` +|`S3Fetcher` + +|Emitter +|`s3-emitter` +|`S3Emitter` + +|Iterator +|`s3-pipes-iterator` +|`S3PipesIterator` +|=== + +[#credentials] +== Credentials + +All three components share the same `credentialsProvider` selector: + +* `profile` — reads credentials from the local AWS profile named by `profile` (e.g., `default`). +* `instance` — uses the instance/container role attached to the host (EC2 IAM role, ECS task role, etc.). No additional fields needed. +* `key_secret` — reads `accessKey` and `secretKey` from the config. Avoid checking these into source control; prefer environment-variable substitution or one of the other providers. + +The emitter's `validate()` enforces these values, but the fetcher and iterator do not — they fail later when the AWS SDK tries to resolve credentials. 
+ +[#s3-fetcher] +== S3 Fetcher (`s3-fetcher`) + +Reads objects from an S3 bucket. The fetch key is the object's key relative to `prefix` (if set); the fetcher resolves it underneath that prefix. + +[source,json] +---- +include::example$pipes-s3-fetcher.json[] +---- + +=== Configuration + +[cols="1,1,3"] +|=== +|Field |Default |Description + +|`bucket` +|_required_ +|S3 bucket name. + +|`region` +|_required_ +|AWS region (e.g., `us-east-1`). + +|`prefix` +|_no default_ +|Optional key prefix. Fetch keys are resolved underneath this prefix. + +|`credentialsProvider` +|_required_ +|One of `profile`, `instance`, `key_secret`. See <<credentials>>. + +|`profile` / `accessKey` / `secretKey` +|_conditional_ +|Required by the matching `credentialsProvider`. + +|`spoolToTemp` +|`true` +|If `true`, the fetched object is spooled to a temp file before being parsed. + +|`extractUserMetadata` +|`true` +|If `true`, S3 user-metadata is copied into the parsed `Metadata`. + +|`maxConnections` +|`0` +|Maximum HTTP connections in the S3 client pool. `0` lets the SDK pick a default. + +|`maxLength` +|`-1` +|Maximum object size, in bytes. `-1` means no limit. + +|`endpointConfigurationService` +|_no default_ +|Custom S3 endpoint, for S3-compatible services such as MinIO or LocalStack. + +|`pathStyleAccessEnabled` +|`false` +|Force path-style URLs (e.g., `https://endpoint/bucket/key`). Required by some S3-compatible services. + +|`throttleSeconds` +|_no default_ +|Optional rate-limit array; on consecutive failures the fetcher sleeps for the corresponding number of seconds. +|=== + +[#s3-emitter] +== S3 Emitter (`s3-emitter`) + +Writes parsed results back to an S3 bucket. The emit key (relative to `prefix`) is derived from the `FetchEmitTuple`. + +[source,json] +---- +include::example$pipes-s3-emitter.json[] +---- + +=== Configuration + +[cols="1,1,3"] +|=== +|Field |Default |Description + +|`bucket` +|_required_ +|Destination S3 bucket name (validated non-blank). + +|`region` +|_required_ +|AWS region (validated non-blank). 
+ +|`credentialsProvider` +|_required_ +|One of `profile`, `instance`, `key_secret` (validated). See <<credentials>>. + +|`profile` / `accessKey` / `secretKey` +|_conditional_ +|Required by the matching `credentialsProvider` (validated). + +|`prefix` +|_no default_ +|Optional key prefix. A trailing `/` is stripped automatically. + +|`fileExtension` +|`json` +|Extension appended to each emitted key. + +|`spoolToTemp` +|`true` +|If `true`, output is spooled locally before being uploaded. + +|`maxConnections` +|`50` +|Maximum HTTP connections in the S3 client pool. + +|`endpointConfigurationService` +|_no default_ +|Custom S3 endpoint, for S3-compatible services. + +|`pathStyleAccessEnabled` +|`false` +|Force path-style URLs. +|=== + +[#s3-iterator] +== S3 Iterator (`s3-pipes-iterator`) + +Lists objects under a bucket/prefix and emits one `FetchEmitTuple` per object found. + +[source,json] +---- +include::example$pipes-s3-iterator.json[] +---- + +=== Configuration + +[cols="1,1,3"] +|=== +|Field |Default |Description + +|`bucket` +|_required_ +|S3 bucket to enumerate. + +|`region` +|_required_ +|AWS region. + +|`prefix` +|`""` +|Key prefix to scope the listing. + +|`credentialsProvider` +|_optional_ +|One of `profile`, `instance`, `key_secret`. See <<credentials>>. + +|`profile` / `accessKey` / `secretKey` / `endpointConfigurationService` +|_conditional_ +|Auth and custom-endpoint fields, mirroring the fetcher and emitter. + +|`fileNamePattern` +|_no default_ +|Optional regex; only keys whose name matches are emitted. + +|`maxConnections` +|`50` +|Maximum HTTP connections in the S3 client pool. + +|`pathStyleAccessEnabled` +|`false` +|Force path-style URLs. + +|`fetcherId` / `emitterId` +|_required_ +|IDs of the fetcher and emitter to bind to each emitted tuple. See xref:pipes/iterators.adoc[Pipes Iterators] for the shared iterator contract. 
+|=== + +[#s3-pipeline] +== Complete Pipeline Example + +The example below wires the S3 fetcher, emitter, and iterator into a complete pipeline that lists `s3://my-tika-input/incoming/` and writes results to `s3://my-tika-output/results/`. + +[source,json] +---- +include::example$pipes-s3-pipeline.json[] +---- + +[#notes] +== Notes + +* The fetcher, emitter, and iterator each maintain their own S3 client. Auth and endpoint settings need to be configured per component, not globally. +* The AWS SDK enforces TLS 1.2+ by default, so data is encrypted in flight. For at-rest encryption, configure bucket-level SSE on the AWS side. +* When using `endpointConfigurationService` against MinIO or LocalStack, you almost always need `pathStyleAccessEnabled: true`. diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/test/java/org/apache/tika/pipes/s3/ConfigExamplesTest.java b/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/test/java/org/apache/tika/pipes/s3/ConfigExamplesTest.java new file mode 100644 index 0000000000..f248d8194e --- /dev/null +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/test/java/org/apache/tika/pipes/s3/ConfigExamplesTest.java @@ -0,0 +1,136 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.pipes.s3; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; + +import java.io.InputStream; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; + +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +import org.apache.tika.config.loader.TikaLoader; +import org.apache.tika.pipes.emitter.s3.S3EmitterConfig; +import org.apache.tika.pipes.fetcher.s3.config.S3FetcherConfig; +import org.apache.tika.pipes.iterator.s3.S3PipesIteratorConfig; + +/** + * Validates S3 fetcher/emitter/iterator configuration examples used in documentation. + * <p> + * The JSON configuration examples are stored in {@code src/test/resources/config-examples/} + * and are included directly in the AsciiDoc documentation via the {@code include::} directive. 
+ */ +public class ConfigExamplesTest { + + private static final String EXAMPLES_DIR = "/config-examples/"; + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + @TempDir + Path tempDir; + + private String readExample(String resourceName) throws Exception { + try (InputStream is = getClass().getResourceAsStream(EXAMPLES_DIR + resourceName)) { + assertNotNull(is, "Resource not found: " + resourceName); + return new String(is.readAllBytes(), StandardCharsets.UTF_8); + } + } + + private void loadViaTikaLoader(String resourceName) throws Exception { + String json = readExample(resourceName); + Path configFile = tempDir.resolve("tika-config.json"); + Files.writeString(configFile, json, StandardCharsets.UTF_8); + TikaLoader loader = TikaLoader.load(configFile); + assertNotNull(loader, "TikaLoader should not be null for: " + resourceName); + } + + private JsonNode innerComponent(String json, String section, String id, String typeName) + throws Exception { + JsonNode root = OBJECT_MAPPER.readTree(json); + JsonNode sectionNode = root.get(section); + assertNotNull(sectionNode, "Missing section: " + section); + JsonNode idNode = id == null ? 
sectionNode : sectionNode.get(id); + assertNotNull(idNode, "Missing id: " + id); + JsonNode typed = idNode.get(typeName); + assertNotNull(typed, "Missing type: " + typeName); + return typed; + } + + @Test + public void testS3FetcherConfig() throws Exception { + loadViaTikaLoader("s3-fetcher.json"); + + JsonNode inner = innerComponent(readExample("s3-fetcher.json"), + "fetchers", "s3f", "s3-fetcher"); + S3FetcherConfig config = S3FetcherConfig.load(inner.toString()); + assertEquals("my-tika-input", config.getBucket()); + assertEquals("us-east-1", config.getRegion()); + assertEquals("profile", config.getCredentialsProvider()); + assertEquals("default", config.getProfile()); + } + + @Test + public void testS3EmitterConfig() throws Exception { + loadViaTikaLoader("s3-emitter.json"); + + JsonNode inner = innerComponent(readExample("s3-emitter.json"), + "emitters", "s3e", "s3-emitter"); + S3EmitterConfig config = S3EmitterConfig.load(inner.toString()); + assertEquals("my-tika-output", config.bucket()); + assertEquals("us-east-1", config.region()); + assertEquals("profile", config.credentialsProvider()); + assertEquals("json", config.fileExtension()); + // exercises required-field + credentialsProvider whitelist validation + config.validate(); + } + + @Test + public void testS3IteratorConfig() throws Exception { + loadViaTikaLoader("s3-pipes-iterator.json"); + + JsonNode inner = innerComponent(readExample("s3-pipes-iterator.json"), + "pipes-iterator", null, "s3-pipes-iterator"); + S3PipesIteratorConfig config = S3PipesIteratorConfig.load(inner.toString()); + assertEquals("my-tika-input", config.getBucket()); + assertEquals("us-east-1", config.getRegion()); + assertEquals("s3f", config.getFetcherId()); + assertEquals("s3e", config.getEmitterId()); + } + + @Test + public void testS3PipelineConfig() throws Exception { + loadViaTikaLoader("s3-pipeline.json"); + + String json = readExample("s3-pipeline.json"); + S3FetcherConfig fetcher = S3FetcherConfig.load( + 
innerComponent(json, "fetchers", "s3f", "s3-fetcher").toString()); + S3EmitterConfig emitter = S3EmitterConfig.load( + innerComponent(json, "emitters", "s3e", "s3-emitter").toString()); + S3PipesIteratorConfig iterator = S3PipesIteratorConfig.load( + innerComponent(json, "pipes-iterator", null, "s3-pipes-iterator").toString()); + + emitter.validate(); + assertEquals(fetcher.getBucket(), iterator.getBucket()); + assertEquals("s3f", iterator.getFetcherId()); + assertEquals("s3e", iterator.getEmitterId()); + } +} diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/test/resources/config-examples/s3-emitter.json b/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/test/resources/config-examples/s3-emitter.json new file mode 100644 index 0000000000..8cd5557db1 --- /dev/null +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/test/resources/config-examples/s3-emitter.json @@ -0,0 +1,14 @@ +{ + "emitters": { + "s3e": { + "s3-emitter": { + "bucket": "my-tika-output", + "region": "us-east-1", + "prefix": "results/", + "fileExtension": "json", + "credentialsProvider": "profile", + "profile": "default" + } + } + } +} diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/test/resources/config-examples/s3-fetcher.json b/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/test/resources/config-examples/s3-fetcher.json new file mode 100644 index 0000000000..8047fee2b0 --- /dev/null +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/test/resources/config-examples/s3-fetcher.json @@ -0,0 +1,15 @@ +{ + "fetchers": { + "s3f": { + "s3-fetcher": { + "bucket": "my-tika-input", + "region": "us-east-1", + "prefix": "incoming/", + "credentialsProvider": "profile", + "profile": "default", + "extractUserMetadata": true, + "spoolToTemp": true + } + } + } +} diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/test/resources/config-examples/s3-pipeline.json b/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/test/resources/config-examples/s3-pipeline.json new file mode 
100644 index 0000000000..1f17aa7081 --- /dev/null +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/test/resources/config-examples/s3-pipeline.json @@ -0,0 +1,49 @@ +{ + "content-handler-factory": { + "basic-content-handler-factory": { + "type": "TEXT", + "writeLimit": -1, + "throwOnWriteLimitReached": true + } + }, + "fetchers": { + "s3f": { + "s3-fetcher": { + "bucket": "my-tika-input", + "region": "us-east-1", + "prefix": "incoming/", + "credentialsProvider": "profile", + "profile": "default", + "extractUserMetadata": true + } + } + }, + "emitters": { + "s3e": { + "s3-emitter": { + "bucket": "my-tika-output", + "region": "us-east-1", + "prefix": "results/", + "fileExtension": "json", + "credentialsProvider": "profile", + "profile": "default" + } + } + }, + "pipes-iterator": { + "s3-pipes-iterator": { + "bucket": "my-tika-input", + "region": "us-east-1", + "prefix": "incoming/", + "credentialsProvider": "profile", + "profile": "default", + "fetcherId": "s3f", + "emitterId": "s3e" + } + }, + "pipes": { + "parseMode": "RMETA", + "onParseException": "EMIT", + "numClients": 4 + } +} diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/test/resources/config-examples/s3-pipes-iterator.json b/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/test/resources/config-examples/s3-pipes-iterator.json new file mode 100644 index 0000000000..e1fb2e9875 --- /dev/null +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-s3/src/test/resources/config-examples/s3-pipes-iterator.json @@ -0,0 +1,13 @@ +{ + "pipes-iterator": { + "s3-pipes-iterator": { + "bucket": "my-tika-input", + "region": "us-east-1", + "prefix": "incoming/", + "credentialsProvider": "profile", + "profile": "default", + "fetcherId": "s3f", + "emitterId": "s3e" + } + } +}
