This is an automated email from the ASF dual-hosted git repository. tballison pushed a commit to branch docs/pipes-updates in repository https://gitbox.apache.org/repos/asf/tika.git
commit cbc65c9bb88b85763660bb6d556a3dd87e5601d9 Author: tallison <[email protected]> AuthorDate: Mon May 11 16:11:28 2026 -0400 gcs --- docs/modules/ROOT/examples/pipes-gcs-emitter.json | 1 + docs/modules/ROOT/examples/pipes-gcs-fetcher.json | 1 + docs/modules/ROOT/examples/pipes-gcs-iterator.json | 1 + docs/modules/ROOT/examples/pipes-gcs-pipeline.json | 1 + docs/modules/ROOT/nav.adoc | 1 + docs/modules/ROOT/pages/pipes/plugins/gcs.adoc | 166 +++++++++++++++++++++ .../apache/tika/pipes/gcs/ConfigExamplesTest.java | 133 +++++++++++++++++ .../resources/config-examples/gcs-emitter.json | 12 ++ .../resources/config-examples/gcs-fetcher.json | 12 ++ .../resources/config-examples/gcs-pipeline.json | 42 ++++++ .../config-examples/gcs-pipes-iterator.json | 11 ++ 11 files changed, 381 insertions(+) diff --git a/docs/modules/ROOT/examples/pipes-gcs-emitter.json b/docs/modules/ROOT/examples/pipes-gcs-emitter.json new file mode 120000 index 0000000000..48c994f74a --- /dev/null +++ b/docs/modules/ROOT/examples/pipes-gcs-emitter.json @@ -0,0 +1 @@ +../../../../tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/test/resources/config-examples/gcs-emitter.json \ No newline at end of file diff --git a/docs/modules/ROOT/examples/pipes-gcs-fetcher.json b/docs/modules/ROOT/examples/pipes-gcs-fetcher.json new file mode 120000 index 0000000000..8b390e310c --- /dev/null +++ b/docs/modules/ROOT/examples/pipes-gcs-fetcher.json @@ -0,0 +1 @@ +../../../../tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/test/resources/config-examples/gcs-fetcher.json \ No newline at end of file diff --git a/docs/modules/ROOT/examples/pipes-gcs-iterator.json b/docs/modules/ROOT/examples/pipes-gcs-iterator.json new file mode 120000 index 0000000000..d4f6b6b934 --- /dev/null +++ b/docs/modules/ROOT/examples/pipes-gcs-iterator.json @@ -0,0 +1 @@ +../../../../tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/test/resources/config-examples/gcs-pipes-iterator.json \ No newline at end of file diff --git 
a/docs/modules/ROOT/examples/pipes-gcs-pipeline.json b/docs/modules/ROOT/examples/pipes-gcs-pipeline.json new file mode 120000 index 0000000000..621bad767e --- /dev/null +++ b/docs/modules/ROOT/examples/pipes-gcs-pipeline.json @@ -0,0 +1 @@ +../../../../tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/test/resources/config-examples/gcs-pipeline.json \ No newline at end of file diff --git a/docs/modules/ROOT/nav.adoc b/docs/modules/ROOT/nav.adoc index 90fce8701d..e5e2a09624 100644 --- a/docs/modules/ROOT/nav.adoc +++ b/docs/modules/ROOT/nav.adoc @@ -34,6 +34,7 @@ ** xref:pipes/plugins/index.adoc[Plugins] *** xref:pipes/plugins/filesystem.adoc[File System] *** xref:pipes/plugins/s3.adoc[Amazon S3] +*** xref:pipes/plugins/gcs.adoc[Google Cloud Storage] * xref:configuration/index.adoc[Configuration] ** xref:configuration/parsers/pdf-parser.adoc[PDF Parser] ** xref:configuration/parsers/tesseract-ocr-parser.adoc[Tesseract OCR] diff --git a/docs/modules/ROOT/pages/pipes/plugins/gcs.adoc b/docs/modules/ROOT/pages/pipes/plugins/gcs.adoc new file mode 100644 index 0000000000..d639580d0f --- /dev/null +++ b/docs/modules/ROOT/pages/pipes/plugins/gcs.adoc @@ -0,0 +1,166 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + += Google Cloud Storage Plugin +:toc: +:toclevels: 3 + +The Google Cloud Storage plugin (`tika-pipes-gcs`) provides fetcher, emitter, and iterator implementations for objects in GCS buckets. + +[cols="2,1,3"] +|=== +|Interface |Component name |Class + +|Fetcher +|`gcs-fetcher` +|`GCSFetcher` + +|Emitter +|`gcs-emitter` +|`GCSEmitter` + +|Iterator +|`gcs-pipes-iterator` +|`GCSPipesIterator` +|=== + +[#credentials] +== Credentials + +The GCS plugin relies on Google's Application Default Credentials chain — there are no credential fields in the JSON config itself. Supply credentials in one of the following ways: + +* Running on a GCP service (GCE/GKE/Cloud Run) — uses the attached service account automatically. +* Setting the `GOOGLE_APPLICATION_CREDENTIALS` environment variable to the path of a service-account JSON key. +* Running `gcloud auth application-default login` for local development. + +The `projectId` field in each component selects which GCP project to bill the API calls against; the service account or user must have storage access to the named bucket. + +[#gcs-fetcher] +== GCS Fetcher (`gcs-fetcher`) + +Reads objects from a GCS bucket. The fetch key is the object name. + +[source,json] +---- +include::example$pipes-gcs-fetcher.json[] +---- + +=== Configuration + +[cols="1,1,3"] +|=== +|Field |Default |Description + +|`projectId` +|_required_ +|GCP project ID that API calls are billed against; credentials come from Application Default Credentials, not from this field. + +|`bucket` +|_required_ +|GCS bucket name. + +|`spoolToTemp` +|`true` +|If `true`, the fetched object is spooled to a temp file before parsing. + +|`extractUserMetadata` +|`true` +|If `true`, GCS custom metadata is copied into the parsed `Metadata`. +|=== + +[#gcs-emitter] +== GCS Emitter (`gcs-emitter`) + +Writes parsed results to a GCS bucket. The emit key (relative to `prefix`) is derived from the `FetchEmitTuple`. 
+ +[source,json] +---- +include::example$pipes-gcs-emitter.json[] +---- + +=== Configuration + +[cols="1,1,3"] +|=== +|Field |Default |Description + +|`projectId` +|_required_ +|GCP project ID (validated non-blank). + +|`bucket` +|_required_ +|Destination GCS bucket (validated non-blank). + +|`prefix` +|_no default_ +|Optional object-name prefix. A trailing `/` is stripped automatically. + +|`fileExtension` +|`json` +|Extension appended to each emitted object name. +|=== + +[#gcs-iterator] +== GCS Iterator (`gcs-pipes-iterator`) + +Lists objects under a bucket/prefix and emits one `FetchEmitTuple` per object. + +[source,json] +---- +include::example$pipes-gcs-iterator.json[] +---- + +=== Configuration + +[cols="1,1,3"] +|=== +|Field |Default |Description + +|`bucket` +|_required_ +|GCS bucket to enumerate. + +|`projectId` +|`""` +|GCP project ID for the listing API call. + +|`prefix` +|`""` +|Object-name prefix to scope the listing. + +|`fetcherId` / `emitterId` +|_required_ +|IDs of the fetcher and emitter to bind to each emitted tuple. See xref:pipes/iterators.adoc[Pipes Iterators] for the shared iterator contract. +|=== + +[#gcs-pipeline] +== Complete Pipeline Example + +The example below wires the GCS fetcher, emitter, and iterator together for a bucket-to-bucket pipeline. + +[source,json] +---- +include::example$pipes-gcs-pipeline.json[] +---- + +[#notes] +== Notes + +* The GCS plugin uses the official `google-cloud-storage` SDK. Set `GOOGLE_APPLICATION_CREDENTIALS` (or rely on workload identity / metadata server) to authenticate. +* Each component creates its own `Storage` client. Heavy throughput should be balanced against your project's per-second request quota. +* Unlike S3, there is no `path-style` toggle — GCS uses a single global endpoint. 
diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/test/java/org/apache/tika/pipes/gcs/ConfigExamplesTest.java b/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/test/java/org/apache/tika/pipes/gcs/ConfigExamplesTest.java new file mode 100644 index 0000000000..7cfc1f3fb1 --- /dev/null +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/test/java/org/apache/tika/pipes/gcs/ConfigExamplesTest.java @@ -0,0 +1,133 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.pipes.gcs; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; + +import java.io.InputStream; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; + +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +import org.apache.tika.config.loader.TikaLoader; +import org.apache.tika.pipes.emitter.gcs.GCSEmitterConfig; +import org.apache.tika.pipes.fetcher.gcs.config.GCSFetcherConfig; +import org.apache.tika.pipes.iterator.gcs.GCSPipesIteratorConfig; + +/** + * Validates GCS fetcher/emitter/iterator configuration examples used in documentation. + * <p> + * The JSON configuration examples are stored in {@code src/test/resources/config-examples/} + * and are included directly in the AsciiDoc documentation via the {@code include::} directive. 
+ */ +public class ConfigExamplesTest { + + private static final String EXAMPLES_DIR = "/config-examples/"; + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + @TempDir + Path tempDir; + + private String readExample(String resourceName) throws Exception { + try (InputStream is = getClass().getResourceAsStream(EXAMPLES_DIR + resourceName)) { + assertNotNull(is, "Resource not found: " + resourceName); + return new String(is.readAllBytes(), StandardCharsets.UTF_8); + } + } + + private void loadViaTikaLoader(String resourceName) throws Exception { + String json = readExample(resourceName); + Path configFile = tempDir.resolve("tika-config.json"); + Files.writeString(configFile, json, StandardCharsets.UTF_8); + TikaLoader loader = TikaLoader.load(configFile); + assertNotNull(loader, "TikaLoader should not be null for: " + resourceName); + } + + private JsonNode innerComponent(String json, String section, String id, String typeName) + throws Exception { + JsonNode root = OBJECT_MAPPER.readTree(json); + JsonNode sectionNode = root.get(section); + assertNotNull(sectionNode, "Missing section: " + section); + JsonNode idNode = id == null ? 
sectionNode : sectionNode.get(id); + assertNotNull(idNode, "Missing id: " + id); + JsonNode typed = idNode.get(typeName); + assertNotNull(typed, "Missing type: " + typeName); + return typed; + } + + @Test + public void testGCSFetcherConfig() throws Exception { + loadViaTikaLoader("gcs-fetcher.json"); + + JsonNode inner = innerComponent(readExample("gcs-fetcher.json"), + "fetchers", "gcsf", "gcs-fetcher"); + GCSFetcherConfig config = GCSFetcherConfig.load(inner.toString()); + assertEquals("my-gcp-project", config.getProjectId()); + assertEquals("my-tika-input", config.getBucket()); + } + + @Test + public void testGCSEmitterConfig() throws Exception { + loadViaTikaLoader("gcs-emitter.json"); + + JsonNode inner = innerComponent(readExample("gcs-emitter.json"), + "emitters", "gcse", "gcs-emitter"); + GCSEmitterConfig config = GCSEmitterConfig.load(inner.toString()); + assertEquals("my-gcp-project", config.projectId()); + assertEquals("my-tika-output", config.bucket()); + assertEquals("json", config.fileExtension()); + config.validate(); + assertEquals("results", config.getNormalizedPrefix()); + } + + @Test + public void testGCSIteratorConfig() throws Exception { + loadViaTikaLoader("gcs-pipes-iterator.json"); + + JsonNode inner = innerComponent(readExample("gcs-pipes-iterator.json"), + "pipes-iterator", null, "gcs-pipes-iterator"); + GCSPipesIteratorConfig config = GCSPipesIteratorConfig.load(inner.toString()); + assertEquals("my-gcp-project", config.getProjectId()); + assertEquals("my-tika-input", config.getBucket()); + assertEquals("gcsf", config.getFetcherId()); + assertEquals("gcse", config.getEmitterId()); + } + + @Test + public void testGCSPipelineConfig() throws Exception { + loadViaTikaLoader("gcs-pipeline.json"); + + String json = readExample("gcs-pipeline.json"); + GCSFetcherConfig fetcher = GCSFetcherConfig.load( + innerComponent(json, "fetchers", "gcsf", "gcs-fetcher").toString()); + GCSEmitterConfig emitter = GCSEmitterConfig.load( + innerComponent(json, 
"emitters", "gcse", "gcs-emitter").toString()); + GCSPipesIteratorConfig iterator = GCSPipesIteratorConfig.load( + innerComponent(json, "pipes-iterator", null, "gcs-pipes-iterator").toString()); + + emitter.validate(); + assertEquals(fetcher.getBucket(), iterator.getBucket()); + assertEquals("gcsf", iterator.getFetcherId()); + assertEquals("gcse", iterator.getEmitterId()); + } +} diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/test/resources/config-examples/gcs-emitter.json b/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/test/resources/config-examples/gcs-emitter.json new file mode 100644 index 0000000000..6ba0603792 --- /dev/null +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/test/resources/config-examples/gcs-emitter.json @@ -0,0 +1,12 @@ +{ + "emitters": { + "gcse": { + "gcs-emitter": { + "projectId": "my-gcp-project", + "bucket": "my-tika-output", + "prefix": "results/", + "fileExtension": "json" + } + } + } +} diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/test/resources/config-examples/gcs-fetcher.json b/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/test/resources/config-examples/gcs-fetcher.json new file mode 100644 index 0000000000..89ab85eed3 --- /dev/null +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/test/resources/config-examples/gcs-fetcher.json @@ -0,0 +1,12 @@ +{ + "fetchers": { + "gcsf": { + "gcs-fetcher": { + "projectId": "my-gcp-project", + "bucket": "my-tika-input", + "extractUserMetadata": true, + "spoolToTemp": true + } + } + } +} diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/test/resources/config-examples/gcs-pipeline.json b/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/test/resources/config-examples/gcs-pipeline.json new file mode 100644 index 0000000000..8c483e5104 --- /dev/null +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/test/resources/config-examples/gcs-pipeline.json @@ -0,0 +1,42 @@ +{ + "content-handler-factory": { + "basic-content-handler-factory": { + 
"type": "TEXT", + "writeLimit": -1, + "throwOnWriteLimitReached": true + } + }, + "fetchers": { + "gcsf": { + "gcs-fetcher": { + "projectId": "my-gcp-project", + "bucket": "my-tika-input", + "extractUserMetadata": true + } + } + }, + "emitters": { + "gcse": { + "gcs-emitter": { + "projectId": "my-gcp-project", + "bucket": "my-tika-output", + "prefix": "results/", + "fileExtension": "json" + } + } + }, + "pipes-iterator": { + "gcs-pipes-iterator": { + "projectId": "my-gcp-project", + "bucket": "my-tika-input", + "prefix": "incoming/", + "fetcherId": "gcsf", + "emitterId": "gcse" + } + }, + "pipes": { + "parseMode": "RMETA", + "onParseException": "EMIT", + "numClients": 4 + } +} diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/test/resources/config-examples/gcs-pipes-iterator.json b/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/test/resources/config-examples/gcs-pipes-iterator.json new file mode 100644 index 0000000000..756e087848 --- /dev/null +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/test/resources/config-examples/gcs-pipes-iterator.json @@ -0,0 +1,11 @@ +{ + "pipes-iterator": { + "gcs-pipes-iterator": { + "projectId": "my-gcp-project", + "bucket": "my-tika-input", + "prefix": "incoming/", + "fetcherId": "gcsf", + "emitterId": "gcse" + } + } +}
