This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4252 in repository https://gitbox.apache.org/repos/asf/tika.git
commit 63fcc02887eb8e8733cdfad7d614d375efa0d55c Author: tallison <[email protected]> AuthorDate: Thu May 9 12:20:13 2024 -0400 TIKA-4252 - revert and add user test to confirm user metadata is in results --- .../java/org/apache/tika/pipes/PipesServer.java | 22 ++++---- .../org/apache/tika/pipes/PipesClientTest.java | 66 ++++++++++++++++++++++ .../resources/org/apache/tika/pipes/TIKA-4252.xml | 30 ++++++++++ 3 files changed, 106 insertions(+), 12 deletions(-) diff --git a/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java b/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java index 20a5def59..98192d694 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java +++ b/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java @@ -371,8 +371,13 @@ public class PipesServer implements Runnable { MetadataListAndEmbeddedBytes parseData = null; try { - //this can be null if there is a fetch exception - parseData = parseFromTuple(t, fetcher); + try { + parseData = parseFromTuple(t, fetcher); + } catch (IOException | TikaException e) { + LOG.warn("fetch exception " + t.getId(), e); + write(STATUS.FETCH_EXCEPTION, ExceptionUtils.getStackTrace(e)); + return; + } if (LOG.isTraceEnabled()) { LOG.trace("timer -- to parse: {} ms", System.currentTimeMillis() - start); @@ -455,37 +460,30 @@ public class PipesServer implements Runnable { } } - protected MetadataListAndEmbeddedBytes parseFromTuple(FetchEmitTuple t, Fetcher fetcher) { + protected MetadataListAndEmbeddedBytes parseFromTuple(FetchEmitTuple t, Fetcher fetcher) throws TikaException, IOException { FetchKey fetchKey = t.getFetchKey(); if (fetchKey.hasRange()) { if (!(fetcher instanceof RangeFetcher)) { throw new IllegalArgumentException( "fetch key has a range, but the fetcher is not a range fetcher"); } - Metadata metadata = t.getMetadata() == null ? new Metadata() : t.getMetadata(); + Metadata metadata = new Metadata(); try (InputStream stream = ((RangeFetcher) fetcher).fetch(fetchKey.getFetchKey(), fetchKey.getRangeStart(), fetchKey.getRangeEnd(), metadata)) { return parseWithStream(t, stream, metadata); } catch (SecurityException e) { LOG.error("security exception " + t.getId(), e); throw e; - } catch (TikaException | IOException e) { - LOG.warn("fetch exception " + t.getId(), e); - write(STATUS.FETCH_EXCEPTION, ExceptionUtils.getStackTrace(e)); } } else { - Metadata metadata = t.getMetadata() == null ? new Metadata() : t.getMetadata(); + Metadata metadata = new Metadata(); try (InputStream stream = fetcher.fetch(t.getFetchKey().getFetchKey(), metadata)) { return parseWithStream(t, stream, metadata); } catch (SecurityException e) { LOG.error("security exception " + t.getId(), e); throw e; - } catch (TikaException | IOException e) { - LOG.warn("fetch exception " + t.getId(), e); - write(STATUS.FETCH_EXCEPTION, ExceptionUtils.getStackTrace(e)); } } - return null; } private String getNoFetcherMsg(String fetcherName) { diff --git a/tika-core/src/test/java/org/apache/tika/pipes/PipesClientTest.java b/tika-core/src/test/java/org/apache/tika/pipes/PipesClientTest.java new file mode 100644 index 000000000..a8c182ddc --- /dev/null +++ b/tika-core/src/test/java/org/apache/tika/pipes/PipesClientTest.java @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.pipes; + +import static org.apache.tika.TikaTest.assertContains; +import static org.junit.jupiter.api.Assertions.assertEquals; + +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.List; + +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.pipes.emitter.EmitKey; +import org.apache.tika.pipes.fetcher.FetchKey; + +public class PipesClientTest { + + @Test + public void testUserMetadataAndNoEmitter(@TempDir Path tmp) throws Exception { + Path tikaConfigTemplate = Paths.get(PipesClientTest.class.getResource("TIKA-4252.xml").toURI()); + Path tikaConfig = tmp.resolve("tika-config.xml"); + String xml = Files.readString(tikaConfigTemplate, StandardCharsets.UTF_8); + xml = xml.replace("BASE_PATH", + Paths.get(PipesClientTest.class.getResource("/test-documents").toURI()).toAbsolutePath().toString()); + Files.writeString(tikaConfig, xml); + + List<Metadata> metadataList; + try (PipesClient pipesClient = new PipesClient(PipesConfig.load(tikaConfig))) { + FetchKey fetchKey = new FetchKey("fs", "mock_times.xml"); + Metadata userMetadata = new Metadata(); + userMetadata.set("k1", "v1"); + userMetadata.add("k2", "v2a"); + userMetadata.add("k2", "v2b"); + PipesResult pipesResult = pipesClient.process( + new FetchEmitTuple("my-id", fetchKey, new EmitKey(), userMetadata, HandlerConfig.DEFAULT_HANDLER_CONFIG, FetchEmitTuple.ON_PARSE_EXCEPTION.SKIP)); + metadataList = pipesResult + .getEmitData() + .getMetadataList(); + } + assertEquals("application/mock+xml", metadataList.get(0).get(Metadata.CONTENT_TYPE)); + assertContains("hello", metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT)); + assertEquals("v1", metadataList.get(0).get("k1")); + assertEquals("v2a", metadataList.get(0).getValues("k2")[0]); + assertEquals("v2b", metadataList.get(0).getValues("k2")[1]); + } +} diff --git a/tika-core/src/test/resources/org/apache/tika/pipes/TIKA-4252.xml b/tika-core/src/test/resources/org/apache/tika/pipes/TIKA-4252.xml new file mode 100644 index 000000000..036f0f2a5 --- /dev/null +++ b/tika-core/src/test/resources/org/apache/tika/pipes/TIKA-4252.xml @@ -0,0 +1,30 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<properties> + <fetchers> + <fetcher class="org.apache.tika.pipes.fetcher.fs.FileSystemFetcher"> + <name>fs</name> + <basePath>BASE_PATH</basePath> + </fetcher> + </fetchers> + <emitters> + <emitter class="org.apache.tika.pipes.async.MockEmitter"> + <name>e</name> + </emitter> + </emitters> +</properties> \ No newline at end of file
