This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 9be81d1 TIKA-3317 -- small cleanups
9be81d1 is described below
commit 9be81d17944043ad0003e17fd73b39432995d566
Author: tallison <[email protected]>
AuthorDate: Mon May 17 13:19:44 2021 -0400
TIKA-3317 -- small cleanups
---
.../java/org/apache/tika/pipes/PipesClient.java | 2 +-
.../java/org/apache/tika/pipes/PipesServer.java | 2 +-
tika-pipes/tika-emitters/tika-emitter-solr/pom.xml | 8 ++
.../tika/pipes/emitter/solr/SolrEmitter.java | 91 +++++++++---------
.../src/test/resources/log4j.properties | 20 ++--
tika-pipes/tika-httpclient-commons/pom.xml | 6 ++
tika-pipes/tika-pipes-integration-tests/pom.xml | 32 +++++++
.../tika/pipes/solrtest/TikaPipesSolr6Test.java | 6 +-
.../tika/pipes/solrtest/TikaPipesSolr7Test.java | 6 +-
.../tika/pipes/solrtest/TikaPipesSolr8Test.java | 6 +-
.../tika/pipes/solrtest/TikaPipesSolrTestBase.java | 104 +++++++++++----------
.../{log4j.properties => log4j2.properties} | 20 ++--
.../src/test/resources/tika-async-log4j.properties | 13 ---
.../test/resources/tika-async-log4j2.properties} | 21 +++--
.../tika-pipes-iterator-solr/pom.xml | 50 +++++-----
.../tika/pipes/solrtest/SolrPipesIterator.java | 41 ++++----
16 files changed, 239 insertions(+), 189 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/pipes/PipesClient.java
b/tika-core/src/main/java/org/apache/tika/pipes/PipesClient.java
index 2db98b2..1b2eecd 100644
--- a/tika-core/src/main/java/org/apache/tika/pipes/PipesClient.java
+++ b/tika-core/src/main/java/org/apache/tika/pipes/PipesClient.java
@@ -343,7 +343,7 @@ public class PipesClient implements Closeable {
} else if (line.startsWith("error ")) {
SERVER_LOG.error(line.substring(6));
} else {
- SERVER_LOG.error(line);
+ SERVER_LOG.debug(line);
}
try {
line = reader.readLine();
diff --git a/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java
b/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java
index e251dc8..6a5f6c3 100644
--- a/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java
+++ b/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java
@@ -184,7 +184,7 @@ public class PipesServer implements Runnable {
}
private void err(Throwable t) {
- System.err.println("err " +
ExceptionUtils.getStackTrace(t).replaceAll("[\r\n]", " "));
+ System.err.println("error " +
ExceptionUtils.getStackTrace(t).replaceAll("[\r\n]", " "));
System.err.flush();
}
diff --git a/tika-pipes/tika-emitters/tika-emitter-solr/pom.xml
b/tika-pipes/tika-emitters/tika-emitter-solr/pom.xml
index 6d0d485..6fe2304 100644
--- a/tika-pipes/tika-emitters/tika-emitter-solr/pom.xml
+++ b/tika-pipes/tika-emitters/tika-emitter-solr/pom.xml
@@ -71,12 +71,20 @@
</exclusion>
<exclusion>
<groupId>org.eclipse.jetty</groupId>
+ <artifactId>jetty-util</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>org.eclipse.jetty</groupId>
<artifactId>jetty-http</artifactId>
</exclusion>
<exclusion>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
</exclusion>
+ <exclusion>
+ <groupId>org.apache.httpcomponents</groupId>
+ <artifactId>httpclient</artifactId>
+ </exclusion>
</exclusions>
</dependency>
<dependency>
diff --git
a/tika-pipes/tika-emitters/tika-emitter-solr/src/main/java/org/apache/tika/pipes/emitter/solr/SolrEmitter.java
b/tika-pipes/tika-emitters/tika-emitter-solr/src/main/java/org/apache/tika/pipes/emitter/solr/SolrEmitter.java
index 78eca0d..534e899 100644
---
a/tika-pipes/tika-emitters/tika-emitter-solr/src/main/java/org/apache/tika/pipes/emitter/solr/SolrEmitter.java
+++
b/tika-pipes/tika-emitters/tika-emitter-solr/src/main/java/org/apache/tika/pipes/emitter/solr/SolrEmitter.java
@@ -31,6 +31,9 @@ import org.apache.solr.client.solrj.impl.CloudSolrClient;
import org.apache.solr.client.solrj.impl.LBHttpSolrClient;
import org.apache.solr.client.solrj.request.UpdateRequest;
import org.apache.solr.common.SolrInputDocument;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
import org.apache.tika.client.HttpClientFactory;
import org.apache.tika.config.Field;
import org.apache.tika.config.Initializable;
@@ -41,26 +44,12 @@ import org.apache.tika.metadata.Metadata;
import org.apache.tika.pipes.emitter.AbstractEmitter;
import org.apache.tika.pipes.emitter.EmitData;
import org.apache.tika.pipes.emitter.TikaEmitterException;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-public class SolrEmitter extends AbstractEmitter implements Initializable {
-
- public enum AttachmentStrategy {
- SKIP,
- CONCATENATE_CONTENT,
- PARENT_CHILD,
- //anything else?
- }
- public enum UpdateStrategy {
- ADD,
- UPDATE_MUST_EXIST,
- UPDATE_MUST_NOT_EXIST,
- }
+public class SolrEmitter extends AbstractEmitter implements Initializable {
private static final Logger LOG =
LoggerFactory.getLogger(SolrEmitter.class);
-
+ private final HttpClientFactory httpClientFactory;
private AttachmentStrategy attachmentStrategy =
AttachmentStrategy.PARENT_CHILD;
private UpdateStrategy updateStrategy = UpdateStrategy.ADD;
private String solrCollection;
@@ -75,16 +64,14 @@ public class SolrEmitter extends AbstractEmitter implements
Initializable {
private int commitWithin = 1000;
private int connectionTimeout = 10000;
private int socketTimeout = 60000;
- private final HttpClientFactory httpClientFactory;
private SolrClient solrClient;
-
public SolrEmitter() throws TikaConfigException {
httpClientFactory = new HttpClientFactory();
}
@Override
- public void emit(String emitKey, List<Metadata> metadataList) throws
IOException,
- TikaEmitterException {
+ public void emit(String emitKey, List<Metadata> metadataList)
+ throws IOException, TikaEmitterException {
if (metadataList == null || metadataList.size() == 0) {
LOG.warn("metadataList is null or empty");
return;
@@ -104,8 +91,7 @@ public class SolrEmitter extends AbstractEmitter implements
Initializable {
} else if (updateStrategy == UpdateStrategy.UPDATE_MUST_NOT_EXIST) {
solrInputDocument.setField("_version_", -1);
}
- if (attachmentStrategy == AttachmentStrategy.SKIP ||
- metadataList.size() == 1) {
+ if (attachmentStrategy == AttachmentStrategy.SKIP ||
metadataList.size() == 1) {
addMetadataToSolrInputDocument(metadataList.get(0),
solrInputDocument, updateStrategy);
} else if (attachmentStrategy ==
AttachmentStrategy.CONCATENATE_CONTENT) {
//this only handles text for now, not xhtml
@@ -128,8 +114,8 @@ public class SolrEmitter extends AbstractEmitter implements
Initializable {
addMetadataToSolrInputDocument(m, childSolrInputDocument,
updateStrategy);
}
} else {
- throw new IllegalArgumentException("I don't yet support this
attachment strategy: "
- + attachmentStrategy);
+ throw new IllegalArgumentException(
+ "I don't yet support this attachment strategy: " +
attachmentStrategy);
}
docsToUpdate.add(solrInputDocument);
}
@@ -142,12 +128,14 @@ public class SolrEmitter extends AbstractEmitter
implements Initializable {
}
List<SolrInputDocument> docsToUpdate = new ArrayList<>();
for (EmitData d : batch) {
- addMetadataAsSolrInputDocuments(d.getEmitKey().getEmitKey(),
d.getMetadataList(), docsToUpdate);
+ addMetadataAsSolrInputDocuments(d.getEmitKey().getEmitKey(),
d.getMetadataList(),
+ docsToUpdate);
}
emitSolrBatch(docsToUpdate);
}
- private void emitSolrBatch(List<SolrInputDocument> docsToUpdate) throws
IOException, TikaEmitterException {
+ private void emitSolrBatch(List<SolrInputDocument> docsToUpdate)
+ throws IOException, TikaEmitterException {
if (LOG.isDebugEnabled()) {
LOG.debug("Emitting solr doc batch: {}", docsToUpdate);
}
@@ -164,7 +152,8 @@ public class SolrEmitter extends AbstractEmitter implements
Initializable {
}
}
- private void addMetadataToSolrInputDocument(Metadata metadata,
SolrInputDocument solrInputDocument,
+ private void addMetadataToSolrInputDocument(Metadata metadata,
+ SolrInputDocument
solrInputDocument,
UpdateStrategy updateStrategy)
{
for (String n : metadata.names()) {
String[] vals = metadata.getValues(n);
@@ -175,16 +164,18 @@ public class SolrEmitter extends AbstractEmitter
implements Initializable {
solrInputDocument.setField(n, vals[0]);
} else {
solrInputDocument.setField(n, new HashMap<String,
String>() {{
- put("set", vals[0]);
- }});
+ put("set", vals[0]);
+ }
+ });
}
} else if (vals.length > 1) {
if (updateStrategy == UpdateStrategy.ADD) {
solrInputDocument.setField(n, vals);
} else {
solrInputDocument.setField(n, new HashMap<String,
String[]>() {{
- put("set", vals);
- }});
+ put("set", vals);
+ }
+ });
}
}
}
@@ -220,6 +211,10 @@ public class SolrEmitter extends AbstractEmitter
implements Initializable {
this.socketTimeout = socketTimeout;
}
+ public String getContentField() {
+ return contentField;
+ }
+
/**
* This is the field _after_ metadata mappings have been applied
* that contains the "content" for each metadata object.
@@ -234,8 +229,8 @@ public class SolrEmitter extends AbstractEmitter implements
Initializable {
this.contentField = contentField;
}
- public String getContentField() {
- return contentField;
+ public int getCommitWithin() {
+ return commitWithin;
}
@Field
@@ -243,10 +238,6 @@ public class SolrEmitter extends AbstractEmitter
implements Initializable {
this.commitWithin = commitWithin;
}
- public int getCommitWithin() {
- return commitWithin;
- }
-
/**
* Specify the field in the first Metadata that should be
* used as the id field for the document.
@@ -308,21 +299,18 @@ public class SolrEmitter extends AbstractEmitter
implements Initializable {
public void initialize(Map<String, Param> params) throws
TikaConfigException {
if (solrUrls == null || solrUrls.isEmpty()) {
solrClient = new CloudSolrClient.Builder(solrZkHosts,
Optional.ofNullable(solrZkChroot))
- .withConnectionTimeout(connectionTimeout)
- .withSocketTimeout(socketTimeout)
- .withHttpClient(httpClientFactory.build())
- .build();
+
.withConnectionTimeout(connectionTimeout).withSocketTimeout(socketTimeout)
+ .withHttpClient(httpClientFactory.build()).build();
} else {
- solrClient = new LBHttpSolrClient.Builder()
- .withConnectionTimeout(connectionTimeout)
- .withSocketTimeout(socketTimeout)
- .withHttpClient(httpClientFactory.build())
- .withBaseSolrUrls(solrUrls.toArray(new String[]
{})).build();
+ solrClient = new
LBHttpSolrClient.Builder().withConnectionTimeout(connectionTimeout)
+
.withSocketTimeout(socketTimeout).withHttpClient(httpClientFactory.build())
+ .withBaseSolrUrls(solrUrls.toArray(new
String[]{})).build();
}
}
@Override
- public void checkInitialization(InitializableProblemHandler
problemHandler) throws TikaConfigException {
+ public void checkInitialization(InitializableProblemHandler problemHandler)
+ throws TikaConfigException {
mustNotBeEmpty("solrCollection", this.solrCollection);
mustNotBeEmpty("urlFieldName", this.idField);
if ((this.solrUrls == null || this.solrUrls.isEmpty()) &&
@@ -336,4 +324,13 @@ public class SolrEmitter extends AbstractEmitter
implements Initializable {
"expected either param solrUrls or param solrZkHosts, but
both were specified");
}
}
+
+ public enum AttachmentStrategy {
+ SKIP, CONCATENATE_CONTENT, PARENT_CHILD,
+ //anything else?
+ }
+
+ public enum UpdateStrategy {
+ ADD, UPDATE_MUST_EXIST, UPDATE_MUST_NOT_EXIST,
+ }
}
diff --git
a/tika-pipes/tika-emitters/tika-emitter-solr/src/test/resources/log4j.properties
b/tika-pipes/tika-emitters/tika-emitter-solr/src/test/resources/log4j.properties
index 11e5887..d17a4a1 100644
---
a/tika-pipes/tika-emitters/tika-emitter-solr/src/test/resources/log4j.properties
+++
b/tika-pipes/tika-emitters/tika-emitter-solr/src/test/resources/log4j.properties
@@ -13,10 +13,16 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-#info,debug, error,fatal ...
-log4j.rootLogger=debug,stderr
-#console
-log4j.appender.stderr=org.apache.log4j.ConsoleAppender
-log4j.appender.stderr.layout=org.apache.log4j.PatternLayout
-log4j.appender.stderr.Target=System.err
-log4j.appender.stderr.layout.ConversionPattern=%-5p %m%n
+status=info
+name=PropertiesConfig
+filters=threshold
+filter.threshold.type=ThresholdFilter
+filter.threshold.level=info
+appenders=console
+appender.console.type=Console
+appender.console.name=STDERR
+appender.console.layout.type=PatternLayout
+appender.console.layout.pattern=%-5p [%t] %d{HH:mm:ss,SSS} %c %m%n
+rootLogger.level=info
+rootLogger.appenderRefs=stderr
+rootLogger.appenderRef.stderr.ref=STDERR
diff --git a/tika-pipes/tika-httpclient-commons/pom.xml
b/tika-pipes/tika-httpclient-commons/pom.xml
index 3bd4409..80af2b8 100644
--- a/tika-pipes/tika-httpclient-commons/pom.xml
+++ b/tika-pipes/tika-httpclient-commons/pom.xml
@@ -41,6 +41,12 @@
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>${httpcomponents.version}</version>
+ <exclusions>
+ <exclusion>
+ <groupId>commons-codec</groupId>
+ <artifactId>commons-codec</artifactId>
+ </exclusion>
+ </exclusions>
</dependency>
</dependencies>
diff --git a/tika-pipes/tika-pipes-integration-tests/pom.xml
b/tika-pipes/tika-pipes-integration-tests/pom.xml
index b4c4036..caa6214 100644
--- a/tika-pipes/tika-pipes-integration-tests/pom.xml
+++ b/tika-pipes/tika-pipes-integration-tests/pom.xml
@@ -76,6 +76,16 @@
<artifactId>testcontainers</artifactId>
<version>${test.containers.version}</version>
<scope>test</scope>
+ <exclusions>
+ <exclusion>
+ <groupId>net.java.dev.jna</groupId>
+ <artifactId>jna</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>com.fasterxml.jackson.core</groupId>
+ <artifactId>jackson-annotations</artifactId>
+ </exclusion>
+ </exclusions>
</dependency>
<dependency>
<groupId>${project.groupId}</groupId>
@@ -99,6 +109,28 @@
<groupId>org.apache.solr</groupId>
<artifactId>solr-solrj</artifactId>
<version>${solrj.version}</version>
+ <exclusions>
+ <exclusion>
+ <groupId>org.eclipse.jetty</groupId>
+ <artifactId>jetty-io</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>org.eclipse.jetty</groupId>
+ <artifactId>jetty-util</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>org.eclipse.jetty</groupId>
+ <artifactId>jetty-http</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>log4j</groupId>
+ <artifactId>log4j</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>org.apache.httpcomponents</groupId>
+ <artifactId>httpclient</artifactId>
+ </exclusion>
+ </exclusions>
<scope>test</scope>
</dependency>
</dependencies>
diff --git
a/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/solrtest/TikaPipesSolr6Test.java
b/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/solrtest/TikaPipesSolr6Test.java
index 12e9ac7..5066fee 100644
---
a/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/solrtest/TikaPipesSolr6Test.java
+++
b/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/solrtest/TikaPipesSolr6Test.java
@@ -25,9 +25,9 @@ import org.testcontainers.utility.DockerImageName;
public class TikaPipesSolr6Test extends TikaPipesSolrTestBase {
@Rule
- public GenericContainer<?> solr6 = new
GenericContainer<>(DockerImageName.parse("solr:6"))
- .withExposedPorts(8983, 9983)
- .withCommand("-DzkRun");
+ public GenericContainer<?> solr6 =
+ new
GenericContainer<>(DockerImageName.parse("solr:6")).withExposedPorts(8983, 9983)
+ .withCommand("-DzkRun");
@Before
public void setupTest() throws Exception {
diff --git
a/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/solrtest/TikaPipesSolr7Test.java
b/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/solrtest/TikaPipesSolr7Test.java
index c9cf566..314659d 100644
---
a/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/solrtest/TikaPipesSolr7Test.java
+++
b/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/solrtest/TikaPipesSolr7Test.java
@@ -25,9 +25,9 @@ import org.testcontainers.utility.DockerImageName;
public class TikaPipesSolr7Test extends TikaPipesSolrTestBase {
@Rule
- public GenericContainer<?> solr7 = new
GenericContainer<>(DockerImageName.parse("solr:7"))
- .withExposedPorts(8983, 9983)
- .withCommand("-DzkRun");
+ public GenericContainer<?> solr7 =
+ new
GenericContainer<>(DockerImageName.parse("solr:7")).withExposedPorts(8983, 9983)
+ .withCommand("-DzkRun");
@Before
public void setupTest() throws Exception {
diff --git
a/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/solrtest/TikaPipesSolr8Test.java
b/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/solrtest/TikaPipesSolr8Test.java
index d1470df..29411f4 100644
---
a/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/solrtest/TikaPipesSolr8Test.java
+++
b/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/solrtest/TikaPipesSolr8Test.java
@@ -25,9 +25,9 @@ import org.testcontainers.utility.DockerImageName;
public class TikaPipesSolr8Test extends TikaPipesSolrTestBase {
@Rule
- public GenericContainer<?> solr8 = new
GenericContainer<>(DockerImageName.parse("solr:8"))
- .withExposedPorts(8983, 9983)
- .withCommand("-DzkRun");
+ public GenericContainer<?> solr8 =
+ new
GenericContainer<>(DockerImageName.parse("solr:8")).withExposedPorts(8983, 9983)
+ .withCommand("-DzkRun");
@Before
public void setupTest() throws Exception {
diff --git
a/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/solrtest/TikaPipesSolrTestBase.java
b/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/solrtest/TikaPipesSolrTestBase.java
index 12f3f77..8d80c32 100644
---
a/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/solrtest/TikaPipesSolrTestBase.java
+++
b/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/solrtest/TikaPipesSolrTestBase.java
@@ -25,22 +25,21 @@ import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.impl.LBHttpSolrClient;
import org.apache.solr.common.SolrInputDocument;
-import org.apache.tika.cli.TikaCLI;
-import org.apache.tika.pipes.PipeIntegrationTests;
-import org.apache.tika.pipes.emitter.solr.SolrEmitter;
import org.jetbrains.annotations.NotNull;
import org.junit.Assert;
import org.testcontainers.containers.GenericContainer;
import org.testcontainers.shaded.org.apache.commons.io.FileUtils;
+import org.apache.tika.cli.TikaCLI;
+import org.apache.tika.pipes.PipeIntegrationTests;
+import org.apache.tika.pipes.emitter.solr.SolrEmitter;
+
public abstract class TikaPipesSolrTestBase {
private final String collection = "testcol";
private final int numDocs = 42;
-
- protected GenericContainer<?> solr;
-
private final File testFileFolder = new File("target", "test-files");
+ protected GenericContainer<?> solr;
private String solrHost;
private int solrPort;
private int zkPort;
@@ -49,7 +48,8 @@ public abstract class TikaPipesSolrTestBase {
private void createTestHtmlFiles(String bodyContent) throws Exception {
testFileFolder.mkdirs();
for (int i = 0; i < numDocs; ++i) {
- FileUtils.writeStringToFile(new File(testFileFolder, "test-" + i +
".html"), "<html><body>" + bodyContent + "</body></html>",
StandardCharsets.UTF_8);
+ FileUtils.writeStringToFile(new File(testFileFolder, "test-" + i +
".html"),
+ "<html><body>" + bodyContent + "</body></html>",
StandardCharsets.UTF_8);
}
}
@@ -63,8 +63,8 @@ public abstract class TikaPipesSolrTestBase {
solr.execInContainer("/opt/solr/bin/solr", "create_collection", "-c",
collection);
- try (SolrClient solrClient = new LBHttpSolrClient.Builder()
- .withBaseSolrUrls(solrEndpoint).build()) {
+ try (SolrClient solrClient = new
LBHttpSolrClient.Builder().withBaseSolrUrls(solrEndpoint)
+ .build()) {
for (int i = 0; i < numDocs; ++i) {
SolrInputDocument solrDoc = new SolrInputDocument();
@@ -79,76 +79,84 @@ public abstract class TikaPipesSolrTestBase {
/**
* Runs a test using Solr Pipe Iterator, File Fetcher and Solr Emitter.
+ *
* @param useZk If true, use zookeeper to connect to solr. Otherwise use
direct solr URLs.
*/
- protected void runTikaAsyncSolrPipeIteratorFileFetcherSolrEmitter(boolean
useZk) throws Exception {
+ protected void runTikaAsyncSolrPipeIteratorFileFetcherSolrEmitter(boolean
useZk)
+ throws Exception {
File tikaConfigFile = new File("target", "ta.xml");
- File log4jPropFile = new File("target", "tmp-log4j.properties");
- try (InputStream is =
PipeIntegrationTests.class.getResourceAsStream("/tika-async-log4j.properties"))
{
+ File log4jPropFile = new File("target", "tmp-log4j2.properties");
+ try (InputStream is = PipeIntegrationTests.class
+ .getResourceAsStream("/tika-async-log4j2.properties")) {
FileUtils.copyInputStreamToFile(is, log4jPropFile);
}
String tikaConfigTemplateXml;
- try (InputStream is =
PipeIntegrationTests.class.getResourceAsStream("/tika-config-solr-urls.xml")) {
+ try (InputStream is = PipeIntegrationTests.class
+ .getResourceAsStream("/tika-config-solr-urls.xml")) {
tikaConfigTemplateXml = IOUtils.toString(is,
StandardCharsets.UTF_8);
}
- String tikaConfigXml = createTikaConfigXml(useZk,
- tikaConfigFile,
- log4jPropFile,
- tikaConfigTemplateXml,
- SolrEmitter.UpdateStrategy.ADD,
- SolrEmitter.AttachmentStrategy.CONCATENATE_CONTENT);
+ String tikaConfigXml =
+ createTikaConfigXml(useZk, tikaConfigFile, log4jPropFile,
tikaConfigTemplateXml,
+ SolrEmitter.UpdateStrategy.ADD,
+ SolrEmitter.AttachmentStrategy.CONCATENATE_CONTENT);
FileUtils.writeStringToFile(tikaConfigFile, tikaConfigXml,
StandardCharsets.UTF_8);
TikaCLI.main(new String[]{"-a", "--config=" +
tikaConfigFile.getAbsolutePath()});
- try (SolrClient solrClient = new LBHttpSolrClient.Builder()
- .withBaseSolrUrls(solrEndpoint).build()) {
+ try (SolrClient solrClient = new
LBHttpSolrClient.Builder().withBaseSolrUrls(solrEndpoint)
+ .build()) {
solrClient.commit(collection);
- Assert.assertEquals(numDocs, solrClient.query(collection, new
SolrQuery("mime_s:\"text/html;
charset=ISO-8859-1\"")).getResults().getNumFound());
- Assert.assertEquals(numDocs, solrClient.query(collection, new
SolrQuery("content_s:*initial*")).getResults().getNumFound());
+ Assert.assertEquals(numDocs, solrClient
+ .query(collection, new SolrQuery("mime_s:\"text/html;
charset=ISO-8859-1\""))
+ .getResults().getNumFound());
+ Assert.assertEquals(numDocs,
+ solrClient.query(collection, new
SolrQuery("content_s:*initial*")).getResults()
+ .getNumFound());
}
- // update the documents with "update must exist" and run tika async
again with "UPDATE_MUST_EXIST". It should not fail, and docs should be updated.
+ // update the documents with "update must exist" and run tika async
again with "UPDATE_MUST_EXIST".
+ // It should not fail, and docs should be updated.
createTestHtmlFiles("updated");
- tikaConfigXml = createTikaConfigXml(useZk,
- tikaConfigFile,
- log4jPropFile,
- tikaConfigTemplateXml,
- SolrEmitter.UpdateStrategy.UPDATE_MUST_EXIST,
- SolrEmitter.AttachmentStrategy.CONCATENATE_CONTENT);
+ tikaConfigXml =
+ createTikaConfigXml(useZk, tikaConfigFile, log4jPropFile,
tikaConfigTemplateXml,
+ SolrEmitter.UpdateStrategy.UPDATE_MUST_EXIST,
+ SolrEmitter.AttachmentStrategy.CONCATENATE_CONTENT);
FileUtils.writeStringToFile(tikaConfigFile, tikaConfigXml,
StandardCharsets.UTF_8);
TikaCLI.main(new String[]{"-a", "--config=" +
tikaConfigFile.getAbsolutePath()});
- try (SolrClient solrClient = new LBHttpSolrClient.Builder()
- .withBaseSolrUrls(solrEndpoint).build()) {
+ try (SolrClient solrClient = new
LBHttpSolrClient.Builder().withBaseSolrUrls(solrEndpoint)
+ .build()) {
solrClient.commit(collection);
- Assert.assertEquals(numDocs, solrClient.query(collection, new
SolrQuery("mime_s:\"text/html;
charset=ISO-8859-1\"")).getResults().getNumFound());
- Assert.assertEquals(numDocs, solrClient.query(collection, new
SolrQuery("content_s:*updated*")).getResults().getNumFound());
+ Assert.assertEquals(numDocs, solrClient
+ .query(collection, new SolrQuery("mime_s:\"text/html;
charset=ISO-8859-1\""))
+ .getResults().getNumFound());
+ Assert.assertEquals(numDocs,
+ solrClient.query(collection, new
SolrQuery("content_s:*updated*")).getResults()
+ .getNumFound());
}
}
@NotNull
- private String createTikaConfigXml(boolean useZk,
- File tikaConfigFile,
- File log4jPropFile,
+ private String createTikaConfigXml(boolean useZk, File tikaConfigFile,
File log4jPropFile,
String tikaConfigTemplateXml,
SolrEmitter.UpdateStrategy
updateStrategy,
SolrEmitter.AttachmentStrategy
attachmentStrategy) {
- String res = tikaConfigTemplateXml.replace("{TIKA_CONFIG}",
tikaConfigFile.getAbsolutePath())
- .replace("{UPDATE_STRATEGY}", updateStrategy.toString())
- .replace("{ATTACHMENT_STRATEGY}",
attachmentStrategy.toString())
- .replace("{LOG4J_PROPERTIES_FILE}",
log4jPropFile.getAbsolutePath())
- .replace("{PATH_TO_DOCS}", testFileFolder.getAbsolutePath());
+ String res =
+ tikaConfigTemplateXml.replace("{TIKA_CONFIG}",
tikaConfigFile.getAbsolutePath())
+ .replace("{UPDATE_STRATEGY}",
updateStrategy.toString())
+ .replace("{ATTACHMENT_STRATEGY}",
attachmentStrategy.toString())
+ .replace("{LOG4J_PROPERTIES_FILE}",
log4jPropFile.getAbsolutePath())
+ .replace("{PATH_TO_DOCS}",
testFileFolder.getAbsolutePath());
if (useZk) {
- res = res.replace("{SOLR_CONNECTION}", "<solrZkHosts>\n" +
- " <solrZkHost>" + solrHost + ":" + zkPort +
"</solrZkHost>\n" +
- " </solrZkHosts>\n");
+ res = res.replace("{SOLR_CONNECTION}",
+ "<solrZkHosts>\n" + " <solrZkHost>" + solrHost +
":" + zkPort +
+ "</solrZkHost>\n" + " </solrZkHosts>\n");
} else {
- res = res.replace("{SOLR_CONNECTION}", "<solrUrls>\n" +
- " <solrUrl>http://" + solrHost + ":" + solrPort +
"/solr</solrUrl>\n" +
- " </solrUrls>\n");
+ res = res.replace("{SOLR_CONNECTION}",
+ "<solrUrls>\n" + " <solrUrl>http://" + solrHost +
":" + solrPort +
+ "/solr</solrUrl>\n" + " </solrUrls>\n");
}
return res;
}
diff --git
a/tika-pipes/tika-pipes-integration-tests/src/test/resources/log4j.properties
b/tika-pipes/tika-pipes-integration-tests/src/test/resources/log4j2.properties
similarity index 66%
rename from
tika-pipes/tika-pipes-integration-tests/src/test/resources/log4j.properties
rename to
tika-pipes/tika-pipes-integration-tests/src/test/resources/log4j2.properties
index 2b2da1a..d17a4a1 100644
---
a/tika-pipes/tika-pipes-integration-tests/src/test/resources/log4j.properties
+++
b/tika-pipes/tika-pipes-integration-tests/src/test/resources/log4j2.properties
@@ -13,10 +13,16 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-#info,debug, error,fatal ...
-log4j.rootLogger=info,stderr
-#console
-log4j.appender.stderr=org.apache.log4j.ConsoleAppender
-log4j.appender.stderr.layout=org.apache.log4j.PatternLayout
-log4j.appender.stderr.Target=System.err
-log4j.appender.stderr.layout.ConversionPattern=%-5p [%t]: %m%n
+status=info
+name=PropertiesConfig
+filters=threshold
+filter.threshold.type=ThresholdFilter
+filter.threshold.level=info
+appenders=console
+appender.console.type=Console
+appender.console.name=STDERR
+appender.console.layout.type=PatternLayout
+appender.console.layout.pattern=%-5p [%t] %d{HH:mm:ss,SSS} %c %m%n
+rootLogger.level=info
+rootLogger.appenderRefs=stderr
+rootLogger.appenderRef.stderr.ref=STDERR
diff --git
a/tika-pipes/tika-pipes-integration-tests/src/test/resources/tika-async-log4j.properties
b/tika-pipes/tika-pipes-integration-tests/src/test/resources/tika-async-log4j.properties
deleted file mode 100644
index c7c6821..0000000
---
a/tika-pipes/tika-pipes-integration-tests/src/test/resources/tika-async-log4j.properties
+++ /dev/null
@@ -1,13 +0,0 @@
-status=debug
-name=PropertiesConfig
-filters=threshold
-filter.threshold.type=ThresholdFilter
-filter.threshold.level=debug
-appenders=console
-appender.console.type=Console
-appender.console.name=STDERR
-appender.console.layout.type=PatternLayout
-appender.console.layout.pattern=%-5p [%t] %d{HH:mm:ss,SSS} %c %m%n
-rootLogger.level=debug
-rootLogger.appenderRefs=stderr
-rootLogger.appenderRef.stderr.ref=STDERR
diff --git
a/tika-pipes/tika-emitters/tika-emitter-solr/src/test/resources/log4j.properties
b/tika-pipes/tika-pipes-integration-tests/src/test/resources/tika-async-log4j2.properties
similarity index 66%
copy from
tika-pipes/tika-emitters/tika-emitter-solr/src/test/resources/log4j.properties
copy to
tika-pipes/tika-pipes-integration-tests/src/test/resources/tika-async-log4j2.properties
index 11e5887..bc6f2fd 100644
---
a/tika-pipes/tika-emitters/tika-emitter-solr/src/test/resources/log4j.properties
+++
b/tika-pipes/tika-pipes-integration-tests/src/test/resources/tika-async-log4j2.properties
@@ -13,10 +13,17 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-#info,debug, error,fatal ...
-log4j.rootLogger=debug,stderr
-#console
-log4j.appender.stderr=org.apache.log4j.ConsoleAppender
-log4j.appender.stderr.layout=org.apache.log4j.PatternLayout
-log4j.appender.stderr.Target=System.err
-log4j.appender.stderr.layout.ConversionPattern=%-5p %m%n
+
+status=debug
+name=PropertiesConfig
+filters=threshold
+filter.threshold.type=ThresholdFilter
+filter.threshold.level=debug
+appenders=console
+appender.console.type=Console
+appender.console.name=STDERR
+appender.console.layout.type=PatternLayout
+appender.console.layout.pattern=%-5p [%t] %d{HH:mm:ss,SSS} %c %m%n
+rootLogger.level=debug
+rootLogger.appenderRefs=stderr
+rootLogger.appenderRef.stderr.ref=STDERR
diff --git a/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-solr/pom.xml
b/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-solr/pom.xml
index cb80a34..347aa26 100644
--- a/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-solr/pom.xml
+++ b/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-solr/pom.xml
@@ -41,29 +41,6 @@
<scope>provided</scope>
</dependency>
<dependency>
- <groupId>com.amazonaws</groupId>
- <artifactId>aws-java-sdk-s3</artifactId>
- <version>${aws.version}</version>
- <exclusions>
- <exclusion>
- <groupId>commons-logging</groupId>
- <artifactId>commons-logging</artifactId>
- </exclusion>
- <exclusion>
- <groupId>com.fasterxml.jackson.core</groupId>
- <artifactId>jackson-core</artifactId>
- </exclusion>
- <exclusion>
- <groupId>com.fasterxml.jackson.core</groupId>
- <artifactId>jackson-databind</artifactId>
- </exclusion>
- <exclusion>
- <groupId>commons-codec</groupId>
- <artifactId>commons-codec</artifactId>
- </exclusion>
- </exclusions>
- </dependency>
- <dependency>
<groupId>commons-codec</groupId>
<artifactId>commons-codec</artifactId>
<version>${commons.codec.version}</version>
@@ -79,14 +56,31 @@
<version>${commons.logging.version}</version>
</dependency>
<dependency>
- <groupId>org.apache.logging.log4j</groupId>
- <artifactId>log4j-slf4j-impl</artifactId>
- <version>${log4j2.version}</version>
- </dependency>
- <dependency>
<groupId>org.apache.solr</groupId>
<artifactId>solr-solrj</artifactId>
<version>${solrj.version}</version>
+ <exclusions>
+ <exclusion>
+ <groupId>org.eclipse.jetty</groupId>
+ <artifactId>jetty-io</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>org.eclipse.jetty</groupId>
+ <artifactId>jetty-util</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>org.eclipse.jetty</groupId>
+ <artifactId>jetty-http</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>log4j</groupId>
+ <artifactId>log4j</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>org.apache.httpcomponents</groupId>
+ <artifactId>httpclient</artifactId>
+ </exclusion>
+ </exclusions>
</dependency>
<dependency>
<groupId>${project.groupId}</groupId>
diff --git
a/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-solr/src/main/java/org/apache/tika/pipes/solrtest/SolrPipesIterator.java
b/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-solr/src/main/java/org/apache/tika/pipes/solrtest/SolrPipesIterator.java
index 78d6a49..85b070c 100644
---
a/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-solr/src/main/java/org/apache/tika/pipes/solrtest/SolrPipesIterator.java
+++
b/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-solr/src/main/java/org/apache/tika/pipes/solrtest/SolrPipesIterator.java
@@ -34,6 +34,9 @@ import org.apache.solr.client.solrj.impl.LBHttpSolrClient;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.params.CursorMarkParams;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
import org.apache.tika.client.HttpClientFactory;
import org.apache.tika.config.Field;
import org.apache.tika.config.Initializable;
@@ -45,8 +48,6 @@ import org.apache.tika.pipes.HandlerConfig;
import org.apache.tika.pipes.emitter.EmitKey;
import org.apache.tika.pipes.fetcher.FetchKey;
import org.apache.tika.pipes.pipesiterator.PipesIterator;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
/**
* Iterates through results from a Solr query.
@@ -54,7 +55,7 @@ import org.slf4j.LoggerFactory;
public class SolrPipesIterator extends PipesIterator implements Initializable {
private static final Logger LOGGER =
LoggerFactory.getLogger(SolrPipesIterator.class);
-
+ private final HttpClientFactory httpClientFactory;
private String solrCollection;
/**
* You can specify solrUrls, or you can specify solrZkHosts and use
zookeeper to determine the solr server urls.
@@ -72,8 +73,6 @@ public class SolrPipesIterator extends PipesIterator
implements Initializable {
private int connectionTimeout = 10000;
private int socketTimeout = 60000;
- private final HttpClientFactory httpClientFactory;
-
public SolrPipesIterator() throws TikaConfigException {
httpClientFactory = new HttpClientFactory();
}
@@ -203,7 +202,9 @@ public class SolrPipesIterator extends PipesIterator
implements Initializable {
QueryResponse qr = solrClient.query(solrCollection, query);
long totalToFetch = qr.getResults().getNumFound();
String nextCursorMark = qr.getNextCursorMark();
- LOGGER.info("Query to fetch files to parse collection={},
q={}, onCount={}, totalCount={}", solrCollection, query, fileCount,
totalToFetch);
+ LOGGER.info(
+ "Query to fetch files to parse collection={}, q={},
onCount={}, totalCount={}",
+ solrCollection, query, fileCount, totalToFetch);
for (SolrDocument sd : qr.getResults()) {
++fileCount;
String fetchKey = (String) sd.getFieldValue(idField);
@@ -213,11 +214,8 @@ public class SolrPipesIterator extends PipesIterator
implements Initializable {
metadata.add(nextField, (String)
sd.getFieldValue(nextField));
}
LOGGER.info("iterator doc: {}, idField={}, fetchKey={}",
sd, idField, fetchKey);
- tryToAdd(new FetchEmitTuple(fetchKey,
- new FetchKey(fetcherName, fetchKey),
- new EmitKey(emitterName, emitKey),
- new Metadata(),
- handlerConfig,
+ tryToAdd(new FetchEmitTuple(fetchKey, new
FetchKey(fetcherName, fetchKey),
+ new EmitKey(emitterName, emitKey), new Metadata(),
handlerConfig,
getOnParseException()));
}
if (cursorMark.equals(nextCursorMark)) {
@@ -234,14 +232,11 @@ public class SolrPipesIterator extends PipesIterator
implements Initializable {
if (solrUrls == null || solrUrls.isEmpty()) {
return new CloudSolrClient.Builder(solrZkHosts,
Optional.ofNullable(solrZkChroot))
.withHttpClient(httpClientFactory.build())
- .withConnectionTimeout(connectionTimeout)
- .withSocketTimeout(socketTimeout)
+
.withConnectionTimeout(connectionTimeout).withSocketTimeout(socketTimeout)
.build();
}
- return new LBHttpSolrClient.Builder()
- .withConnectionTimeout(connectionTimeout)
- .withSocketTimeout(socketTimeout)
- .withHttpClient(httpClientFactory.build())
+ return new
LBHttpSolrClient.Builder().withConnectionTimeout(connectionTimeout)
+
.withSocketTimeout(socketTimeout).withHttpClient(httpClientFactory.build())
.withBaseSolrUrls(solrUrls.toArray(new String[]{})).build();
}
@@ -254,11 +249,15 @@ public class SolrPipesIterator extends PipesIterator
implements Initializable {
mustNotBeEmpty("parsingIdField", this.parsingIdField);
mustNotBeEmpty("failCountField", this.failCountField);
mustNotBeEmpty("sizeFieldName", this.sizeFieldName);
- if ((this.solrUrls == null || this.solrUrls.isEmpty()) &&
(this.solrZkHosts == null || this.solrZkHosts.isEmpty())) {
- throw new IllegalArgumentException("expected either param solrUrls
or param solrZkHosts, but neither was specified");
+ if ((this.solrUrls == null || this.solrUrls.isEmpty()) &&
+ (this.solrZkHosts == null || this.solrZkHosts.isEmpty())) {
+ throw new IllegalArgumentException(
+ "expected either param solrUrls or param solrZkHosts, but
neither was specified");
}
- if (this.solrUrls != null && !this.solrUrls.isEmpty() &&
this.solrZkHosts != null && !this.solrZkHosts.isEmpty()) {
- throw new IllegalArgumentException("expected either param solrUrls
or param solrZkHosts, but both were specified");
+ if (this.solrUrls != null && !this.solrUrls.isEmpty() &&
this.solrZkHosts != null &&
+ !this.solrZkHosts.isEmpty()) {
+ throw new IllegalArgumentException(
+ "expected either param solrUrls or param solrZkHosts, but
both were specified");
}
}
}