This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/master by this push:
new e83b999 TIKA-2824 -- general upgrades
e83b999 is described below
commit e83b999e0ab77971489356cca745746dd60aa55d
Author: TALLISON <[email protected]>
AuthorDate: Mon Apr 22 15:21:33 2019 -0400
TIKA-2824 -- general upgrades
---
CHANGES.txt | 4 +-
.../eval/tools/SlowCompositeReaderWrapper.java | 77 +++++++++++++++-------
.../java/org/apache/tika/example/RecentFiles.java | 2 +-
tika-langdetect/pom.xml | 2 +-
tika-parent/pom.xml | 11 ++--
tika-parsers/pom.xml | 30 +++++----
.../org/apache/tika/parser/sas/SAS7BDATParser.java | 10 ++-
7 files changed, 89 insertions(+), 47 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index af8fcd0..e56a0ad 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -24,7 +24,9 @@ Release 1.21 - ????
* Add a CSVParser. CSV detection is currently based solely on filename
and/or information conveyed via Metadata (TIKA-2826).
- * General upgrades: h2, jackcess, opennlp, httpcomponents, zstd-jni, cxf,
Lucene (TIKA-2824)
+ * General upgrades: asm, bouncycastle, commons-codec, commons-lang3, cxf,
+   guava, h2, httpcomponents, jackcess, junrar, Lucene, mime4j, opennlp, parso,
+   sqlite-jdbc (provided), zstd-jni (provided) (TIKA-2824)
* Bundle xerces2 with tika-parsers (TIKA-2802).
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/tools/SlowCompositeReaderWrapper.java b/tika-eval/src/main/java/org/apache/tika/eval/tools/SlowCompositeReaderWrapper.java
index 299f90f..0ddbea4 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/tools/SlowCompositeReaderWrapper.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/tools/SlowCompositeReaderWrapper.java
@@ -16,29 +16,52 @@
*/
package org.apache.tika.eval.tools;
-
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
import org.apache.lucene.index.*;
import org.apache.lucene.index.MultiDocValues.MultiSortedDocValues;
-import org.apache.lucene.index.OrdinalMap;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.Version;
-
/**
- * copied verbatim from Solr
+ * <b>COPIED VERBATIM FROM LUCENE</b>
+ * This class forces a composite reader (eg a {@link
+ * MultiReader} or {@link DirectoryReader}) to emulate a
+ * {@link LeafReader}. This requires implementing the postings
+ * APIs on-the-fly, using the static methods in {@link
+ * MultiTerms}, {@link MultiDocValues}, by stepping through
+ * the sub-readers to merge fields/terms, appending docs, etc.
+ *
+ * <p><b>NOTE</b>: this class almost always results in a
+ * performance hit. If this is important to your use case,
+ * you'll get better performance by gathering the sub readers using
+ * {@link IndexReader#getContext()} to get the
+ * leaves and then operate per-LeafReader,
+ * instead of using this class.
*/
-final class SlowCompositeReaderWrapper extends LeafReader {
+
+public final class SlowCompositeReaderWrapper extends LeafReader {
private final CompositeReader in;
- private final Fields fields;
private final LeafMetaData metaData;
+ // Cached copy of FieldInfos to prevent it from being re-created on each
+ // getFieldInfos call. Most (if not all) other LeafReader implementations
+ // also have a cached FieldInfos instance so this is consistent. SOLR-12878
+ private final FieldInfos fieldInfos;
+
+ final Map<String,Terms> cachedTerms = new ConcurrentHashMap<>();
+
+ // TODO: consider ConcurrentHashMap ?
+ // TODO: this could really be a weak map somewhere else on the coreCacheKey,
+ // but do we really need to optimize slow-wrapper any more?
+ final Map<String,OrdinalMap> cachedOrdMaps = new HashMap<>();
+
/** This method is sugar for getting an {@link LeafReader} from
- * an IndexReader of any kind. If the reader is already atomic,
+ * an {@link IndexReader} of any kind. If the reader is already atomic,
* it is returned unchanged, otherwise wrapped by this class.
*/
public static LeafReader wrap(IndexReader reader) throws IOException {
@@ -51,9 +74,7 @@ final class SlowCompositeReaderWrapper extends LeafReader {
}
SlowCompositeReaderWrapper(CompositeReader reader) throws IOException {
- super();
in = reader;
- fields = MultiFields.getFields(in);
in.registerParentReader(this);
if (reader.leaves().isEmpty()) {
metaData = new LeafMetaData(Version.LATEST.major, Version.LATEST, null);
@@ -70,6 +91,7 @@ final class SlowCompositeReaderWrapper extends LeafReader {
}
metaData = new LeafMetaData(reader.leaves().get(0).reader().getMetaData().getCreatedVersionMajor(), minVersion, null);
}
+ fieldInfos = FieldInfos.getMergedFieldInfos(in);
}
@Override
@@ -93,25 +115,38 @@ final class SlowCompositeReaderWrapper extends LeafReader {
@Override
public Terms terms(String field) throws IOException {
ensureOpen();
- return fields.terms(field);
+ try {
+ return cachedTerms.computeIfAbsent(field, f -> {
+ try {
+ return MultiTerms.getTerms(in, f);
+ } catch (IOException e) { // yuck! ...sigh... checked exceptions with built-in lambdas are a pain
+ throw new RuntimeException("unwrapMe", e);
+ }
+ });
+ } catch (RuntimeException e) {
+ if (e.getMessage().equals("unwrapMe") && e.getCause() instanceof IOException) {
+ throw (IOException) e.getCause();
+ }
+ throw e;
+ }
}
@Override
public NumericDocValues getNumericDocValues(String field) throws IOException {
ensureOpen();
- return MultiDocValues.getNumericValues(in, field);
+ return MultiDocValues.getNumericValues(in, field); // TODO cache?
}
@Override
public BinaryDocValues getBinaryDocValues(String field) throws IOException {
ensureOpen();
- return MultiDocValues.getBinaryValues(in, field);
+ return MultiDocValues.getBinaryValues(in, field); // TODO cache?
}
@Override
public SortedNumericDocValues getSortedNumericDocValues(String field) throws IOException {
ensureOpen();
- return MultiDocValues.getSortedNumericValues(in, field);
+ return MultiDocValues.getSortedNumericValues(in, field); // TODO cache?
}
@Override
@@ -200,14 +235,10 @@ final class SlowCompositeReaderWrapper extends LeafReader {
return new MultiDocValues.MultiSortedSetDocValues(values, starts, map, cost);
}
- // TODO: this could really be a weak map somewhere else on the coreCacheKey,
- // but do we really need to optimize slow-wrapper any more?
- final Map<String,OrdinalMap> cachedOrdMaps = new HashMap<>();
-
@Override
public NumericDocValues getNormValues(String field) throws IOException {
ensureOpen();
- return MultiDocValues.getNormValues(in, field);
+ return MultiDocValues.getNormValues(in, field); // TODO cache?
}
@Override
@@ -237,19 +268,18 @@ final class SlowCompositeReaderWrapper extends LeafReader {
@Override
public Bits getLiveDocs() {
ensureOpen();
- return MultiFields.getLiveDocs(in);
+ return MultiBits.getLiveDocs(in); // TODO cache?
}
@Override
public PointValues getPointValues(String field) {
ensureOpen();
- return null;
+ return null; // because not supported. Throw UOE?
}
@Override
public FieldInfos getFieldInfos() {
- ensureOpen();
- return MultiFields.getMergedFieldInfos(in);
+ return fieldInfos;
}
@Override
@@ -270,5 +300,4 @@ final class SlowCompositeReaderWrapper extends LeafReader {
public LeafMetaData getMetaData() {
return metaData;
}
-}
-
+}
\ No newline at end of file
diff --git a/tika-example/src/main/java/org/apache/tika/example/RecentFiles.java b/tika-example/src/main/java/org/apache/tika/example/RecentFiles.java
index f3a10f9..5a8b765 100755
--- a/tika-example/src/main/java/org/apache/tika/example/RecentFiles.java
+++ b/tika-example/src/main/java/org/apache/tika/example/RecentFiles.java
@@ -68,7 +68,7 @@ public class RecentFiles {
TikaCoreProperties.CREATED.getName(),
new BytesRef(fiveMinsAgo), new BytesRef(nowDateTime),
true, true);
- TopScoreDocCollector collector = TopScoreDocCollector.create(20);
+ TopScoreDocCollector collector = TopScoreDocCollector.create(20,10000);
searcher.search(query, collector);
ScoreDoc[] hits = collector.topDocs().scoreDocs;
for (int i = 0; i < hits.length; i++) {
diff --git a/tika-langdetect/pom.xml b/tika-langdetect/pom.xml
index 3946ff9..f5b31bb 100644
--- a/tika-langdetect/pom.xml
+++ b/tika-langdetect/pom.xml
@@ -77,7 +77,7 @@
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
- <version>27.0.1-jre</version>
+ <version>${guava.version}</version>
</dependency>
<dependency>
<groupId>org.apache.cxf</groupId>
diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml
index 2f43fe9..9031dc1 100644
--- a/tika-parent/pom.xml
+++ b/tika-parent/pom.xml
@@ -269,7 +269,7 @@
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
- <version>4.12</version>
+ <version>4.13-beta-2</version>
<scope>test</scope>
</dependency>
<dependency>
@@ -321,19 +321,20 @@
<poi.version>4.0.1</poi.version>
<commons.compress.version>1.18</commons.compress.version>
<commons.io.version>2.6</commons.io.version>
- <commons.lang3.version>3.8.1</commons.lang3.version>
+ <commons.lang3.version>3.9</commons.lang3.version>
<gson.version>2.8.5</gson.version>
+ <guava.version>27.1-jre</guava.version>
<osgi.core.version>6.0.0</osgi.core.version>
- <cxf.version>3.3.0</cxf.version>
+ <cxf.version>3.3.1</cxf.version>
<slf4j.version>1.7.25</slf4j.version>
<jackson.version>2.9.8</jackson.version>
<!-- when this is next upgraded, see if we can get rid of javax.activation dependency in tika-server -->
<jaxb.version>2.3.2</jaxb.version>
<cli.version>1.4</cli.version>
- <lucene.version>7.7.0</lucene.version>
- <mockito.version>2.23.4</mockito.version>
+ <lucene.version>8.0.0</lucene.version>
+ <mockito.version>2.27.0</mockito.version>
</properties>
<build>
diff --git a/tika-parsers/pom.xml b/tika-parsers/pom.xml
index a50a02b..c81ea66 100644
--- a/tika-parsers/pom.xml
+++ b/tika-parsers/pom.xml
@@ -36,22 +36,22 @@
<properties>
<!-- NOTE: sync codec version with POI -->
- <codec.version>1.11</codec.version>
+ <codec.version>1.12</codec.version>
<!-- NOTE: sync tukaani version with commons-compress in tika-parent-->
<tukaani.version>1.8</tukaani.version>
<!-- NOTE: sync brotli version with commons-compress in tika-parent-->
<brotli.version>0.1.2</brotli.version>
- <mime4j.version>0.8.2</mime4j.version>
+ <mime4j.version>0.8.3</mime4j.version>
<vorbis.version>0.8</vorbis.version>
<pdfbox.version>2.0.15</pdfbox.version>
<jempbox.version>1.8.16</jempbox.version>
<netcdf-java.version>4.5.5</netcdf-java.version>
<sis.version>0.8</sis.version>
- <parso.version>2.0.10</parso.version>
+ <parso.version>2.0.11</parso.version>
<!-- used by POI, PDFBox and Jackcess ...try to sync -->
- <bouncycastle.version>1.60</bouncycastle.version>
+ <bouncycastle.version>1.61</bouncycastle.version>
<commonsexec.version>1.3</commonsexec.version>
- <httpcomponents.version>4.5.7</httpcomponents.version>
+ <httpcomponents.version>4.5.8</httpcomponents.version>
</properties>
<dependencies>
@@ -173,7 +173,7 @@
<dependency>
<groupId>com.github.luben</groupId>
<artifactId>zstd-jni</artifactId>
- <version>1.3.8-3</version>
+ <version>1.4.0-1</version>
<scope>provided</scope>
</dependency>
@@ -260,9 +260,13 @@
<dependency>
<groupId>com.healthmarketscience.jackcess</groupId>
<artifactId>jackcess</artifactId>
- <version>3.0.0</version>
+ <version>3.0.1</version>
<exclusions>
<exclusion>
+ <groupId>org.apache.commons</groupId>
+ <artifactId>commons-lang3</artifactId>
+ </exclusion>
+ <exclusion>
<groupId>commons-logging</groupId>
<artifactId>commons-logging</artifactId>
</exclusion>
@@ -293,7 +297,7 @@
<dependency>
<groupId>org.ow2.asm</groupId>
<artifactId>asm</artifactId>
- <version>7.0</version>
+ <version>7.1</version>
</dependency>
<dependency>
<groupId>com.googlecode.mp4parser</groupId>
@@ -352,7 +356,7 @@
<dependency>
<groupId>com.github.junrar</groupId>
<artifactId>junrar</artifactId>
- <version>2.0.0</version>
+ <version>4.0.0</version>
<exclusions>
<exclusion>
<groupId>commons-logging</groupId>
@@ -393,7 +397,7 @@
<dependency>
<groupId>org.xerial</groupId>
<artifactId>sqlite-jdbc</artifactId>
- <version>3.25.2</version>
+ <version>3.27.2.1</version>
<scope>provided</scope>
</dependency>
@@ -503,7 +507,7 @@
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
- <version>27.0.1-jre</version>
+ <version>${guava.version}</version>
</dependency>
<dependency>
<groupId>edu.ucar</groupId>
@@ -533,7 +537,7 @@
<dependency>
<groupId>net.java.dev.jna</groupId>
<artifactId>jna</artifactId>
- <version>5.1.0</version>
+ <version>5.2.0</version>
</dependency>
<!-- grib's current jsoup is vulnerable to xss
@@ -546,7 +550,7 @@
<dependency>
<groupId>com.google.protobuf</groupId>
<artifactId>protobuf-java</artifactId>
- <version>3.6.1</version>
+ <version>3.7.1</version>
</dependency>
<dependency>
<groupId>edu.ucar</groupId>
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/sas/SAS7BDATParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/sas/SAS7BDATParser.java
index 8b28644..f6b7efc 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/sas/SAS7BDATParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/sas/SAS7BDATParser.java
@@ -18,7 +18,10 @@ package org.apache.tika.parser.sas;
import java.io.IOException;
import java.io.InputStream;
+import java.text.Format;
import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
import java.util.Set;
import org.apache.tika.exception.TikaException;
@@ -128,12 +131,15 @@ public class SAS7BDATParser extends AbstractParser {
}
xhtml.endElement("tr");
xhtml.newline();
-
+
+ //TODO: initialize this on the first row and then apply
+ Map<Integer, Format> formatMap = new HashMap<>();
+
// Process each row in turn
Object[] row = null;
while ((row = sas.readNext()) != null) {
xhtml.startElement("tr");
-            for (String val : DataWriterUtil.getRowValues(sas.getColumns(), row)) {
+            for (String val : DataWriterUtil.getRowValues(sas.getColumns(), row, formatMap)) {
// Use explicit start/end, rather than element, to
// ensure that empty cells still get output
xhtml.startElement("td");