This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/master by this push:
new e83b999 TIKA-2824 -- general upgrades
e83b999 is described below
commit e83b999e0ab77971489356cca745746dd60aa55d
Author: TALLISON <[email protected]>
AuthorDate: Mon Apr 22 15:21:33 2019 -0400
TIKA-2824 -- general upgrades
---
CHANGES.txt | 4 +-
.../eval/tools/SlowCompositeReaderWrapper.java | 77 +++++++++++++++-------
.../java/org/apache/tika/example/RecentFiles.java | 2 +-
tika-langdetect/pom.xml | 2 +-
tika-parent/pom.xml | 11 ++--
tika-parsers/pom.xml | 30 +++++----
.../org/apache/tika/parser/sas/SAS7BDATParser.java | 10 ++-
7 files changed, 89 insertions(+), 47 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index af8fcd0..e56a0ad 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -24,7 +24,9 @@ Release 1.21 - ????
* Add a CSVParser. CSV detection is currently based solely on filename
and/or information conveyed via Metadata (TIKA-2826).
- * General upgrades: h2, jackcess, opennlp, httpcomponents, zstd-jni, cxf,
Lucene (TIKA-2824)
+ * General upgrades: asm, bouncycastle, commons-codec, commons-lang3, cxf,
+   guava, h2, httpcomponents, jackcess, junrar, Lucene, mime4j, opennlp, parso,
+   sqlite-jdbc (provided), zstd-jni (provided) (TIKA-2824)
* Bundle xerces2 with tika-parsers (TIKA-2802).
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/tools/SlowCompositeReaderWrapper.java b/tika-eval/src/main/java/org/apache/tika/eval/tools/SlowCompositeReaderWrapper.java
index 299f90f..0ddbea4 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/tools/SlowCompositeReaderWrapper.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/tools/SlowCompositeReaderWrapper.java
@@ -16,29 +16,52 @@
*/
package org.apache.tika.eval.tools;
-
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
import org.apache.lucene.index.*;
import org.apache.lucene.index.MultiDocValues.MultiSortedDocValues;
-import org.apache.lucene.index.OrdinalMap;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.Version;
-
/**
- * copied verbatim from Solr
+ * <b>COPIED VERBATIM FROM LUCENE</b>
+ * This class forces a composite reader (eg a {@link
+ * MultiReader} or {@link DirectoryReader}) to emulate a
+ * {@link LeafReader}. This requires implementing the postings
+ * APIs on-the-fly, using the static methods in {@link
+ * MultiTerms}, {@link MultiDocValues}, by stepping through
+ * the sub-readers to merge fields/terms, appending docs, etc.
+ *
+ * <p><b>NOTE</b>: this class almost always results in a
+ * performance hit. If this is important to your use case,
+ * you'll get better performance by gathering the sub readers using
+ * {@link IndexReader#getContext()} to get the
+ * leaves and then operate per-LeafReader,
+ * instead of using this class.
*/
-final class SlowCompositeReaderWrapper extends LeafReader {
+
+public final class SlowCompositeReaderWrapper extends LeafReader {
private final CompositeReader in;
- private final Fields fields;
private final LeafMetaData metaData;
+ // Cached copy of FieldInfos to prevent it from being re-created on each
+ // getFieldInfos call. Most (if not all) other LeafReader implementations
+ // also have a cached FieldInfos instance so this is consistent. SOLR-12878
+ private final FieldInfos fieldInfos;
+
+ final Map<String,Terms> cachedTerms = new ConcurrentHashMap<>();
+
+ // TODO: consider ConcurrentHashMap ?
+ // TODO: this could really be a weak map somewhere else on the coreCacheKey,
+ // but do we really need to optimize slow-wrapper any more?
+ final Map<String,OrdinalMap> cachedOrdMaps = new HashMap<>();
+
/** This method is sugar for getting an {@link LeafReader} from
- * an IndexReader of any kind. If the reader is already atomic,
+ * an {@link IndexReader} of any kind. If the reader is already atomic,
* it is returned unchanged, otherwise wrapped by this class.
*/
public static LeafReader wrap(IndexReader reader) throws IOException {
@@ -51,9 +74,7 @@ final class SlowCompositeReaderWrapper extends LeafReader {
}
SlowCompositeReaderWrapper(CompositeReader reader) throws IOException {
- super();
in = reader;
- fields = MultiFields.getFields(in);
in.registerParentReader(this);
if (reader.leaves().isEmpty()) {
metaData = new LeafMetaData(Version.LATEST.major, Version.LATEST, null);
@@ -70,6 +91,7 @@ final class SlowCompositeReaderWrapper extends LeafReader {
}
metaData = new LeafMetaData(reader.leaves().get(0).reader().getMetaData().getCreatedVersionMajor(), minVersion, null);
}
+ fieldInfos = FieldInfos.getMergedFieldInfos(in);
}
@Override
@@ -93,25 +115,38 @@ final class SlowCompositeReaderWrapper extends LeafReader {
@Override
public Terms terms(String field) throws IOException {
ensureOpen();
- return fields.terms(field);
+ try {
+ return cachedTerms.computeIfAbsent(field, f -> {
+ try {
+ return MultiTerms.getTerms(in, f);
+ } catch (IOException e) { // yuck! ...sigh... checked exceptions with built-in lambdas are a pain
+ throw new RuntimeException("unwrapMe", e);
+ }
+ });
+ } catch (RuntimeException e) {
+ if (e.getMessage().equals("unwrapMe") && e.getCause() instanceof IOException) {
+ throw (IOException) e.getCause();
+ }
+ throw e;
+ }
}
@Override
public NumericDocValues getNumericDocValues(String field) throws IOException {
ensureOpen();
- return MultiDocValues.getNumericValues(in, field);
+ return MultiDocValues.getNumericValues(in, field); // TODO cache?
}
@Override
public BinaryDocValues getBinaryDocValues(String field) throws IOException {
ensureOpen();
- return MultiDocValues.getBinaryValues(in, field);
+ return MultiDocValues.getBinaryValues(in, field); // TODO cache?
}
@Override
public SortedNumericDocValues getSortedNumericDocValues(String field) throws IOException {
ensureOpen();
- return MultiDocValues.getSortedNumericValues(in, field);
+ return MultiDocValues.getSortedNumericValues(in, field); // TODO cache?
}
@Override
@@ -200,14 +235,10 @@ final class SlowCompositeReaderWrapper extends LeafReader {
return new MultiDocValues.MultiSortedSetDocValues(values, starts, map, cost);
}
- // TODO: this could really be a weak map somewhere else on the coreCacheKey,
- // but do we really need to optimize slow-wrapper any more?
- final Map<String,OrdinalMap> cachedOrdMaps = new HashMap<>();
-
@Override
public NumericDocValues getNormValues(String field) throws IOException {
ensureOpen();
- return MultiDocValues.getNormValues(in, field);
+ return MultiDocValues.getNormValues(in, field); // TODO cache?
}
@Override
@@ -237,19 +268,18 @@ final class SlowCompositeReaderWrapper extends LeafReader {
@Override
public Bits getLiveDocs() {
ensureOpen();
- return MultiFields.getLiveDocs(in);
+ return MultiBits.getLiveDocs(in); // TODO cache?
}
@Override
public PointValues getPointValues(String field) {
ensureOpen();
- return null;
+ return null; // because not supported. Throw UOE?
}
@Override
public FieldInfos getFieldInfos() {
- ensureOpen();
- return MultiFields.getMergedFieldInfos(in);
+ return fieldInfos;
}
@Override
@@ -270,5 +300,4 @@ final class SlowCompositeReaderWrapper extends LeafReader {
public LeafMetaData getMetaData() {
return metaData;
}
-}
-
+}
\ No newline at end of file
diff --git a/tika-example/src/main/java/org/apache/tika/example/RecentFiles.java b/tika-example/src/main/java/org/apache/tika/example/RecentFiles.java
index f3a10f9..5a8b765 100755
--- a/tika-example/src/main/java/org/apache/tika/example/RecentFiles.java
+++ b/tika-example/src/main/java/org/apache/tika/example/RecentFiles.java
@@ -68,7 +68,7 @@ public class RecentFiles {
TikaCoreProperties.CREATED.getName(),
new BytesRef(fiveMinsAgo), new BytesRef(nowDateTime),
true, true);
- TopScoreDocCollector collector = TopScoreDocCollector.create(20);
+ TopScoreDocCollector collector = TopScoreDocCollector.create(20,10000);
searcher.search(query, collector);
ScoreDoc[] hits = collector.topDocs().scoreDocs;
for (int i = 0; i < hits.length; i++) {
diff --git a/tika-langdetect/pom.xml b/tika-langdetect/pom.xml
index 3946ff9..f5b31bb 100644
--- a/tika-langdetect/pom.xml
+++ b/tika-langdetect/pom.xml
@@ -77,7 +77,7 @@
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
- <version>27.0.1-jre</version>
+ <version>${guava.version}</version>
</dependency>
<dependency>
<groupId>org.apache.cxf</groupId>
diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml
index 2f43fe9..9031dc1 100644
--- a/tika-parent/pom.xml
+++ b/tika-parent/pom.xml
@@ -269,7 +269,7 @@
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
- <version>4.12</version>
+ <version>4.13-beta-2</version>
<scope>test</scope>
</dependency>
<dependency>
@@ -321,19 +321,20 @@
<poi.version>4.0.1</poi.version>
<commons.compress.version>1.18</commons.compress.version>
<commons.io.version>2.6</commons.io.version>
- <commons.lang3.version>3.8.1</commons.lang3.version>
+ <commons.lang3.version>3.9</commons.lang3.version>
<gson.version>2.8.5</gson.version>
+ <guava.version>27.1-jre</guava.version>
<osgi.core.version>6.0.0</osgi.core.version>
- <cxf.version>3.3.0</cxf.version>
+ <cxf.version>3.3.1</cxf.version>
<slf4j.version>1.7.25</slf4j.version>
<jackson.version>2.9.8</jackson.version>
<!-- when this is next upgraded, see if we can get rid of javax.activation dependency in tika-server -->
<jaxb.version>2.3.2</jaxb.version>
<cli.version>1.4</cli.version>
- <lucene.version>7.7.0</lucene.version>
- <mockito.version>2.23.4</mockito.version>
+ <lucene.version>8.0.0</lucene.version>
+ <mockito.version>2.27.0</mockito.version>
</properties>
<build>
diff --git a/tika-parsers/pom.xml b/tika-parsers/pom.xml
index a50a02b..c81ea66 100644
--- a/tika-parsers/pom.xml
+++ b/tika-parsers/pom.xml
@@ -36,22 +36,22 @@
<properties>
<!-- NOTE: sync codec version with POI -->
- <codec.version>1.11</codec.version>
+ <codec.version>1.12</codec.version>
<!-- NOTE: sync tukaani version with commons-compress in tika-parent-->
<tukaani.version>1.8</tukaani.version>
<!-- NOTE: sync brotli version with commons-compress in tika-parent-->
<brotli.version>0.1.2</brotli.version>
- <mime4j.version>0.8.2</mime4j.version>
+ <mime4j.version>0.8.3</mime4j.version>
<vorbis.version>0.8</vorbis.version>
<pdfbox.version>2.0.15</pdfbox.version>
<jempbox.version>1.8.16</jempbox.version>
<netcdf-java.version>4.5.5</netcdf-java.version>
<sis.version>0.8</sis.version>
- <parso.version>2.0.10</parso.version>
+ <parso.version>2.0.11</parso.version>
<!-- used by POI, PDFBox and Jackcess ...try to sync -->
- <bouncycastle.version>1.60</bouncycastle.version>
+ <bouncycastle.version>1.61</bouncycastle.version>
<commonsexec.version>1.3</commonsexec.version>
- <httpcomponents.version>4.5.7</httpcomponents.version>
+ <httpcomponents.version>4.5.8</httpcomponents.version>
</properties>
<dependencies>
@@ -173,7 +173,7 @@
<dependency>
<groupId>com.github.luben</groupId>
<artifactId>zstd-jni</artifactId>
- <version>1.3.8-3</version>
+ <version>1.4.0-1</version>
<scope>provided</scope>
</dependency>
@@ -260,9 +260,13 @@
<dependency>
<groupId>com.healthmarketscience.jackcess</groupId>
<artifactId>jackcess</artifactId>
- <version>3.0.0</version>
+ <version>3.0.1</version>
<exclusions>
<exclusion>
+ <groupId>org.apache.commons</groupId>
+ <artifactId>commons-lang3</artifactId>
+ </exclusion>
+ <exclusion>
<groupId>commons-logging</groupId>
<artifactId>commons-logging</artifactId>
</exclusion>
@@ -293,7 +297,7 @@
<dependency>
<groupId>org.ow2.asm</groupId>
<artifactId>asm</artifactId>
- <version>7.0</version>
+ <version>7.1</version>
</dependency>
<dependency>
<groupId>com.googlecode.mp4parser</groupId>
@@ -352,7 +356,7 @@
<dependency>
<groupId>com.github.junrar</groupId>
<artifactId>junrar</artifactId>
- <version>2.0.0</version>
+ <version>4.0.0</version>
<exclusions>
<exclusion>
<groupId>commons-logging</groupId>
@@ -393,7 +397,7 @@
<dependency>
<groupId>org.xerial</groupId>
<artifactId>sqlite-jdbc</artifactId>
- <version>3.25.2</version>
+ <version>3.27.2.1</version>
<scope>provided</scope>
</dependency>
@@ -503,7 +507,7 @@
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
- <version>27.0.1-jre</version>
+ <version>${guava.version}</version>
</dependency>
<dependency>
<groupId>edu.ucar</groupId>
@@ -533,7 +537,7 @@
<dependency>
<groupId>net.java.dev.jna</groupId>
<artifactId>jna</artifactId>
- <version>5.1.0</version>
+ <version>5.2.0</version>
</dependency>
<!-- grib's current jsoup is vulnerable to xss
@@ -546,7 +550,7 @@
<dependency>
<groupId>com.google.protobuf</groupId>
<artifactId>protobuf-java</artifactId>
- <version>3.6.1</version>
+ <version>3.7.1</version>
</dependency>
<dependency>
<groupId>edu.ucar</groupId>
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/sas/SAS7BDATParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/sas/SAS7BDATParser.java
index 8b28644..f6b7efc 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/sas/SAS7BDATParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/sas/SAS7BDATParser.java
@@ -18,7 +18,10 @@ package org.apache.tika.parser.sas;
import java.io.IOException;
import java.io.InputStream;
+import java.text.Format;
import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
import java.util.Set;
import org.apache.tika.exception.TikaException;
@@ -128,12 +131,15 @@ public class SAS7BDATParser extends AbstractParser {
}
xhtml.endElement("tr");
xhtml.newline();
-
+
+ //TODO: initialize this on the first row and then apply
+ Map<Integer, Format> formatMap = new HashMap<>();
+
// Process each row in turn
Object[] row = null;
while ((row = sas.readNext()) != null) {
xhtml.startElement("tr");
-            for (String val : DataWriterUtil.getRowValues(sas.getColumns(), row)) {
+            for (String val : DataWriterUtil.getRowValues(sas.getColumns(), row, formatMap)) {
// Use explicit start/end, rather than element, to
// ensure that empty cells still get output
xhtml.startElement("td");