Author: nick
Date: Sun Nov 15 19:53:14 2015
New Revision: 1714492

URL: http://svn.apache.org/viewvc?rev=1714492&view=rev
Log:
TIKA-1791: Fix GeoParser so that NER models packaged inside a jar file can be
loaded. Patch contributed by Thamme Gowda N. This closes #63 from GitHub.

Modified:
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParserConfig.java
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/NameEntityExtractor.java

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java?rev=1714492&r1=1714491&r2=1714492&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java
 Sun Nov 15 19:53:14 2015
@@ -20,6 +20,7 @@ package org.apache.tika.parser.geo.topic
 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
+import java.net.URL;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.HashMap;
@@ -45,36 +46,64 @@ import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
 public class GeoParser extends AbstractParser {
-
        private static final long serialVersionUID = -2241391757440215491L;
-       private static final MediaType MEDIA_TYPE = MediaType
-                       .application("geotopic");
-       private static final Set<MediaType> SUPPORTED_TYPES = Collections
-                       .singleton(MEDIA_TYPE);
+        private static final Logger LOG = 
Logger.getLogger(GeoParser.class.getName());
+       private static final MediaType MEDIA_TYPE = 
+                                   MediaType.application("geotopic");
+       private static final Set<MediaType> SUPPORTED_TYPES = 
+                                   Collections.singleton(MEDIA_TYPE);
        private GeoParserConfig config = new GeoParserConfig();
-       private static final Logger LOG = Logger.getLogger(GeoParser.class
-                       .getName());
+
+       private boolean initialized;
+       private URL modelUrl;
+       private NameEntityExtractor extractor;
+       private boolean available;
 
        @Override
        public Set<MediaType> getSupportedTypes(ParseContext parseContext) {
                return SUPPORTED_TYPES;
        }
 
+       /**
+        * Initializes this parser
+        * @param modelUrl the URL to NER model
+        */
+       public void initialize(URL modelUrl) {
+
+               if (this.modelUrl != null && this.modelUrl.equals(modelUrl)) {
+                       //previously initialized for the same URL
+                       return;
+               }
+               this.modelUrl = modelUrl;
+               //if NER model is available and lucene-geo-gazetteer is 
available
+               this.available = modelUrl != null &&
+                               ExternalParser.check(new String[] { 
"lucene-geo-gazetteer", "--help" }, -1);
+               if (this.available) {
+                       try {
+                               this.extractor = new 
NameEntityExtractor(modelUrl);
+                       } catch (Exception e) {
+                               e.printStackTrace();
+                               this.available = false;
+                       }
+               }
+               initialized = true;
+
+       }
+
        @Override
        public void parse(InputStream stream, ContentHandler handler,
-                       Metadata metadata, ParseContext context) throws 
IOException,
+                                         Metadata metadata, ParseContext 
context) throws IOException,
                        SAXException, TikaException {
 
                /*----------------configure this parser by ParseContext 
Object---------------------*/
-               config = context.get(GeoParserConfig.class, config);
-               String nerModelPath = config.getNERPath();
 
+               this.config = context.get(GeoParserConfig.class, config);
+               initialize(this.config.getNerModelUrl());
                if (!isAvailable()) {
                        return;
                }
 
                /*----------------get locationNameEntities and best nameEntity 
for the input stream---------------------*/
-               NameEntityExtractor extractor = new 
NameEntityExtractor(nerModelPath);
                extractor.getAllNameEntitiesfromInput(stream);
                extractor.getBestNameEntity();
                ArrayList<String> locationNameEntities = 
extractor.locationNameEntities;
@@ -146,10 +175,10 @@ public class GeoParser extends AbstractP
        }
 
        public boolean isAvailable() {
-               return ExternalParser.check(new String[] { 
"lucene-geo-gazetteer",
-                               "--help" }, -1)
-                               && config.getNERPath() != null
-                               && !config.getNERPath().equals("");
+               if (!initialized) {
+                       initialize(config.getNerModelUrl());
+               }
+               return this.available;
        }
 
 }

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParserConfig.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParserConfig.java?rev=1714492&r1=1714491&r2=1714492&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParserConfig.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParserConfig.java
 Sun Nov 15 19:53:14 2015
@@ -19,22 +19,16 @@ package org.apache.tika.parser.geo.topic
 
 import java.io.File;
 import java.io.Serializable;
-import java.net.URISyntaxException;
+import java.net.MalformedURLException;
+import java.net.URL;
 
 public class GeoParserConfig implements Serializable {
 
-       private static final long serialVersionUID = 1L;
-       private String nerModelPath = null;
+       private static final long serialVersionUID = 2L;
+       private URL nerModelUrl = null;
 
        public GeoParserConfig() {
-               try {
-                       if 
(GeoParserConfig.class.getResource("en-ner-location.bin") != null) {
-                               this.nerModelPath = new 
File(GeoParserConfig.class.getResource(
-                                               
"en-ner-location.bin").toURI()).getAbsolutePath();
-                       }
-               } catch (URISyntaxException e) {
-                       e.printStackTrace();
-               }
+               this.nerModelUrl = 
GeoParserConfig.class.getResource("en-ner-location.bin");
        }
 
        public void setNERModelPath(String path) {
@@ -44,11 +38,19 @@ public class GeoParserConfig implements
                if (file.isDirectory() || !file.exists()) {
                        return;
                }
-               nerModelPath = path;
+               try {
+                       this.nerModelUrl = file.toURI().toURL();
+               } catch (MalformedURLException e) {
+                       throw new RuntimeException(e);
+               }
+       }
+
+       public void setNerModelUrl(URL url) {
+               this.nerModelUrl = url;
        }
 
-       public String getNERPath() {
-               return nerModelPath;
+       public URL getNerModelUrl() {
+               return nerModelUrl;
        }
 
 }

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/NameEntityExtractor.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/NameEntityExtractor.java?rev=1714492&r1=1714491&r2=1714492&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/NameEntityExtractor.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/NameEntityExtractor.java
 Sun Nov 15 19:53:14 2015
@@ -17,9 +17,10 @@
 
 package org.apache.tika.parser.geo.topic;
 
-import java.io.FileInputStream;
+
 import java.io.IOException;
 import java.io.InputStream;
+import java.net.URL;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
@@ -30,25 +31,24 @@ import java.util.Map;
 
 import opennlp.tools.namefind.NameFinderME;
 import opennlp.tools.namefind.TokenNameFinderModel;
-import opennlp.tools.util.InvalidFormatException;
 import opennlp.tools.util.Span;
-
 import org.apache.commons.io.IOUtils;
 
 import static java.nio.charset.StandardCharsets.UTF_8;
 
 public class NameEntityExtractor {
-       private String nerModelPath = null;
+
        ArrayList<String> locationNameEntities;
        String bestNameEntity;
        private HashMap<String, Integer> tf;
+       private final NameFinderME nameFinder;
 
-       public NameEntityExtractor(String nerModelpath) {
+       public NameEntityExtractor(URL modelUrl) throws IOException {
                this.locationNameEntities = new ArrayList<String>();
                this.bestNameEntity = null;
-               this.nerModelPath = nerModelpath;
-               tf = new HashMap<String, Integer>();
-
+               TokenNameFinderModel model = new TokenNameFinderModel(modelUrl);
+               this.nameFinder = new NameFinderME(model);
+               this.tf = new HashMap<String, Integer>();
        }
 
        /*
@@ -60,18 +60,20 @@ public class NameEntityExtractor {
         */
 
        public void getAllNameEntitiesfromInput(InputStream stream)
-                       throws InvalidFormatException, IOException {
+                       throws IOException {
 
-               InputStream modelIn = new FileInputStream(nerModelPath);
-               TokenNameFinderModel model = new TokenNameFinderModel(modelIn);
-               NameFinderME nameFinder = new NameFinderME(model);
-               String[] in = IOUtils.toString(stream, UTF_8).split(" ");
 
-               Span nameE[] = nameFinder.find(in);
+               String[] in = IOUtils.toString(stream, UTF_8).split(" ");
+               Span nameE[];
+               //name finder is not thread safe 
https://opennlp.apache.org/documentation/1.5.2-incubating/manual/opennlp.html#tools.namefind
+               synchronized (nameFinder) {
+                       nameE = nameFinder.find(in);
+                       //the same name finder is reused, so clear adaptive data
+                       nameFinder.clearAdaptiveData();
+               }
 
                String spanNames = Arrays.toString(Span.spansToStrings(nameE, 
in));
                spanNames = spanNames.substring(1, spanNames.length() - 1);
-               modelIn.close();
                String[] tmp = spanNames.split(",");
 
                for (String name : tmp) {
@@ -79,6 +81,7 @@ public class NameEntityExtractor {
                        this.locationNameEntities.add(name);
                }
 
+
        }
 
        /*
@@ -123,5 +126,4 @@ public class NameEntityExtractor {
                        }
                }
        }
-
 }


Reply via email to