Author: markg
Date: Mon Aug 18 14:49:42 2014
New Revision: 1618622

URL: http://svn.apache.org/r1618622
Log:
OPENNLP-706
Fixed caching, ensured that indexing and searching use the same analyzer
wrapper, and included the ProvinceProximityScorer.

Modified:
    
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java
    
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
    
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/PlacetypeScorer.java
    
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ProvinceProximityScorer.java

Modified: 
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java
URL: 
http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java?rev=1618622&r1=1618621&r2=1618622&view=diff
==============================================================================
--- 
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java
 (original)
+++ 
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java
 Mon Aug 18 14:49:42 2014
@@ -40,6 +40,7 @@ import org.apache.lucene.util.Version;
 import opennlp.tools.entitylinker.EntityLinkerProperties;
 import org.apache.log4j.Logger;
 import org.apache.lucene.analysis.core.KeywordAnalyzer;
+import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
 import org.apache.lucene.analysis.util.CharArraySet;
 
 /**
@@ -56,12 +57,7 @@ public class GazetteerSearcher {
   private boolean doubleQuoteAllSearchTerms = false;
   private boolean useHierarchyField = false;
 
-  private IndexSearcher geonamesSearcher;// = new 
IndexSearcher(geonamesReader);
-  private Analyzer geonamesAnalyzer;
-  //usgs US gazateer
 
-  private IndexSearcher usgsSearcher;// = new IndexSearcher(geonamesReader);
-  private Analyzer usgsAnalyzer;
   private EntityLinkerProperties properties;
 
   private Directory opennlpIndex;//= new MMapDirectory(new File(indexloc));
@@ -167,7 +163,7 @@ public class GazetteerSearcher {
          */
         int maxLen = searchString.length() > entry.getItemName().length() ? 
searchString.length() : entry.getItemName().length();
 
-        Double normLev = Math.abs(1 - (sc / (double) 
maxLen));//searchString.length() / (double) entry.getItemName().length();
+        Double normLev = Math.abs(1 - (sc / (double) maxLen));
         /**
          * only want hits above the levenstein thresh. This should be a low
          * thresh due to the use of the hierarchy field in the index
@@ -226,7 +222,6 @@ public class GazetteerSearcher {
       opennlpIndex = new MMapDirectory(new File(indexloc));
       opennlpReader = DirectoryReader.open(opennlpIndex);
       opennlpSearcher = new IndexSearcher(opennlpReader);
-      //TODO: a language code switch statement should be employed here at some 
point
       opennlpAnalyzer
               = //new StandardAnalyzer(Version.LUCENE_48, new 
CharArraySet(Version.LUCENE_48, new ArrayList(), true));
               new StandardAnalyzer(Version.LUCENE_48, new 
CharArraySet(Version.LUCENE_48, new ArrayList(), true));
@@ -237,6 +232,11 @@ public class GazetteerSearcher {
       analyMap.put("loctype", new KeywordAnalyzer());
       analyMap.put("countycode", new KeywordAnalyzer());
       analyMap.put("gazsource", new KeywordAnalyzer());
+      
+      
+    opennlpAnalyzer
+            = new PerFieldAnalyzerWrapper(opennlpAnalyzer, analyMap);
+
 
       String cutoff = 
properties.getProperty("opennlp.geoentitylinker.gaz.lucenescore.min", 
String.valueOf(scoreCutoff));
       String usehierarchy = 
properties.getProperty("opennlp.geoentitylinker.gaz.hierarchyfield", 
String.valueOf("0"));

Modified: 
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
URL: 
http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java?rev=1618622&r1=1618621&r2=1618622&view=diff
==============================================================================
--- 
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
 (original)
+++ 
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
 Mon Aug 18 14:49:42 2014
@@ -27,6 +27,7 @@ import java.util.Comparator;
 import java.util.HashMap;
 import java.util.List;
 import opennlp.addons.geoentitylinker.scoring.PlacetypeScorer;
+import opennlp.addons.geoentitylinker.scoring.ProvinceProximityScorer;
 import opennlp.tools.entitylinker.BaseLink;
 import opennlp.tools.entitylinker.LinkedSpan;
 import opennlp.tools.util.Span;
@@ -89,7 +90,8 @@ public class GeoEntityLinker implements 
       }
     }
     /**
-     * sort the data with the best score on top based on the sum of the scores 
below from the score map for each baselink object
+     * sort the data with the best score on top based on the sum of the scores
+     * below from the score map for each baselink object
      */
     for (LinkedSpan<BaseLink> s : spans) {
       ArrayList<BaseLink> linkedData = s.getLinkedEntries();
@@ -124,6 +126,7 @@ public class GeoEntityLinker implements 
 
   private void loadScorers() {
     if (scorers.isEmpty()) {
+      scorers.add(new ProvinceProximityScorer());
       scorers.add(new GeoHashBinningScorer());
       scorers.add(new CountryProximityScorer());
       scorers.add(new ModelBasedScorer());
@@ -132,7 +135,6 @@ public class GeoEntityLinker implements 
     }
   }
 
-  
   @Override
   public void init(EntityLinkerProperties properties) throws IOException {
     try {

Modified: 
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/PlacetypeScorer.java
URL: 
http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/PlacetypeScorer.java?rev=1618622&r1=1618621&r2=1618622&view=diff
==============================================================================
--- 
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/PlacetypeScorer.java
 (original)
+++ 
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/PlacetypeScorer.java
 Mon Aug 18 14:49:42 2014
@@ -40,12 +40,12 @@ public class PlacetypeScorer implements 
 
   @Override
   public void score(List<LinkedSpan> linkedSpans, String docText, Span[] 
sentenceSpans, EntityLinkerProperties properties, AdminBoundaryContext 
additionalContext) {
-    for(LinkedSpan<GazetteerEntry> geospan : linkedSpans){
+    for (LinkedSpan<GazetteerEntry> geospan : linkedSpans) {
       ArrayList<GazetteerEntry> linkedEntries = geospan.getLinkedEntries();
       for (GazetteerEntry gazetteerEntry : linkedEntries) {
         String type = gazetteerEntry.getItemType().toLowerCase();
         Double score = getScore(type);
-        if(score==null){
+        if (score == null) {
           score = 0d;
         }
         gazetteerEntry.getScoreMap().put("typescore", score);
@@ -63,12 +63,14 @@ public class PlacetypeScorer implements 
       for (String type : boosts) {
         if (type.equals("PCLI")) {
           boosetedTypes.put(type.toLowerCase(), 1d);
-        } else if (type.startsWith("PCL") && !type.equals("PCLI")) {
+        } else if (type.startsWith("P") && !type.equals("PCLI")) {
           boosetedTypes.put(type.toLowerCase(), .5d);
         } else if (type.startsWith("ADM")) {
           boosetedTypes.put(type.toLowerCase(), .75d);
         }
+
       }
+      boosetedTypes.put("pplc", .9);
     }
   }
 

Modified: 
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ProvinceProximityScorer.java
URL: 
http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ProvinceProximityScorer.java?rev=1618622&r1=1618621&r2=1618622&view=diff
==============================================================================
--- 
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ProvinceProximityScorer.java
 (original)
+++ 
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ProvinceProximityScorer.java
 Mon Aug 18 14:49:42 2014
@@ -34,7 +34,7 @@ import opennlp.tools.util.Span;
  * heuristic that toponymn mentions are more likely close to their parent
  * province mentions. For instance, if the toponym Berlin is mentioned near an
  * indicator of Connecticut, it is more likely to be Berlin Connecticut than
- * Berlin Germany (if Germany did not exist in, or is mentioned further down 
in,
+ * Berlin Germany (if Germany did not exist in, or is mentioned further away in
  * the article).
  *
  *
@@ -51,7 +51,7 @@ public class ProvinceProximityScorer imp
     } else {
       for (LinkedSpan<BaseLink> span : linkedSpans) {
         for (BaseLink link : span.getLinkedEntries()) {
-          link.getScoreMap().put("provincecontext", Double.NaN);
+          link.getScoreMap().put("provincecontext", 0d);
         }
       }
     }
@@ -163,7 +163,7 @@ public class ProvinceProximityScorer imp
       if (scoreMap.containsKey(spanCountryCode)) {
 
         score = scoreMap.get(spanCountryCode);
-        ///does the name extracted match a country name?
+        ///does the name extracted match a province name?
         if (nameCodesMap.containsKey(entry.getItemName().toLowerCase())) {
           //if so, is it the correct country code for that name?
           if 
(nameCodesMap.get(entry.getItemName().toLowerCase()).contains(entry.getProvinceCode()))
 {
@@ -183,8 +183,8 @@ public class ProvinceProximityScorer imp
   }
 
   /**
-   * takes a map of distances from the toponym to each country mention and
-   * generates a map of scores for each country code. The map is then 
correlated
+   * takes a map of distances from the toponym to each province mention and
+   * generates a map of scores for each province code. The map is then 
correlated
    * to the code of the BaseLink parentid for retrieval. Then the score is 
added
    * to the overall list.
    *
@@ -232,7 +232,7 @@ public class ProvinceProximityScorer imp
    * together to smooth out the average, so one distant outlier does not kill
    * the score for an obviously good hit. More elegant solution is possible
    * using Math.pow, and making the score decay with distance by using an
-   * increasing negative exponent (I think)
+   * increasing negative exponent
    *
    * @param normDis the normalized and sorted set of distances as a list
    * @return


Reply via email to