Author: markg
Date: Fri Aug 15 18:10:51 2014
New Revision: 1618251

URL: http://svn.apache.org/r1618251
Log:
OPENNLP-706
Significant fix to the indexing so that country names are properly discovered.
Added a type-boosting scorer (PlacetypeScorer), and added a descending sort to
the output of each call to the GeoEntityLinker. Also did some general cleanup.
The number of matches returned from the gazetteer is now configurable via the
opennlp.geoentitylinker.gaz.rowsreturned property.

Added:
    
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/PlacetypeScorer.java
Modified:
    
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContext.java
    
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java
    
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
    
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GazetteerIndexer.java
    
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeonamesProcessor.java
    
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/RegionProcessor.java
    
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/USGSProcessor.java

Modified: 
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContext.java
URL: 
http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContext.java?rev=1618251&r1=1618250&r2=1618251&view=diff
==============================================================================
--- 
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContext.java
 (original)
+++ 
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContext.java
 Fri Aug 15 18:10:51 2014
@@ -117,16 +117,16 @@ public class AdminBoundaryContext {
       }
       Map<String, String> provs = new HashMap<>();
 
-      if (!provsForCountry.isEmpty()) {
-        for (String pcode : provsForCountry.keySet()) {
-          if (this.getProvHits().contains(pcode)) {
-            provs.put(pcode, provsForCountry.get(pcode));
-
-            clauses.add(" countrycode:" + countryCode + " AND admincode:" + 
pcode + gazType);
-
-          }
-        }
-      }
+//      if (!provsForCountry.isEmpty()) {
+//        for (String pcode : provsForCountry.keySet()) {
+//          if (this.getProvHits().contains(pcode)) {
+//            provs.put(pcode, provsForCountry.get(pcode));
+//
+//            clauses.add(" countrycode:" + countryCode + " AND admincode:" + 
pcode + gazType);
+//
+//          }
+//        }
+//      }
       if (provs.isEmpty()) {
         //got a country with no mentioned provs
         clauses.add(" countrycode:" + countryCode + gazType);

Modified: 
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java
URL: 
http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java?rev=1618251&r1=1618250&r2=1618251&view=diff
==============================================================================
--- 
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java
 (original)
+++ 
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java
 Fri Aug 15 18:10:51 2014
@@ -18,7 +18,9 @@ package opennlp.addons.geoentitylinker;
 import java.io.File;
 import java.io.IOException;
 import java.util.ArrayList;
+import java.util.HashMap;
 import java.util.List;
+import java.util.Map;
 import java.util.logging.Level;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
@@ -37,31 +39,28 @@ import org.apache.lucene.store.MMapDirec
 import org.apache.lucene.util.Version;
 import opennlp.tools.entitylinker.EntityLinkerProperties;
 import org.apache.log4j.Logger;
+import org.apache.lucene.analysis.core.KeywordAnalyzer;
 import org.apache.lucene.analysis.util.CharArraySet;
 
 /**
  *
  * Searches Gazetteers stored in a MMapDirectory Lucene index. The structure of
- * these indices are based on loading the indexes using the
- * GeoEntityLinkerSetupUtils
+ * these indices are based on loading the indexes using the GazetteerIndexer
  *
  */
 public class GazetteerSearcher {
 
   //private static final String boostedTerms = " AND loctype(ADM1^1 ADM1H^1 
ADM2^1 ADM2H^1 ADM3^1 ADM3H^1 ADM4^1 ADM4H^1 ADM5^1 ADMD^1 ADMDH^1 PCLD^1 
PCLH^1 PCLI^1 PCLIX^1 TERR^1 PCLIX^1 PPL^1 PPLA^1 PPLA2^1 PPLA3^1 PPLA4^1 
PPLC^1 PPLCH^1 PPLF^1 PPLG^1 PPLH^1 PPLL^1 PPLQ^1 PPLR^1 PPLS^1 PPLX^1 STLMT^1) 
";
-
   private final String REGEX_CLEAN = "[^\\p{L}\\p{Nd}]";
   private static final Logger LOGGER = 
Logger.getLogger(GazetteerSearcher.class);
   private double scoreCutoff = .70;
-  private boolean doubleQuoteAllSearchTerms = true;
+  private boolean doubleQuoteAllSearchTerms = false;
   private boolean useHierarchyField = false;
-  private Directory geonamesIndex;//= new MMapDirectory(new File(indexloc));
-  private IndexReader geonamesReader;// = DirectoryReader.open(geonamesIndex);
+
   private IndexSearcher geonamesSearcher;// = new 
IndexSearcher(geonamesReader);
   private Analyzer geonamesAnalyzer;
   //usgs US gazateer
-  private Directory usgsIndex;//= new MMapDirectory(new File(indexloc));
-  private IndexReader usgsReader;// = DirectoryReader.open(geonamesIndex);
+
   private IndexSearcher usgsSearcher;// = new IndexSearcher(geonamesReader);
   private Analyzer usgsAnalyzer;
   private EntityLinkerProperties properties;
@@ -75,7 +74,7 @@ public class GazetteerSearcher {
     try {
       boolean b = Boolean.valueOf("true");
 
-      new GazetteerSearcher(new EntityLinkerProperties(new 
File("c:\\temp\\entitylinker.properties"))).geonamesFind("baghdad", 5, "iz");
+      new GazetteerSearcher(new EntityLinkerProperties(new 
File("c:\\temp\\entitylinker.properties"))).find("italy", 5, " countrycode:it 
AND gazsource:geonames");
     } catch (IOException ex) {
       
java.util.logging.Logger.getLogger(GazetteerSearcher.class.getName()).log(Level.SEVERE,
 null, ex);
     } catch (Exception ex) {
@@ -108,12 +107,12 @@ public class GazetteerSearcher {
        * build the search string Sometimes no country context is found. In this
        * case the code variables will be empty strings
        */
-      String placeNameQueryString = "placename:(" + searchString.toLowerCase() 
+ ") AND " + whereClause;
+      String placeNameQueryString = "placename:(" + searchString.toLowerCase() 
+ ")" + "AND " + whereClause;
       if (searchString.trim().contains(" ") && useHierarchyField) {
         placeNameQueryString = "(placename:(" + searchString.toLowerCase() + 
") AND hierarchy:(" + formatForHierarchy(searchString) + "))"
                 + " AND " + whereClause;
       }
-       
+
       /**
        * check the cache and go no further if the records already exist
        */
@@ -127,6 +126,7 @@ public class GazetteerSearcher {
        */
       QueryParser parser = new QueryParser(Version.LUCENE_48, 
placeNameQueryString, opennlpAnalyzer);
       Query q = parser.parse(placeNameQueryString);
+      //Filter filter = new QueryWrapperFilter(new 
QueryParser(Version.LUCENE_48, whereClause, 
opennlpAnalyzer).parse(whereClause));      
 
       TopDocs bestDocs = opennlpSearcher.search(q, rowsReturned);
 
@@ -173,213 +173,8 @@ public class GazetteerSearcher {
          * only want hits above the levenstein thresh. This should be a low
          * thresh due to the use of the hierarchy field in the index
          */
-        if (normLev > scoreCutoff) {
-          if 
(entry.getItemParentID().toLowerCase().equals(parentid.toLowerCase()) || 
parentid.toLowerCase().equals("")) {
-            entry.getScoreMap().put("normlucene", normLev);
-            //make sure we don't produce a duplicate
-            if (!linkedData.contains(entry)) {
-              linkedData.add(entry);
-              /**
-               * add the records to the cache for this query
-               */
-              GazetteerSearchCache.put(placeNameQueryString, linkedData);
-            }
-          }
-        }
-      }
-
-    } catch (IOException | ParseException ex) {
-      LOGGER.error(ex);
-    }
-
-    return linkedData;
-  }
-
-  /**
-   *
-   * @param searchString the named entity to look up in the lucene index
-   * @param rowsReturned how many rows to allow lucene to return
-   * @param code the country code
-   *
-   * @return
-   */
-  @Deprecated
-  public ArrayList<GazetteerEntry> geonamesFind(String searchString, int 
rowsReturned, String code) {
-    ArrayList<GazetteerEntry> linkedData = new ArrayList<>();
-    searchString = cleanInput(searchString);
-    if (searchString.isEmpty()) {
-      return linkedData;
-    }
-    try {
-      /**
-       * build the search string Sometimes no country context is found. In this
-       * case the code variable will be an empty string
-       */
-      String luceneQueryString = !code.equals("")
-              ? "FULL_NAME_ND_RO:" + searchString.toLowerCase().trim() + " AND 
CC1:" + code.toLowerCase()//+"^90000" //[\"" + code.toLowerCase()+"\" TO \"" + 
code.toLowerCase() + "\"]"
-              : "FULL_NAME_ND_RO:" + searchString.toLowerCase().trim();
-      /**
-       * check the cache and go no further if the records already exist
-       */
-      ArrayList<GazetteerEntry> get = 
GazetteerSearchCache.get(luceneQueryString);
-      if (get != null) {
-
-        return get;
-      }
-
-      QueryParser parser = new QueryParser(Version.LUCENE_48, 
luceneQueryString, geonamesAnalyzer);
-      Query q = parser.parse(luceneQueryString);
-
-      TopDocs search = geonamesSearcher.search(q, rowsReturned);
-
-      for (int i = 0; i < search.scoreDocs.length; ++i) {
-        GazetteerEntry entry = new GazetteerEntry();
-        int docId = search.scoreDocs[i].doc;
-        double sc = search.scoreDocs[i].score;
-
-        entry.getScoreMap().put("lucene", sc);
-        entry.setIndexID(docId + "");
-        entry.setSource("geonames");
-
-        Document d = geonamesSearcher.doc(docId);
-        List<IndexableField> fields = d.getFields();
-        for (int idx = 0; idx < fields.size(); idx++) {
-          String value = d.get(fields.get(idx).name());
-          value = value.toLowerCase();
-          /**
-           * these positions map to the required fields in the gaz TODO: allow 
a
-           * configurable list of columns that map to the GazateerEntry fields,
-           * then users would be able to plug in any gazateer they have (if 
they
-           * build a lucene index out of it)
-           */
-          switch (idx) {
-            case 1:
-              entry.setItemID(value);
-              break;
-            case 3:
-              entry.setLatitude(Double.valueOf(value));
-              break;
-            case 4:
-              entry.setLongitude(Double.valueOf(value));
-              break;
-            case 10:
-              entry.setItemType(value);
-              break;
-            case 12:
-              entry.setItemParentID(value);
-              if (!value.toLowerCase().equals(code.toLowerCase())) {
-                continue;
-              }
-              break;
-            case 23:
-              entry.setItemName(value);
-              break;
-          }
-          entry.getIndexData().put(fields.get(idx).name(), value);
-        }
-        /**
-         * norm the levenstein distance
-         */
-        Double normLev = Double.valueOf(searchString.length()) / 
Double.valueOf(entry.getItemName().length());
-        /**
-         * only want hits above the levenstein thresh
-         */
-        if (normLev.compareTo(scoreCutoff) >= 0) {
-          if (entry.getItemParentID().toLowerCase().equals(code.toLowerCase()) 
|| code.toLowerCase().equals("")) {
-            entry.getScoreMap().put("normlucene", normLev);
-            //make sure we don't produce a duplicate
-            if (!linkedData.contains(entry)) {
-              linkedData.add(entry);
-              /**
-               * add the records to the cache for this query
-               */
-              GazetteerSearchCache.put(luceneQueryString, linkedData);
-            }
-          }
-        }
-      }
-
-    } catch (IOException | ParseException ex) {
-      LOGGER.error(ex);
-    }
-
-    return linkedData;
-  }
-
-  /**
-   * Looks up the name in the USGS gazateer, after checking the cache
-   *
-   * @param searchString the nameed entity to look up in the lucene index
-   * @param rowsReturned how many rows to allow lucene to return
-   *
-   * @return
-   */
-  @Deprecated
-  public ArrayList<GazetteerEntry> usgsFind(String searchString, int 
rowsReturned) {
-    ArrayList<GazetteerEntry> linkedData = new ArrayList<>();
-    searchString = cleanInput(searchString);
-    if (searchString.isEmpty()) {
-      return linkedData;
-    }
-    String luceneQueryString = "FEATURE_NAME:" + 
searchString.toLowerCase().trim() + " OR MAP_NAME: " + 
searchString.toLowerCase().trim();
-    try {
-
-      /**
-       * hit the cache
-       */
-      ArrayList<GazetteerEntry> get = 
GazetteerSearchCache.get(luceneQueryString);
-      if (get != null) {
-        //if the name is already there, return the list of cavhed results
-        return get;
-      }
-      QueryParser parser = new QueryParser(Version.LUCENE_48, 
luceneQueryString, usgsAnalyzer);
-      Query q = parser.parse(luceneQueryString);
-
-      TopDocs search = usgsSearcher.search(q, rowsReturned);
-      for (int i = 0; i < search.scoreDocs.length; i++) {
-        GazetteerEntry entry = new GazetteerEntry();
-        int docId = search.scoreDocs[i].doc;
-        double sc = search.scoreDocs[i].score;
-        //keep track of the min score for normalization
-
-        entry.getScoreMap().put("lucene", sc);
-        entry.setIndexID(docId + "");
-        entry.setSource("usgs");
-        entry.setItemParentID("us");
-        Document d = usgsSearcher.doc(docId);
-        List<IndexableField> fields = d.getFields();
-        for (int idx = 0; idx < fields.size(); idx++) {
-          String value = d.get(fields.get(idx).name());
-          value = value.toLowerCase();
-          switch (idx) {
-            case 0:
-              entry.setItemID(value);
-              break;
-            case 1:
-              entry.setItemName(value);
-              break;
-            case 2:
-              entry.setItemType(value);
-              break;
-            case 9:
-              entry.setLatitude(Double.valueOf(value));
-              break;
-            case 10:
-              entry.setLongitude(Double.valueOf(value));
-              break;
-          }
-          entry.getIndexData().put(fields.get(idx).name(), value);
-        }
-        /**
-         * norm the levenstein distance
-         */
-        Double normLev = Double.valueOf(searchString.length()) / 
Double.valueOf(entry.getItemName().length());
-        /**
-         * only want hits above the levenstein thresh
-         */
-        if (normLev.compareTo(scoreCutoff) >= 0) {
-          //only keep it if the country code is a match. even when the code is 
passed in as a weighted condition, there is no == equiv in lucene
-
+        // if (normLev > scoreCutoff) {
+        if 
(entry.getItemParentID().toLowerCase().equals(parentid.toLowerCase()) || 
parentid.toLowerCase().equals("")) {
           entry.getScoreMap().put("normlucene", normLev);
           //make sure we don't produce a duplicate
           if (!linkedData.contains(entry)) {
@@ -387,16 +182,18 @@ public class GazetteerSearcher {
             /**
              * add the records to the cache for this query
              */
-            GazetteerSearchCache.put(luceneQueryString, linkedData);
+            GazetteerSearchCache.put(placeNameQueryString, linkedData);
           }
         }
-
+        //}
       }
 
     } catch (IOException | ParseException ex) {
       LOGGER.error(ex);
     }
 
+  
+
     return linkedData;
   }
 
@@ -431,7 +228,17 @@ public class GazetteerSearcher {
       opennlpReader = DirectoryReader.open(opennlpIndex);
       opennlpSearcher = new IndexSearcher(opennlpReader);
       //TODO: a language code switch statement should be employed here at some 
point
-      opennlpAnalyzer = new StandardAnalyzer(Version.LUCENE_48, new 
CharArraySet(Version.LUCENE_48, new ArrayList(), true));
+      opennlpAnalyzer
+              = //new StandardAnalyzer(Version.LUCENE_48, new 
CharArraySet(Version.LUCENE_48, new ArrayList(), true));
+              new StandardAnalyzer(Version.LUCENE_48, new 
CharArraySet(Version.LUCENE_48, new ArrayList(), true));
+      Map<String, Analyzer> analyMap = new HashMap<>();
+
+      analyMap.put("countrycode", new KeywordAnalyzer());
+      analyMap.put("admincode", new KeywordAnalyzer());
+      analyMap.put("loctype", new KeywordAnalyzer());
+      analyMap.put("countycode", new KeywordAnalyzer());
+      analyMap.put("gazsource", new KeywordAnalyzer());
+
       String cutoff = 
properties.getProperty("opennlp.geoentitylinker.gaz.lucenescore.min", 
String.valueOf(scoreCutoff));
       String usehierarchy = 
properties.getProperty("opennlp.geoentitylinker.gaz.hierarchyfield", 
String.valueOf("0"));
       if (cutoff != null && !cutoff.isEmpty()) {

Modified: 
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
URL: 
http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java?rev=1618251&r1=1618250&r2=1618251&view=diff
==============================================================================
--- 
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
 (original)
+++ 
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
 Fri Aug 15 18:10:51 2014
@@ -22,9 +22,11 @@ import opennlp.addons.geoentitylinker.sc
 import opennlp.addons.geoentitylinker.scoring.GeoHashBinningScorer;
 import opennlp.addons.geoentitylinker.scoring.FuzzyStringMatchScorer;
 import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
 import java.util.List;
-import java.util.Map;
-import java.util.Set;
+import opennlp.addons.geoentitylinker.scoring.PlacetypeScorer;
 import opennlp.tools.entitylinker.BaseLink;
 import opennlp.tools.entitylinker.LinkedSpan;
 import opennlp.tools.util.Span;
@@ -39,8 +41,8 @@ import opennlp.tools.entitylinker.Entity
  */
 public class GeoEntityLinker implements EntityLinker<LinkedSpan> {
 
+  private static Integer topN = 2;
   private AdminBoundaryContextGenerator countryContext;
-  private Map<String, Set<Integer>> countryMentions;
   private EntityLinkerProperties linkerProperties;
   private GazetteerSearcher gazateerSearcher;
   private List<LinkedEntityScorer<AdminBoundaryContext>> scorers = new 
ArrayList<>();
@@ -64,10 +66,10 @@ public class GeoEntityLinker implements 
         ArrayList<BaseLink> geoNamesEntries = new ArrayList<>();
         if (!context.getWhereClauses().isEmpty()) {
           for (String whereclause : context.getWhereClauses()) {
-            geoNamesEntries.addAll(gazateerSearcher.find(matches[i], 3, 
whereclause));
+            geoNamesEntries.addAll(gazateerSearcher.find(matches[i], topN, 
whereclause));
           }
         } else {//this means there were no where clauses generated so the 
where clause will default to look at the entire index
-          geoNamesEntries.addAll(gazateerSearcher.find(matches[i], 3, " 
gaztype:usgs geonames regions "));
+          geoNamesEntries.addAll(gazateerSearcher.find(matches[i], topN, " 
gaztype:usgs geonames regions "));
         }
         if (geoNamesEntries.isEmpty()) {
           continue;
@@ -86,6 +88,36 @@ public class GeoEntityLinker implements 
         scorer.score(spans, doctext, sentences, linkerProperties, context);
       }
     }
+    /**
+     * sort the data with the best score on top based on the sum of the scores 
below from the score map for each baselink object
+     */
+    for (LinkedSpan<BaseLink> s : spans) {
+      ArrayList<BaseLink> linkedData = s.getLinkedEntries();
+      Collections.sort(linkedData, Collections.reverseOrder(new 
Comparator<BaseLink>() {
+        @Override
+        public int compare(BaseLink o1, BaseLink o2) {
+          HashMap<String, Double> o1scoreMap = o1.getScoreMap();
+          HashMap<String, Double> o2scoreMap = o2.getScoreMap();
+          if (o1scoreMap.size() != o2scoreMap.size()) {
+            return 0;
+          }
+          double sumo1 = 0d;
+          double sumo2 = 0d;
+          for (String object : o1scoreMap.keySet()) {
+            if (object.equals("typescore")
+                    || object.equals("countrycontext")
+                    || object.equals("normlucene")
+                    || object.equals("geohashbin")) {
+              sumo1 += o1scoreMap.get(object);
+              sumo2 += o2scoreMap.get(object);
+            }
+          }
+
+          return Double.compare(sumo1,
+                  sumo2);
+        }
+      }));
+    }
 
     return spans;
   }
@@ -96,16 +128,25 @@ public class GeoEntityLinker implements 
       scorers.add(new CountryProximityScorer());
       scorers.add(new ModelBasedScorer());
       scorers.add(new FuzzyStringMatchScorer());
-      // scorers.add(new ProvinceProximityScorer());
+      scorers.add(new PlacetypeScorer());
     }
   }
 
+  
   @Override
   public void init(EntityLinkerProperties properties) throws IOException {
     try {
       this.linkerProperties = properties;
       countryContext = new 
AdminBoundaryContextGenerator(this.linkerProperties);
       gazateerSearcher = new GazetteerSearcher(this.linkerProperties);
+      String rowsRetStr = 
this.linkerProperties.getProperty("opennlp.geoentitylinker.gaz.rowsreturned", 
"2");
+      Integer rws = 2;
+      try {
+        rws = Integer.valueOf(rowsRetStr);
+      } catch (NumberFormatException e) {
+        rws = 2;
+      }
+      topN = rws;
       loadScorers();
     } catch (Exception ex) {
       throw new IOException(ex);

Modified: 
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GazetteerIndexer.java
URL: 
http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GazetteerIndexer.java?rev=1618251&r1=1618250&r2=1618251&view=diff
==============================================================================
--- 
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GazetteerIndexer.java
 (original)
+++ 
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GazetteerIndexer.java
 Fri Aug 15 18:10:51 2014
@@ -18,8 +18,12 @@ package opennlp.addons.geoentitylinker.i
 import java.io.File;
 import java.io.FileNotFoundException;
 import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Map;
 
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.core.KeywordAnalyzer;
+import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.index.IndexWriter;
@@ -88,8 +92,8 @@ public class GazetteerIndexer {
   /**
    *
    * @param geonamesData the actual Geonames gazetteer data downloaded from
-   * here: http://download.geonames.org/export/dump/ then click on this
-   * link 'allCountries.zip'
+   * here: http://download.geonames.org/export/dump/ then click on this link
+   * 'allCountries.zip'
    * @param geoNamesCountryInfo the countryinfo lookup table that can be
    * downloaded from here
    * http://download.geonames.org/export/dump/countryinfo.txt
@@ -146,9 +150,19 @@ public class GazetteerIndexer {
 
     String indexloc = outputIndexDir.getPath() + 
"/opennlp_geoentitylinker_gazetteer";
     Directory index = new MMapDirectory(new File(indexloc));
-
     Analyzer a = new StandardAnalyzer(Version.LUCENE_48, new 
CharArraySet(Version.LUCENE_48, new ArrayList(), true));
-    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_48, a);
+    Map<String, Analyzer> analyMap = new HashMap<>();
+
+    analyMap.put("countrycode", new KeywordAnalyzer());
+    analyMap.put("admincode", new KeywordAnalyzer());
+    analyMap.put("loctype", new KeywordAnalyzer());
+    analyMap.put("countycode", new KeywordAnalyzer());
+    analyMap.put("gazsource", new KeywordAnalyzer());
+    
+    PerFieldAnalyzerWrapper aWrapper
+            = new PerFieldAnalyzerWrapper(a, analyMap);
+
+    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_48, 
aWrapper);
 
     IndexWriter w = new IndexWriter(index, config);
     USGSProcessor.process(usgsGovUnitsFile, usgsDataFile, 
outputCountryContextFile, w);
@@ -161,67 +175,4 @@ public class GazetteerIndexer {
     System.out.println("\nIndexing complete. Be sure to add '" + indexloc + "' 
and context file '" + outputCountryContextFile.getPath() + "' to 
entitylinker.properties file");
   }
 
-  /**
-   * indexes the USGS or Geonames gazateers.
-   *
-   * @param outputIndexDir a DIRECTORY path where you would like to store the
-   * output lucene indexes
-   * @param gazetteerInputData the file, "as is" that was downloaded from the
-   * USGS and GEONAMES website
-   * @param type indicates whether the data is USGS or GEONAMES format
-   * @throws Exception
-   */
-  @Deprecated
-  public void index(File outputIndexDir, File gazetteerInputData, GazType 
type) throws Exception {
-    if (!outputIndexDir.isDirectory()) {
-      throw new IllegalArgumentException("outputIndexDir must be a 
directory.");
-
-    }
-
-    String indexloc = outputIndexDir + type.toString();
-    Directory index = new MMapDirectory(new File(indexloc));
-
-    Analyzer a = new StandardAnalyzer(Version.LUCENE_48, new 
CharArraySet(Version.LUCENE_48, new ArrayList(), true));
-    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_48, a);
-
-    IndexWriter w = new IndexWriter(index, config);
-    //  GeonamesProcessor.process(new 
File("C:\\temp\\gazetteers\\geonamesdata\\countrycodes.txt"), new 
File("C:\\temp\\gazetteers\\geonamesdata\\admin1CodesASCII.txt"), 
gazetteerInputData, null, w);
-    // USGSProcessor.process(gazetteerInputData, outputIndexDir, w);
-    //  readFile(gazetteerInputData, w, type);
-    w.commit();
-    w.close();
-
-  }
-//
-//  public void readFile(File gazateerInputData, IndexWriter w, GazType type) 
throws Exception {
-//    BufferedReader reader = new BufferedReader(new 
FileReader(gazateerInputData));
-//    List<String> fields = new ArrayList<>();
-//    int counter = 0;
-//    System.out.println("reading gazetteer data from file...........");
-//    while (reader.read() != -1) {
-//      String line = reader.readLine();
-//      String[] values = line.split(type.getSeparator());
-//      if (counter == 0) {
-//        for (String columnName : values) {
-//          fields.add(columnName.replace("»¿", "").trim());
-//        }
-//
-//      } else {
-//        Document doc = new Document();
-//        for (int i = 0; i < fields.size() - 1; i++) {
-//          doc.add(new TextField(fields.get(i), values[i].trim(), 
Field.Store.YES));
-//        }
-//        w.addDocument(doc);
-//      }
-//      counter++;
-//      if (counter % 100000 == 0) {
-//        w.commit();
-//        System.out.println(counter + " .........committed to 
index..............");
-//      }
-//
-//    }
-//    w.commit();
-//    System.out.println("Completed indexing gaz! index name is: " + 
type.toString());
-//  }
-
 }

Modified: 
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeonamesProcessor.java
URL: 
http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeonamesProcessor.java?rev=1618251&r1=1618250&r2=1618251&view=diff
==============================================================================
--- 
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeonamesProcessor.java
 (original)
+++ 
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeonamesProcessor.java
 Fri Aug 15 18:10:51 2014
@@ -29,6 +29,7 @@ import java.util.Set;
 import opennlp.addons.geoentitylinker.AdminBoundary;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
+import org.apache.lucene.document.StringField;
 import org.apache.lucene.document.TextField;
 import org.apache.lucene.index.IndexWriter;
 
@@ -230,13 +231,17 @@ public class GeonamesProcessor {
       String lat = values[4];
       String lon = values[5];
       String dsg = values[7].toLowerCase();
+
       String id = values[0];
       String concatIndexEntry = "";
+      String countryname = "";
       if (adm != null) {
         concatIndexEntry = adm.getCountryName() + ", " + adm.getProvinceName() 
+ ", " + placeName;
+        countryname = adm.getCountryName();
       } else {
         //there is no admin info, but we can still use the countrycode to 
concat the country name
         String n = countrycodes.get(ccode);
+        countryname = n;
         if (n != null) {
           concatIndexEntry = n + ", " + placeName;
         } else {
@@ -251,27 +256,30 @@ public class GeonamesProcessor {
         doc.add(new TextField(fields.get(i), values[i].trim(), 
Field.Store.YES));
 
       }
-
+      if (dsg.equals("pcli")) {
+        System.out.println("placename: " + placeName + " RESET TO: " + 
countryname);
+        placeName = countryname;
+      }
       /**
        * add standard fields to the index
        */
       doc.add(new TextField("hierarchy", concatIndexEntry, Field.Store.YES));
       doc.add(new TextField("placename", placeName, Field.Store.YES));
+      // doc.add(new TextField("countryname", countryname, Field.Store.YES));
+      //System.out.println(placeName);
+
       doc.add(new TextField("latitude", lat, Field.Store.YES));
       doc.add(new TextField("longitude", lon, Field.Store.YES));
-      if (boostMap.containsKey(dsg)) {
-        TextField f = new TextField("loctype", dsg, Field.Store.YES);
-        f.setBoost(boostMap.get(dsg));
-        doc.add(f);
-      } else {
-        doc.add(new TextField("loctype", dsg, Field.Store.YES));
+      doc.add(new StringField("loctype", dsg, Field.Store.YES));
+      doc.add(new StringField("admincode", (ccode + "." + 
admincode).toLowerCase(), Field.Store.YES));
+      doc.add(new StringField("countrycode", ccode.toLowerCase(), 
Field.Store.YES));
+      doc.add(new StringField("countycode", "", Field.Store.YES));
+      doc.add(new StringField("locid", id, Field.Store.YES));
+      placeName = placeName.replace("republic of", "").replace("federative", 
"");
+      if (id.equals("3175395")) {
+        System.out.println(placeName);
       }
-      doc.add(new TextField("admincode", (ccode + "." + 
admincode).toLowerCase(), Field.Store.YES));
-      doc.add(new TextField("countrycode", ccode.toLowerCase(), 
Field.Store.YES));
-      doc.add(new TextField("countycode", "", Field.Store.YES));
-
-      doc.add(new TextField("locid", id, Field.Store.YES));
-      doc.add(new TextField("gazsource", "geonames", Field.Store.YES));
+      doc.add(new StringField("gazsource", "geonames", Field.Store.YES));
 
       w.addDocument(doc);
 

Modified: 
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/RegionProcessor.java
URL: 
http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/RegionProcessor.java?rev=1618251&r1=1618250&r2=1618251&view=diff
==============================================================================
--- 
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/RegionProcessor.java
 (original)
+++ 
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/RegionProcessor.java
 Fri Aug 15 18:10:51 2014
@@ -23,6 +23,7 @@ import java.util.ArrayList;
 import java.util.List;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
+import org.apache.lucene.document.StringField;
 import org.apache.lucene.document.TextField;
 import org.apache.lucene.index.IndexWriter;
 
@@ -80,15 +81,15 @@ public class RegionProcessor {
 
         doc.add(new TextField("hierarchy", hierarchy, Field.Store.YES));
         doc.add(new TextField("placename", placeName, Field.Store.YES));
-        doc.add(new TextField("latitude", lat, Field.Store.YES));
-        doc.add(new TextField("longitude", lon, Field.Store.YES));
-        doc.add(new TextField("loctype", dsg, Field.Store.YES));
-        doc.add(new TextField("admincode", "", Field.Store.YES));
-        doc.add(new TextField("countrycode", id, Field.Store.YES));
-        doc.add(new TextField("countycode", "", Field.Store.YES));
+        doc.add(new StringField("latitude", lat, Field.Store.YES));
+        doc.add(new StringField("longitude", lon, Field.Store.YES));
+        doc.add(new StringField("loctype", dsg, Field.Store.YES));
+        doc.add(new StringField("admincode", "", Field.Store.YES));
+        doc.add(new StringField("countrycode", id, Field.Store.YES));
+        doc.add(new StringField("countycode", "", Field.Store.YES));
 
-        doc.add(new TextField("locid", id, Field.Store.YES));
-        doc.add(new TextField("gazsource", "region", Field.Store.YES));
+        doc.add(new StringField("locid", id, Field.Store.YES));
+        doc.add(new StringField("gazsource", "region", Field.Store.YES));
         //countrycontext file format
         // US  KY      131     United States   Kentucky        Leslie
 

Modified: 
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/USGSProcessor.java
URL: 
http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/USGSProcessor.java?rev=1618251&r1=1618250&r2=1618251&view=diff
==============================================================================
--- 
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/USGSProcessor.java
 (original)
+++ 
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/USGSProcessor.java
 Fri Aug 15 18:10:51 2014
@@ -29,6 +29,7 @@ import java.util.logging.Logger;
 import opennlp.addons.geoentitylinker.AdminBoundary;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
+import org.apache.lucene.document.StringField;
 import org.apache.lucene.document.TextField;
 
 import org.apache.lucene.index.IndexWriter;
@@ -86,24 +87,24 @@ public class USGSProcessor {
         String countyname = "";
         String countyCode = get.getCountyCode();
         if (!get.getCountyName().equals("NO_DATA_FOUND_VALUE")) {
-          countyname =  get.getCountyName();
+          countyname = get.getCountyName();
         }
         if (!get.getCountyCode().equals("NO_DATA_FOUND_VALUE")) {
           countyCode = get.getCountyCode();
         }
-        String hierarchy = get.getCountryName() + ", " + get.getProvinceName() 
+", "+ countyname + ", " + placeName;
-
+        String hierarchy = get.getCountryName() + ", " + get.getProvinceName() 
+ ", " + countyname + ", " + placeName;
+       // doc.add(new TextField("countryname", "united states", 
Field.Store.YES));
         doc.add(new TextField("hierarchy", hierarchy, Field.Store.YES));
         doc.add(new TextField("placename", placeName, Field.Store.YES));
         doc.add(new TextField("latitude", lat, Field.Store.YES));
         doc.add(new TextField("longitude", lon, Field.Store.YES));
-        doc.add(new TextField("loctype", dsg, Field.Store.YES));
-        doc.add(new TextField("admincode", (get.getCountryCode() + "." + 
get.getProvCode()).toLowerCase(), Field.Store.YES));
-        doc.add(new TextField("countrycode", 
get.getCountryCode().toLowerCase(), Field.Store.YES));
-        doc.add(new TextField("countycode", (get.getCountryCode() + "." + 
get.getProvCode() + "." + countyCode).toLowerCase(), Field.Store.YES));
+        doc.add(new StringField("loctype", dsg, Field.Store.YES));
+        doc.add(new StringField("admincode", (get.getCountryCode() + "." + 
get.getProvCode()).toLowerCase(), Field.Store.YES));
+        doc.add(new StringField("countrycode", 
get.getCountryCode().toLowerCase(), Field.Store.YES));
+        doc.add(new StringField("countycode", (get.getCountryCode() + "." + 
get.getProvCode() + "." + countyCode).toLowerCase(), Field.Store.YES));
 
-        doc.add(new TextField("locid", id, Field.Store.YES));
-        doc.add(new TextField("gazsource", "usgs", Field.Store.YES));
+        doc.add(new StringField("locid", id, Field.Store.YES));
+        doc.add(new StringField("gazsource", "usgs", Field.Store.YES));
         w.addDocument(doc);
       }
       counter++;
@@ -118,7 +119,7 @@ public class USGSProcessor {
   }
 
   private static Map<String, AdminBoundary> getProvData(File govUnitsFile, 
GazetteerIndexer.GazType type) {
- System.out.println("Attempting to read USGS province (State) data from: " + 
govUnitsFile.getPath());
+    System.out.println("Attempting to read USGS province (State) data from: " 
+ govUnitsFile.getPath());
     Map<String, AdminBoundary> outmap = new HashMap<>();
     BufferedReader reader;
 
@@ -153,7 +154,7 @@ public class USGSProcessor {
     } catch (IOException ex) {
       ex.printStackTrace();
     }
-  System.out.println("Successfully read USGS province (State) data from: " + 
govUnitsFile.getPath());
+    System.out.println("Successfully read USGS province (State) data from: " + 
govUnitsFile.getPath());
 
     return outmap;
 
@@ -176,7 +177,7 @@ public class USGSProcessor {
          */
         String line = adm.getCountryCode() + "\t" + adm.getProvCode() + "\t" + 
adm.getCountyCode() + "\t" + country + "\t" + province + "\t" + 
adm.getCountyName() + "\n";
         writer.write(line);
-      ///  System.out.println(line);
+        ///  System.out.println(line);
 
       }
     } catch (IOException ex) {

Added: 
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/PlacetypeScorer.java
URL: 
http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/PlacetypeScorer.java?rev=1618251&view=auto
==============================================================================
--- 
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/PlacetypeScorer.java
 (added)
+++ 
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/PlacetypeScorer.java
 Fri Aug 15 18:10:51 2014
@@ -0,0 +1,75 @@
+/*
+ * Copyright 2014 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.addons.geoentitylinker.scoring;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import opennlp.addons.geoentitylinker.AdminBoundaryContext;
+import opennlp.addons.geoentitylinker.GazetteerEntry;
+import opennlp.tools.entitylinker.EntityLinkerProperties;
+import opennlp.tools.entitylinker.LinkedSpan;
+import opennlp.tools.util.Span;
+
+/**
+ * Scores every gazetteer entry linked to a span by its place type, i.e. the
+ * code returned by {@code GazetteerEntry.getItemType()}.
+ * <p>
+ * The score is written into each entry's score map under the key
+ * {@code "typescore"}:
+ * <ul>
+ * <li>{@code PCLI} scores 1.0</li>
+ * <li>any other code starting with {@code PCL} scores 0.5</li>
+ * <li>any code starting with {@code ADM} scores 0.75</li>
+ * <li>everything else, including a missing type, scores 0.0</li>
+ * </ul>
+ * Type codes are compared case-insensitively.
+ *
+ * @author mgiaconia
+ */
+public class PlacetypeScorer implements LinkedEntityScorer<AdminBoundaryContext> {
+
+  /** Key under which the computed score is stored in an entry's score map. */
+  private static final String SCORE_KEY = "typescore";
+
+  /**
+   * Place type codes considered when the lookup table is built. A code that
+   * matches no rule in {@link #fillMap()} (for example the PPL* codes) gets no
+   * table entry and therefore scores 0.
+   */
+  private static final String[] boosts = ("ADM1 ADM1H ADM2 ADM2H ADM3 ADM3H "
+      + "ADM4 ADM4H ADM5 ADMD ADMDH PCLD PCLH PCLI PCLIX TERR PPL PPLA PPLA2 PPLA3 "
+      + "PPLA4 PPLC PPLCH PPLF PPLG PPLH PPLL PPLQ PPLR PPLS PPLX STLMT").split(" ");
+
+  /** Lower-cased place type code mapped to its score. */
+  private final Map<String, Double> boostedTypes = new HashMap<>();
+
+  public PlacetypeScorer() {
+    fillMap();
+  }
+
+  /**
+   * Stores a place type score on every entry linked to every span.
+   *
+   * @param linkedSpans       spans whose linked gazetteer entries are scored
+   * @param docText           not used by this scorer
+   * @param sentenceSpans     not used by this scorer
+   * @param properties        not used by this scorer
+   * @param additionalContext not used by this scorer
+   */
+  @Override
+  public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans,
+      EntityLinkerProperties properties, AdminBoundaryContext additionalContext) {
+    for (LinkedSpan<GazetteerEntry> geospan : linkedSpans) {
+      for (GazetteerEntry gazetteerEntry : geospan.getLinkedEntries()) {
+        gazetteerEntry.getScoreMap().put(SCORE_KEY, getScore(gazetteerEntry.getItemType()));
+      }
+    }
+  }
+
+  /**
+   * Looks up the score for a place type code.
+   *
+   * @param type place type code in any letter case; may be {@code null}
+   * @return the configured score, or 0 when the type is {@code null} or has no
+   *         table entry; never {@code null}
+   */
+  private Double getScore(String type) {
+    if (type == null) {
+      return 0d;
+    }
+    Double ret = boostedTypes.get(normalize(type));
+    return ret == null ? 0d : ret;
+  }
+
+  /**
+   * Lower-cases a type code independently of the JVM default locale, so that
+   * table keys and lookups agree (e.g. "PCLI" under a Turkish locale).
+   */
+  private static String normalize(String type) {
+    return type.toLowerCase(java.util.Locale.ROOT);
+  }
+
+  /** Builds the lookup table from {@link #boosts}. */
+  private void fillMap() {
+    for (String type : boosts) {
+      if (type.equals("PCLI")) {
+        boostedTypes.put(normalize(type), 1d);
+      } else if (type.startsWith("PCL")) {
+        // PCLI was handled above, so this covers the remaining PCL* codes.
+        boostedTypes.put(normalize(type), .5d);
+      } else if (type.startsWith("ADM")) {
+        boostedTypes.put(normalize(type), .75d);
+      }
+    }
+  }
+
+}


Reply via email to