Re: svn commit: r1533959 - in /opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools: entitylinker/ entitylinker/domain/ ngram/

Jörn Kottmann Tue, 22 Oct 2013 10:48:39 -0700

You added the NGramGenerator class to the ngram package,
but we already have a class, NGramModel, to create ngrams.

Would it be possible for you to use that one instead, so we avoid duplication?


Jörn

On 10/20/2013 10:04 PM, ma...@apache.org wrote:

Author: markg
Date: Sun Oct 20 20:04:41 2013
New Revision: 1533959

URL: http://svn.apache.org/r1533959
Log:
OPENNLP-579
GeoEntityLinkerImpl: Implemented better scoring using the Dice coefficient of 
bigrams, as well as highly improved scoring based on country context. Created an 
NGramGenerator class and a FuzzyStringMatcher class, assuming they would be 
useful for other linker impls. Implemented regex-based discovery of 
country context, which enabled proximity-based analysis of the document text.
Multiple other small efficiencies in the GeoEntityLinker.

Added:
     
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/FuzzyStringMatcher.java
     
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/GeoEntityScorer.java
     
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/ngram/NGramGenerator.java
Modified:
     
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/CountryContext.java
     
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/GeoEntityLinker.java
     
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/MySQLGeoNamesGazEntry.java
     
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/MySQLGeoNamesGazLinkable.java
     
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/MySQLUSGSGazEntry.java
     
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/MySQLUSGSGazLinkable.java
     
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/domain/BaseLink.java

Modified: 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/CountryContext.java
URL: 
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/CountryContext.java?rev=1533959&r1=1533958&r2=1533959&view=diff
==============================================================================
--- 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/CountryContext.java
 (original)
+++ 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/CountryContext.java
 Sun Oct 20 20:04:41 2013
@@ -21,23 +21,42 @@ import java.sql.DriverManager;
  import java.sql.ResultSet;
  import java.sql.SQLException;
  import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
  import java.util.List;
+import java.util.Map;
+import java.util.Set;
  import java.util.logging.Level;
  import java.util.logging.Logger;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;

/**

- *Finds instances of country mentions in a String, typically a document text.
+ * Finds instances of country mentions in a String, typically a document text.
   * Used to boost or degrade scoring of linked geo entities
-
+ *
   */
  public class CountryContext {

private Connection con;

    private List<CountryContextEntry> countrydata;
+  private Map<String, Set<String>> nameCodesMap = new HashMap<String, 
Set<String>>();
+
+  public Map<String, Set<String>> getNameCodesMap() {
+    return nameCodesMap;
+  }
+
+  public void setNameCodesMap(Map<String, Set<String>> nameCodesMap) {
+    this.nameCodesMap = nameCodesMap;
+  }

public CountryContext() {

+ /**

+   * use regexFind
+   */
+  @Deprecated
    public List<CountryContextHit> find(String docText, EntityLinkerProperties 
properties) {
      List<CountryContextHit> hits = new ArrayList<CountryContextHit>();
      try {
@@ -51,7 +70,7 @@ public class CountryContext {

if (docText.contains(entry.getFull_name_nd_ro())) {

            System.out.println("\tFound Country indicator: " + 
entry.getFull_name_nd_ro());
-          CountryContextHit hit = new CountryContextHit(entry.getCc1(), 
docText.indexOf(entry.getFull_name_nd_ro()), 
docText.indexOf(entry.getFull_name_nd_ro()+ 
entry.getFull_name_nd_ro().length()));
+          CountryContextHit hit = new CountryContextHit(entry.getCc1(), 
docText.indexOf(entry.getFull_name_nd_ro()), 
docText.indexOf(entry.getFull_name_nd_ro() + 
entry.getFull_name_nd_ro().length()));
            hits.add(hit);
          }
        }
@@ -60,6 +79,81 @@ public class CountryContext {
        Logger.getLogger(CountryContext.class.getName()).log(Level.SEVERE, 
null, ex);
      }
      return hits;
+
+  }
+/**
+ * Finds mentions of countries based on a list from MySQL stored procedure 
called getCountryList. This method finds country mentions in documents,
+ * which is an essential element of the scoring that is done for geo 
linkedspans. Lazily loads the list from the database.
+ * @param docText the full text of the document
+ * @param properties EntityLinkerProperties for getting database connection
+ * @return
+ */
+  public Map<String, Set<Integer>> regexfind(String docText, 
EntityLinkerProperties properties) {
+    Map<String, Set<Integer>> hits = new HashMap<String, Set<Integer>>();
+    try {
+      if (con == null) {
+        con = getMySqlConnection(properties);
+      }
+      if (countrydata == null) {
+        countrydata = getCountryData(properties);
+      }
+      for (CountryContextEntry entry : countrydata) {
+        Pattern regex = Pattern.compile(entry.getFull_name_nd_ro(), 
Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
+        Matcher rs = regex.matcher(docText);
+        String code = entry.getCc1().toLowerCase();
+        while (rs.find()) {
+          Integer start = rs.start();
+          String hit = rs.group().toLowerCase();
+          if (hits.containsKey(code)) {
+            hits.get(code).add(start);
+          } else {
+            Set<Integer> newset = new HashSet<Integer>();
+            newset.add(start);
+            hits.put(code, newset);
+          }
+          if (!hit.equals("")) {
+            if (this.nameCodesMap.containsKey(hit)) {
+              nameCodesMap.get(hit).add(code);
+            } else {
+              HashSet<String> newset = new HashSet<String>();
+              newset.add(code);
+              nameCodesMap.put(hit, newset);
+            }
+          }
+        }
+
+      }
+
+    } catch (Exception ex) {
+      Logger.getLogger(CountryContext.class.getName()).log(Level.SEVERE, null, 
ex);
+    }
+
+    //System.out.println(hits);
+    return hits;
+  }
+/**
+ * returns a unique list of country codes
+ * @param hits the hits discovered
+ * @return
+ */
+  public static Set<String> getCountryCodes(List<CountryContextHit> hits) {
+    Set<String> ccs = new HashSet<String>();
+    for (CountryContextHit hit : hits) {
+      ccs.add(hit.getCountryCode().toLowerCase());
+    }
+    return ccs;
+  }
+
+  public static String getCountryCodeCSV(Set<String> hits) {
+    String csv = "";
+    if (hits.isEmpty()) {
+      return csv;
+    }
+
+    for (String code : hits) {
+      csv += "," + code;
+    }
+    return csv.substring(1);
    }

private Connection getMySqlConnection(EntityLinkerProperties properties) throws Exception {

@@ -73,7 +167,12 @@ public class CountryContext {
      Connection conn = DriverManager.getConnection(url, username, password);
      return conn;
    }
-
+/**
+ * reads the list from the database by calling a stored procedure 
getCountryList
+ * @param properties
+ * @return
+ * @throws SQLException
+ */
    private List<CountryContextEntry> getCountryData(EntityLinkerProperties 
properties) throws SQLException {
      List<CountryContextEntry> entries = new ArrayList<CountryContextEntry>();
      try {

Added: 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/FuzzyStringMatcher.java
URL: 
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/FuzzyStringMatcher.java?rev=1533959&view=auto
==============================================================================
--- 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/FuzzyStringMatcher.java
 (added)
+++ 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/FuzzyStringMatcher.java
 Sun Oct 20 20:04:41 2013
@@ -0,0 +1,49 @@
+/*
+ * Copyright 2013 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.entitylinker;
+
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+import opennlp.tools.ngram.NGramGenerator;
+
+/**
+ *
+ *Generates scores for string comparisons.
+ */
+public class FuzzyStringMatcher {
+/**
+ * Generates a score based on an overlap of nGrams between two strings using 
the DiceCoefficient technique.
+ *
+ * @param s1 first string
+ * @param s2 second string
+ * @param nGrams number of chars in each gram
+ * @return
+ */
+  public static double getDiceCoefficient(String s1, String s2, int nGrams) {
+    if (s1.equals("") || s1.equals("")) {
+      return 0d;
+    }
+    List<String> s1Grams = NGramGenerator.generate(s1.toCharArray(), nGrams, 
"");
+    List<String> s2Grams = NGramGenerator.generate(s2.toCharArray(), nGrams, 
"");
+
+    Set<String> overlap = new HashSet<String>(s1Grams);
+    overlap.retainAll(s2Grams);
+    double totcombigrams = overlap.size();
+
+    return (2 * totcombigrams) / (s1Grams.size() + s2Grams.size());
+  }
+}

Modified: 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/GeoEntityLinker.java
URL: 
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/GeoEntityLinker.java?rev=1533959&r1=1533958&r2=1533959&view=diff
==============================================================================
--- 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/GeoEntityLinker.java
 (original)
+++ 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/GeoEntityLinker.java
 Sun Oct 20 20:04:41 2013
@@ -19,6 +19,8 @@ import java.io.File;
  import java.io.IOException;
  import java.util.ArrayList;
  import java.util.List;
+import java.util.Map;
+import java.util.Set;
  import java.util.logging.Level;
  import java.util.logging.Logger;
  import opennlp.tools.entitylinker.domain.BaseLink;
@@ -26,17 +28,24 @@ import opennlp.tools.entitylinker.domain
  import opennlp.tools.util.Span;

/**

- * Links location entities to gazatteers.
+ * Links location entities to gazatteers. Currently supports gazateers in a
+ * MySql database (NGA and USGS)
   *
   *
   */
  public class GeoEntityLinker implements EntityLinker<LinkedSpan> {

+ GeoEntityScorer scorer = new GeoEntityScorer();

    private MySQLGeoNamesGazLinkable geoNamesGaz;// = new 
MySQLGeoNamesGazLinkable();
    private MySQLUSGSGazLinkable usgsGaz;//= new MySQLUSGSGazLinkable();
    private CountryContext countryContext;
-  private List<CountryContextHit> hits;
-  private EntityLinkerProperties props;
+  private Map<String, Set<Integer>> countryMentions;
+  private EntityLinkerProperties linkerProperties;
+  /**
+   * Flag for deciding whether to search gaz only for toponyms within countries
+   * that are mentioned in the document
+   */
+  private Boolean filterCountryContext=true;

public GeoEntityLinker() {

      if (geoNamesGaz == null || usgsGaz == null) {
@@ -50,25 +59,44 @@ public class GeoEntityLinker implements
    public List<LinkedSpan> find(String text, Span[] sentences, String[] 
tokens, Span[] names) {
      ArrayList<LinkedSpan> spans = new ArrayList<LinkedSpan>();
      try {
-      if (props == null) {
-        props = new EntityLinkerProperties(new 
File("C:\\temp\\opennlpmodels\\entitylinker.properties"));
+      if (linkerProperties == null) {
+        linkerProperties = new EntityLinkerProperties(new 
File("C:\\temp\\opennlpmodels\\entitylinker.properties"));
        }
-      if (hits == null) {
-        System.out.println("getting country context");
-        hits = countryContext.find(text, props);
-      }
-
+
+        countryMentions = countryContext.regexfind(text, linkerProperties);
+
+      //prioritize query
+      filterCountryContext = 
Boolean.valueOf(linkerProperties.getProperty("geoentitylinker.filter_by_country_context", 
"true"));
        String[] matches = Span.spansToStrings(names, tokens);
        for (int i = 0; i < matches.length; i++) {
-        System.out.println("processing match " + i + " of " + matches.length);
-        ArrayList<BaseLink> geoNamesEntries = geoNamesGaz.find(matches[i], 
names[i], hits, props);
-        ArrayList<BaseLink> usgsEntries = usgsGaz.find(matches[i], names[i], 
hits, props);
-        LinkedSpan<BaseLink> geoSpans = new 
LinkedSpan<BaseLink>(geoNamesEntries, names[i].getStart(), names[i].getEnd());
-        geoSpans.getLinkedEntries().addAll(usgsEntries);
-        geoSpans.setSearchTerm(matches[i]);
-        spans.add(geoSpans);
+
+//nga gazateer is for other than US placenames, don't use it unless US is a 
mention in the document
+        ArrayList<BaseLink> geoNamesEntries = new ArrayList<BaseLink>();
+        if (!(countryMentions.keySet().contains("us") && 
countryMentions.keySet().size() == 1) || countryMentions.keySet().size() > 1) {
+          geoNamesEntries = geoNamesGaz.find(matches[i], names[i], 
countryMentions, linkerProperties);
+        }
+        ArrayList<BaseLink> usgsEntries = new ArrayList<BaseLink>();
+        if (countryMentions.keySet().contains("us")) {
+          usgsEntries = usgsGaz.find(matches[i], names[i], countryMentions, 
linkerProperties);
+        }
+        LinkedSpan<BaseLink> geoSpan = new 
LinkedSpan<BaseLink>(geoNamesEntries, names[i].getStart(), names[i].getEnd());
+
+        if (!usgsEntries.isEmpty()) {
+          geoSpan.getLinkedEntries().addAll(usgsEntries);
+          geoSpan.setSearchTerm(matches[i]);
+        }
+
+        if (!geoSpan.getLinkedEntries().isEmpty()) {
+          geoSpan.setSearchTerm(matches[i]);
+          spans.add(geoSpan);
+        }
+
        }
-      return spans;
+      //score the spans
+
+      scorer.score(spans, countryMentions, countryContext.getNameCodesMap(), 
text, sentences, 1000);
+
+      //  return spans;
      } catch (IOException ex) {
        Logger.getLogger(GeoEntityLinker.class.getName()).log(Level.SEVERE, 
null, ex);
      }
@@ -78,12 +106,14 @@ public class GeoEntityLinker implements
    public List<LinkedSpan> find(String text, Span[] sentences, Span[] tokens, 
Span[] names) {
      ArrayList<LinkedSpan> spans = new ArrayList<LinkedSpan>();
      try {
-
-
-      if (props == null) {
-        props = new EntityLinkerProperties(new 
File("C:\\temp\\opennlpmodels\\entitylinker.properties"));
+      if (linkerProperties == null) {
+        linkerProperties = new EntityLinkerProperties(new 
File("C:\\temp\\opennlpmodels\\entitylinker.properties"));
        }
-      List<CountryContextHit> hits = countryContext.find(text, props);
+
+        //  System.out.println("getting country context");
+        //hits = countryContext.find(text, linkerProperties);
+        countryMentions = countryContext.regexfind(text, linkerProperties);
+
        //get the sentence text....must assume some index
        Span s = sentences[0];
        String sentence = text.substring(s.getStart(), s.getEnd());
@@ -92,17 +122,32 @@ public class GeoEntityLinker implements
        //get the names based on the tokens
        String[] matches = Span.spansToStrings(names, stringtokens);
        for (int i = 0; i < matches.length; i++) {
-        ArrayList<BaseLink> geoNamesEntries = geoNamesGaz.find(matches[i], 
names[i], hits, props);
-        ArrayList<BaseLink> usgsEntries = usgsGaz.find(matches[i], names[i], 
hits, props);
-        LinkedSpan<BaseLink> geoSpans = new 
LinkedSpan<BaseLink>(geoNamesEntries, names[i], 0);
-        geoSpans.getLinkedEntries().addAll(usgsEntries);
-        geoSpans.setSearchTerm(matches[i]);
-        spans.add(geoSpans);
+        //nga gazateer is for other than US placenames, don't use it unless US 
is a mention in the document
+        ArrayList<BaseLink> geoNamesEntries = new ArrayList<BaseLink>();
+        if (!(countryMentions.keySet().contains("us") && 
countryMentions.keySet().size() == 1) || countryMentions.keySet().size() > 1) {
+          geoNamesEntries = geoNamesGaz.find(matches[i], names[i], 
countryMentions, linkerProperties);
+        }
+        ArrayList<BaseLink> usgsEntries = new ArrayList<BaseLink>();
+        if (countryMentions.keySet().contains("us")) {
+          usgsEntries = usgsGaz.find(matches[i], names[i], countryMentions, 
linkerProperties);
+        }
+        LinkedSpan<BaseLink> geoSpan = new 
LinkedSpan<BaseLink>(geoNamesEntries, names[i].getStart(), names[i].getEnd());
+
+        if (!usgsEntries.isEmpty()) {
+          geoSpan.getLinkedEntries().addAll(usgsEntries);
+          geoSpan.setSearchTerm(matches[i]);
+        }
+
+        if (!geoSpan.getLinkedEntries().isEmpty()) {
+          geoSpan.setSearchTerm(matches[i]);
+          spans.add(geoSpan);
+        }
        }
-      return spans;
+
      } catch (IOException ex) {
        Logger.getLogger(GeoEntityLinker.class.getName()).log(Level.SEVERE, 
null, ex);
      }
+    scorer.score(spans, countryMentions, countryContext.getNameCodesMap(), 
text, sentences, 1000);
      return spans;
    }

@@ -110,10 +155,11 @@ public class GeoEntityLinker implements

      ArrayList<LinkedSpan> spans = new ArrayList<LinkedSpan>();
      try {

- if (props == null) {

-        props = new EntityLinkerProperties(new 
File("C:\\temp\\opennlpmodels\\entitylinker.properties"));
+      if (linkerProperties == null) {
+        linkerProperties = new EntityLinkerProperties(new 
File("C:\\temp\\opennlpmodels\\entitylinker.properties"));
        }
-      List<CountryContextHit> hits = countryContext.find(text, props);
+
+      countryMentions = countryContext.regexfind(text, linkerProperties);

Span s = sentences[sentenceIndex];

        String sentence = text.substring(s.getStart(), s.getEnd());
@@ -123,15 +169,29 @@ public class GeoEntityLinker implements
        String[] matches = Span.spansToStrings(names, stringtokens);

for (int i = 0; i < matches.length; i++) {

-        ArrayList<BaseLink> geoNamesEntries = geoNamesGaz.find(matches[i], 
names[i], hits, props);
-        ArrayList<BaseLink> usgsEntries = usgsGaz.find(matches[i], names[i], 
hits, props);
-        LinkedSpan<BaseLink> geoSpans = new 
LinkedSpan<BaseLink>(geoNamesEntries, names[i], 0);
-        geoSpans.getLinkedEntries().addAll(usgsEntries);
-        geoSpans.setSearchTerm(matches[i]);
-        geoSpans.setSentenceid(sentenceIndex);
-        spans.add(geoSpans);
+//nga gazateer is for other than US placenames, don't use it unless US is a 
mention in the document
+        ArrayList<BaseLink> geoNamesEntries = new ArrayList<BaseLink>();
+        if (!(countryMentions.keySet().contains("us") && 
countryMentions.keySet().size() == 1) || countryMentions.keySet().size() > 1) {
+          geoNamesEntries = geoNamesGaz.find(matches[i], names[i], 
countryMentions, linkerProperties);
+        }
+        ArrayList<BaseLink> usgsEntries = new ArrayList<BaseLink>();
+        if (countryMentions.keySet().contains("us")) {
+          usgsEntries = usgsGaz.find(matches[i], names[i], countryMentions, 
linkerProperties);
+        }
+        LinkedSpan<BaseLink> geoSpan = new 
LinkedSpan<BaseLink>(geoNamesEntries, names[i].getStart(), names[i].getEnd());
+
+        if (!usgsEntries.isEmpty()) {
+          geoSpan.getLinkedEntries().addAll(usgsEntries);
+          geoSpan.setSearchTerm(matches[i]);
+        }
+
+        if (!geoSpan.getLinkedEntries().isEmpty()) {
+          geoSpan.setSearchTerm(matches[i]);
+          geoSpan.setSentenceid(sentenceIndex);
+          spans.add(geoSpan);
+        }
        }
-
+      scorer.score(spans, countryMentions, countryContext.getNameCodesMap(), 
text, sentences, 2000);
      } catch (IOException ex) {
        Logger.getLogger(GeoEntityLinker.class.getName()).log(Level.SEVERE, 
null, ex);
      }
@@ -139,6 +199,6 @@ public class GeoEntityLinker implements
    }

public void setEntityLinkerProperties(EntityLinkerProperties properties) {

-    this.props = properties;
+    this.linkerProperties = properties;
    }
  }

Added: 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/GeoEntityScorer.java
URL: 
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/GeoEntityScorer.java?rev=1533959&view=auto
==============================================================================
--- 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/GeoEntityScorer.java
 (added)
+++ 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/GeoEntityScorer.java
 Sun Oct 20 20:04:41 2013
@@ -0,0 +1,256 @@
+/*
+ * Copyright 2013 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.entitylinker;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.TreeSet;
+import opennlp.tools.entitylinker.domain.BaseLink;
+import opennlp.tools.entitylinker.domain.LinkedSpan;
+import opennlp.tools.util.Span;
+
+/**
+ * Scores toponyms based on country context as well as fuzzy string matching
+ */
+public class GeoEntityScorer {
+
+  private Map<String, Set<String>> nameCodesMap;
+  String dominantCode = "";
+
+  /**
+   * Assigns a score to each BaseLink in each linkedSpan's set of N best
+   * matches. Currently the scoring indicates the probability that the toponym
+   * is correct based on the country context in the document and fuzzy string 
matching
+   *
+   * @param linkedData     the linked spans, holds the Namefinder results, and
+   *                       the list of BaseLink for each
+   * @param countryHits    all the country mentions in the document
+   * @param nameCodesMap   maps a country indicator name to a country code. 
Used
+   *                       to determine if the namefinder found the same exact
+   *                       toponym the country context did. If so the score is
+   *                       boosted due to the high probability that the
+   *                       NameFinder actually "rediscovered" a country
+   * @param docText        the full text of the document...not used in this
+   *                       default implementation
+   * @param sentences      the sentences that correspond to the doc text.
+   * @param maxAllowedDist a constant that is used to determine which country
+   *                       mentions, based on proximity within the text, should
+   *                       be used to score the Named Entity.
+   * @return
+   */
+  public List<LinkedSpan> score(List<LinkedSpan> linkedData, Map<String, Set<Integer>> 
countryHits, Map<String, Set<String>> nameCodesMap, String docText, Span[] sentences, Integer 
maxAllowedDist) {
+    this.nameCodesMap = nameCodesMap;
+    setDominantCode(countryHits);
+    for (LinkedSpan<BaseLink> linkedspan : linkedData) {
+
+      for (BaseLink link : linkedspan.getLinkedEntries()) {
+        Double dice = FuzzyStringMatcher.getDiceCoefficient(linkedspan.getSearchTerm().toLowerCase().replace(" 
", ""), link.getItemName().toLowerCase().replace(" ", ""), 2);
+        /**
+         * Since MySQL is using "boolean mode" this score will always be very
+         * high. To allow more recall, change mysql to "natural language mode",
+         * and this score will become more significant
+         */
+        link.setFuzzyStringMatchingScore(dice);
+
+      }
+      linkedspan = simpleProximityAnalysis(sentences, countryHits, linkedspan, 
maxAllowedDist);
+    }
+    return linkedData;
+  }
+/**
+ * sets class level variable to a code based on the number of mentions
+ * @param countryHits
+ */
+  private void setDominantCode(Map<String, Set<Integer>> countryHits) {
+    int hits = -1;
+    for (String code : countryHits.keySet()) {
+      if (countryHits.get(code).size() > hits) {
+        hits = countryHits.get(code).size();
+        dominantCode = code;
+      }
+    }
+  }
+
+  /**
+   * Generates distances from each country mention to the span's location in 
the
+   * doc text. Ultimately an attempt to ensure that ambiguously named toponyms
+   * are resolved to the correct country and coordinate.
+   *
+   * @param sentences
+   * @param countryHits
+   * @param span
+   * @return
+   */
+  private LinkedSpan<BaseLink> simpleProximityAnalysis(Span[] sentences, Map<String, 
Set<Integer>> countryHits, LinkedSpan<BaseLink> span, Integer maxAllowedDistance) {
+    //get the index of the actual span, begining of sentence
+    //should generate tokens from sentence and create a char offset...
+    //could have large sentences due to poor sentence detection or wonky doc 
text
+    int sentenceIdx = span.getSentenceid();
+    int sentIndexInDoc = sentences[sentenceIdx].getStart();
+    /**
+     * create a map of all the span's proximal country mentions in the document
+     * Map< countrycode, set of <distances from this NamedEntity>>
+     */
+    Map<String, Set<Integer>> distancesFromCodeMap = new HashMap<String, 
Set<Integer>>();
+    //map = Map<countrycode, Set <of distances this span is from all the mentions of 
the code>>
+    for (String cCode : countryHits.keySet()) {
+//iterate over all the regex start values and calculate an offset
+      for (Integer cHit : countryHits.get(cCode)) {
+        Integer absDist = Math.abs(sentIndexInDoc - cHit);
+        //only include near mentions based on a heuristic
+        //TODO make this a property
+        //  if (absDist < maxAllowedDistance) {
+        if (distancesFromCodeMap.containsKey(cCode)) {
+          distancesFromCodeMap.get(cCode).add(absDist);
+        } else {
+          HashSet<Integer> newset = new HashSet<Integer>();
+          newset.add(absDist);
+          distancesFromCodeMap.put(cCode, newset);
+        }
+      }
+
+      //}
+    }
+    //we now know how far this named entity is from every country mention in 
the document
+
+    /**
+     * the gaz matches that have a country code that have mentions in the doc
+     * that are closest to the Named Entity should return the best score 
Analyze
+     * map generates a likelihood score that the toponym from the gaz is
+     * referring to one of the countries Map<countrycode, prob that this span 
is
+     * referring to the toponym form this code key>
+     */
+    Map<String, Double> scoreMap = analyzeMap(distancesFromCodeMap, sentences, 
span);
+    for (BaseLink link : span.getLinkedEntries()) {
+      //getItemParentId is the country code
+      String spanCountryCode = link.getItemParentID();
+      if (scoreMap.containsKey(spanCountryCode)) {
+        link.setScore(scoreMap.get(spanCountryCode));
+        ///does the name extracted match a country name?
+        if (nameCodesMap.containsKey(link.getItemName().toLowerCase())) {
+          //if so, is it the correct country code for that name
+          if 
(nameCodesMap.get(link.getItemName().toLowerCase()).contains(link.getItemParentID()))
 {
+            //boost the score becuase it is likely that this is the location 
in the text, so add 50% to the score or set to 1
+            //TODO: make this multiplier configurable
+            //TODO: improve this with a geographic/geometry based clustering 
(linear binning to be more precise) of points returned from the gaz
+            Double score = (link.getScore() + .75) > 1.0 ? 1d : 
(link.getScore() + .75);
+            //boost the score if the hit is from the dominant country context
+
+            if(link.getItemParentID().equals(dominantCode)){
+              score = (score + .25) > 1.0 ? 1d : (score + .25);
+            }
+            link.setScore(score);
+
+          }
+
+        }
+      }
+    }
+    return span;
+  }
+
+  /**
+   * takes a map of distances from the NE to each country mention and generates
+   * a map of scores for each country code. The map is then correlated to teh
+   * correlated to the code of the BaseLink parentid for retrieval. Then the
+   * score is added to the overall.
+   *
+   * @param distanceMap
+   * @param sentences
+   * @param span
+   * @return
+   */
+  private Map<String, Double> analyzeMap(Map<String, Set<Integer>> distanceMap, 
Span[] sentences, LinkedSpan<BaseLink> span) {
+
+    Map<String, Double> scoreMap = new HashMap<String, Double>();
+    TreeSet<Integer> all = new TreeSet<Integer>();
+    for (String key : distanceMap.keySet()) {
+      all.addAll(distanceMap.get(key));
+    }
+    //get min max for normalization, this could be more efficient
+    Integer min = all.first();
+    Integer max = all.last();
+    for (String key : distanceMap.keySet()) {
+
+      TreeSet<Double> normalizedDistances = new TreeSet<Double>();
+      for (Integer i : distanceMap.get(key)) {
+        Double norm = normalize(i, min, max);
+        //reverse the normed distance so low numbers (closer) are better
+        //this could be improved with a "decaying " function using an 
imcreaseing negative exponent
+        Double reverse = Math.abs(norm - 1);
+        normalizedDistances.add(reverse);
+      }
+
+
+      List<Double> doubles = new ArrayList<Double>(normalizedDistances);
+      scoreMap.put(key, slidingDistanceAverage(doubles));
+    }
+    return scoreMap;
+  }
+
+  /**
+   * this method is an attempt to make closer clusters of mentions group
+   * together to smooth out the average, so one distant outlier does not kill
+   * the score for an obviously good hit. More elegant solution is possible
+   * using Math.pow, and making the score decay with distance by using an
+   * increasing negative exponent
+   *
+   * @param normDis the normalized and sorted set of distances as a list
+   * @return
+   */
+  private Double slidingDistanceAverage(List<Double> normDis) {
+    List<Double> windowOfAverages = new ArrayList<Double>();
+
+    if (normDis.size() < 3) {
+      windowOfAverages.addAll(normDis);
+    } else {
+
+      for (int i = 0; i < normDis.size() - 1; i++) {
+        double a = normDis.get(i);
+        double b = normDis.get(i + 1);
+        windowOfAverages.add((a + b) / 2);
+
+      }
+    }
+    double sum = 0d;
+    for (double d : windowOfAverages) {
+      sum += d;
+    }
+    double result = sum / windowOfAverages.size();
+    //TODO: ++ prob when large amounts of mentions for a code
+    //System.out.println("avg of window:" + result);
+    return result;
+  }
+
+  /**
+   * transposes a value within one range to a relative value in a different
+   * range. Used to normalize distances in this class.
+   *
+   * @param valueToNormalize the value to place within the new range
+   * @param minimum          the min of the set to be transposed
+   * @param maximum          the max of the set to be transposed
+   * @return
+   */
+  private Double normalize(int valueToNormalize, int minimum, int maximum) {
+    Double d = (double) ((1 - 0) * (valueToNormalize - minimum)) / (maximum - 
minimum) + 0;
+    d = d == null ? 0d : d;
+    return d;
+  }
+}

Modified: 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/MySQLGeoNamesGazEntry.java
URL: 
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/MySQLGeoNamesGazEntry.java?rev=1533959&r1=1533958&r2=1533959&view=diff
==============================================================================
--- 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/MySQLGeoNamesGazEntry.java
 (original)
+++ 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/MySQLGeoNamesGazEntry.java
 Sun Oct 20 20:04:41 2013
@@ -18,59 +18,31 @@ package opennlp.tools.entitylinker;
  import opennlp.tools.entitylinker.domain.BaseLink;

/**

- *
+ * Stores an entry from the NGA Geonames gazetteer

*/

  public class MySQLGeoNamesGazEntry extends BaseLink
  {
-  ////actual fields returned
-//ufi,
-//latitude,
-//longitude,
-//cc1,
-//adm1,
-//dsg,
-//SHORT_FORM ,
-//     SORT_NAME_RO ,
-//     FULL_NAME_RO ,
-//     FULL_NAME_ND_RO ,
-//     SORT_NAME_RG ,
-//     FULL_NAME_RG ,
-//     FULL_NAME_ND_RG ,
-//match(`SHORT_FORM` ,`SORT_NAME_RO`,`FULL_NAME_RO`,`FULL_NAME_ND_RO` 
,`SORT_NAME_RG` ,`FULL_NAME_RG` ,`FULL_NAME_ND_RG`)
-//against(pSearch in natural language mode) as rank
-
-  ///////
-
- // private String RC;// VARCHAR(150) NULL DEFAULT NULL,
+
    private String UFI;
-  //private String UNI;
+
    private Double LATITUDE; //DOUBLE NULL DEFAULT NULL,
    private Double LONGITUDE;// DOUBLE NULL DEFAULT NULL,
- // private String DMS_LAT;// VARCHAR(150) NULL DEFAULT NULL,
- // private String DMS_LONG;// VARCHAR(150) NULL DEFAULT NULL,
- // private String MGRS;// VARCHAR(150) NULL DEFAULT NULL,
-//  private String JOG;// VARCHAR(150) NULL DEFAULT NULL,
- // private String FC;// VARCHAR(150) NULL DEFAULT NULL,
+
    private String DSG;// VARCHAR(150) NULL DEFAULT NULL,
- // private String PC;// VARCHAR(150) NULL DEFAULT NULL,
+
    private String CC1;//` VARCHAR(150) NULL DEFAULT NULL,
    private String ADM1;// VARCHAR(150) NULL DEFAULT NULL,
- // private String POP;// VARCHAR(150) NULL DEFAULT NULL,
-  //private String ELEV;//VARCHAR(150) NULL DEFAULT NULL,
-//  private String CC2;// VARCHAR(150) NULL DEFAULT NULL,
- // private String NT;//VARCHAR(150) NULL DEFAULT NULL,
- // private String LC;// VARCHAR(150) NULL DEFAULT NULL,
+
    private String SHORT_FORM;// VARCHAR(500) NULL DEFAULT NULL,
- // private String GENERIC;// VARCHAR(150) NULL DEFAULT NULL,
+
    private String SORT_NAME_RO;//VARCHAR(500) NULL DEFAULT NULL,
    private String FULL_NAME_RO;// VARCHAR(500) NULL DEFAULT NULL,
    private String FULL_NAME_ND_RO;// VARCHAR(500) NULL DEFAULT NULL,
    private String SORT_NAME_RG;// VARCHAR(500) NULL DEFAULT NULL,
    private String FULL_NAME_RG;// VARCHAR(500) NULL DEFAULT NULL,
    private String FULL_NAME_ND_RG;// VARCHAR(500) NULL DEFAULT NULL,
-//  private String NOTE;//VARCHAR(500) NULL DEFAULT NULL,
- // private String MODIFY_DATE;// VARCHAR(150) NULL DEFAULT NULL,
+
  private Double rank;

public String getUFI()


Modified: 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/MySQLGeoNamesGazLinkable.java
URL: 
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/MySQLGeoNamesGazLinkable.java?rev=1533959&r1=1533958&r2=1533959&view=diff
==============================================================================
--- 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/MySQLGeoNamesGazLinkable.java
 (original)
+++ 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/MySQLGeoNamesGazLinkable.java
 Sun Oct 20 20:04:41 2013
@@ -1,17 +1,13 @@
  package opennlp.tools.entitylinker;

-/**

- *
- * @author Owner
- */
+
  import java.sql.CallableStatement;
  import java.sql.Connection;
  import java.sql.DriverManager;
  import java.sql.ResultSet;
  import java.sql.SQLException;
  import java.util.ArrayList;
-import java.util.HashSet;
-import java.util.List;
+import java.util.Map;
  import java.util.Set;
  import java.util.logging.Level;
  import java.util.logging.Logger;
@@ -20,7 +16,7 @@ import opennlp.tools.util.Span;

/**

   *
- *
+ * Links names to the NGA gazetteer
   */
  public final class MySQLGeoNamesGazLinkable {

@@ -30,7 +26,7 @@ public final class MySQLGeoNamesGazLinka

    public MySQLGeoNamesGazLinkable() {
    }

- public ArrayList<BaseLink> find(String locationText, Span span, List<CountryContextHit> countryHits, EntityLinkerProperties properties) {

+  public ArrayList<BaseLink> find(String locationText, Span span, Map<String, 
Set<Integer>> countryHits, EntityLinkerProperties properties) {
      ArrayList<BaseLink> returnlocs = new ArrayList<BaseLink>();

try {

@@ -40,13 +36,13 @@ public final class MySQLGeoNamesGazLinka
        //   pull from config to utilize country context filtering
        filterCountryContext = 
Boolean.valueOf(properties.getProperty("geoentitylinker.filter_by_country_context", 
"false"));

- Set<String> countrycodes = getCountryCodes(countryHits);

+
        String thresh = properties.getProperty("mysqlusgsgazscorethresh", "25");
        int threshhold = -1;
        if (!thresh.matches("[azAZ]")) {
          threshhold = Integer.valueOf(thresh);
        }
-      returnlocs.addAll(this.searchGaz(locationText, threshhold, countrycodes, 
properties));
+      returnlocs.addAll(this.searchGaz(locationText, threshhold, 
countryHits.keySet(), properties));

} catch (Exception ex) {

@@ -56,7 +52,7 @@ public final class MySQLGeoNamesGazLinka
    }

protected Connection getMySqlConnection(EntityLinkerProperties property) throws Exception {

-   // EntityLinkerProperties property = new EntityLinkerProperties(new 
File("c:\\temp\\opennlpmodels\\entitylinker.properties"));
+    // EntityLinkerProperties property = new EntityLinkerProperties(new 
File("c:\\temp\\opennlpmodels\\entitylinker.properties"));
      String driver = property.getProperty("mysql.driver", 
"org.gjt.mm.mysql.Driver");
      String url = property.getProperty("mysql.url", 
"jdbc:mysql://localhost:3306/world");
      String username = property.getProperty("mysql.username", "root");
@@ -73,16 +69,23 @@ public final class MySQLGeoNamesGazLinka
        con = getMySqlConnection(properties);
      }
      CallableStatement cs;
-    cs = con.prepareCall("CALL `search_geonames`(?, ?)");
+    cs = con.prepareCall("CALL `search_geonames`(?, ?, ?)");
      cs.setString(1, this.format(searchString));
      cs.setInt(2, matchthresh);
-    ArrayList<MySQLGeoNamesGazEntry> retLocs = new 
ArrayList<MySQLGeoNamesGazEntry>();
+    if (filterCountryContext) {
+      cs.setString(3,CountryContext.getCountryCodeCSV(countryCodes));
+    } else {
+      //database stored procedure handles empty string
+      cs.setString(3, "");
+    }
+
+    ArrayList<MySQLGeoNamesGazEntry> toponyms = new 
ArrayList<MySQLGeoNamesGazEntry>();
      ResultSet rs;
      try {
        rs = cs.executeQuery();

if (rs == null) {

-        return retLocs;
+        return toponyms;
        }

while (rs.next()) {

@@ -117,17 +120,13 @@ public final class MySQLGeoNamesGazLinka

s.setRank(rs.getDouble(14));- if (filterCountryContext) {

-          if (countryCodes.contains(s.getCC1().toLowerCase())) {
-          //  System.out.println(searchString +" GeoNames qualified on: " + 
s.getCC1());
-            s.setRank(s.getRank() + 1.0);
-          } else {
-         //    System.out.println(s.getFULL_NAME_ND_RO() + ", with CC1 of "+ s.getCC1()+ 
", is not within countries discovered in the document. The Country list used to discover 
countries can be modified in mysql procedure getCountryList()");
-            continue;
-          }
-        }
-
-        retLocs.add(s);
+            //set the base link data
+        s.setItemName(s.getFULL_NAME_ND_RO().toLowerCase().trim());
+        s.setItemID(s.getUFI());
+        s.setItemType(s.getDSG());
+        s.setItemParentID(s.getCC1().toLowerCase());
+
+        toponyms.add(s);
        }

} catch (SQLException ex) {

@@ -138,16 +137,10 @@ public final class MySQLGeoNamesGazLinka
        con.close();
      }

- return retLocs;

+    return toponyms;
    }

- private Set<String> getCountryCodes(List<CountryContextHit> hits) {

-    Set<String> ccs = new HashSet<String>();
-    for (CountryContextHit hit : hits) {
-      ccs.add(hit.getCountryCode().toLowerCase());
-    }
-    return ccs;
-  }
+

public String format(String entity) {

      return "\"" + entity + "\"";

Modified: 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/MySQLUSGSGazEntry.java
URL: 
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/MySQLUSGSGazEntry.java?rev=1533959&r1=1533958&r2=1533959&view=diff
==============================================================================
--- 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/MySQLUSGSGazEntry.java
 (original)
+++ 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/MySQLUSGSGazEntry.java
 Sun Oct 20 20:04:41 2013
@@ -18,7 +18,7 @@ package opennlp.tools.entitylinker;
  import opennlp.tools.entitylinker.domain.BaseLink;

/**

- *
+ * Stores an entry from the USGS gazetteer

*/

  public class MySQLUSGSGazEntry extends BaseLink

Modified: 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/MySQLUSGSGazLinkable.java
URL: 
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/MySQLUSGSGazLinkable.java?rev=1533959&r1=1533958&r2=1533959&view=diff
==============================================================================
--- 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/MySQLUSGSGazLinkable.java
 (original)
+++ 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/MySQLUSGSGazLinkable.java
 Sun Oct 20 20:04:41 2013
@@ -23,6 +23,7 @@ import java.sql.SQLException;
  import java.util.ArrayList;
  import java.util.HashSet;
  import java.util.List;
+import java.util.Map;
  import java.util.Set;
  import java.util.logging.Level;
  import java.util.logging.Logger;
@@ -30,8 +31,7 @@ import opennlp.tools.entitylinker.domain
  import opennlp.tools.util.Span;

/**

- *
- * @author opennlp
+ * Links names to the USGS gazetteer
   */
  public class MySQLUSGSGazLinkable {

@@ -41,12 +41,12 @@ public class MySQLUSGSGazLinkable {

    public MySQLUSGSGazLinkable() {
    }

- public ArrayList<BaseLink> find(String locationText, Span span, List<CountryContextHit> countryHits, EntityLinkerProperties properties) {

+  public ArrayList<BaseLink> find(String locationText, Span span, Map<String, 
Set<Integer>> countryHits, EntityLinkerProperties properties) {
      ArrayList<BaseLink> returnlocs = new ArrayList<BaseLink>();
      try {
        filterCountryContext = 
Boolean.valueOf(properties.getProperty("geoentitylinker.filter_by_country_context", 
"false"));
        //the usgs gazateer only has us geonames, so only use it if the user 
doesn't care about country isolation or the hits contain us
-      if (getCountryCodes(countryHits).contains("us") || 
!filterCountryContext) {
+      if (countryHits.keySet().contains("us") || !filterCountryContext) {

if (con == null) {

            con = getMySqlConnection(properties);
@@ -56,7 +56,7 @@ public class MySQLUSGSGazLinkable {
          if (!thresh.matches("[azAZ]")) {
            threshhold = Integer.valueOf(thresh);
          }
-        returnlocs.addAll(this.searchGaz(locationText, threshhold, 
getCountryCodes(countryHits), properties));
+        returnlocs.addAll(this.searchGaz(locationText, threshhold, 
countryHits.keySet(), properties));
        }
      } catch (Exception ex) {
        
Logger.getLogger(MySQLUSGSGazLinkable.class.getName()).log(Level.SEVERE, null, 
ex);
@@ -84,13 +84,13 @@ public class MySQLUSGSGazLinkable {
      cs = con.prepareCall("CALL `search_gaz`(?, ?)");
      cs.setString(1, this.format(searchString));
      cs.setInt(2, matchthresh);
-    ArrayList<MySQLUSGSGazEntry> retUrls = new ArrayList<MySQLUSGSGazEntry>();
+    ArrayList<MySQLUSGSGazEntry> toponyms = new ArrayList<MySQLUSGSGazEntry>();
      ResultSet rs;
      try {
        rs = cs.executeQuery();

if (rs == null) {

-        return retUrls;
+        return toponyms;
        }

while (rs.next()) {

@@ -99,21 +99,20 @@ public class MySQLUSGSGazLinkable {

s.setFeatureid(String.valueOf(rs.getLong(2)));

          s.setFeaturename(rs.getString(3));
+
          s.setFeatureclass(rs.getString(4));
          s.setStatealpha(rs.getString(5));
          s.setPrimarylatitudeDEC(rs.getDouble(6));
          s.setPrimarylongitudeDEC(rs.getDouble(7));
          s.setMapname(rs.getString(8));
-        if (countryCodes.contains("us")) {
-          s.setRank(s.getRank() + (s.getRank() * .5));
-         // System.out.println(searchString +"USGS qualified on: " + 
s.getFeaturename());
-        } else {
-          s.setRank(s.getRank() * .5);
-          if(filterCountryContext){
-            continue;
-          }
-        }
-        retUrls.add(s);
+
+        //set the base link data
+        s.setItemName(s.getFeaturename().toLowerCase().trim());
+        s.setItemID(s.getFeatureid());
+        s.setItemType(s.getFeatureclass());
+        s.setItemParentID("us");
+
+        toponyms.add(s);
        }

} catch (SQLException ex) {

@@ -124,7 +123,7 @@ public class MySQLUSGSGazLinkable {
        con.close();
      }

- return retUrls;

+    return toponyms;
    }

private Set<String> getCountryCodes(List<CountryContextHit> hits) {


Modified: 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/domain/BaseLink.java
URL: 
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/domain/BaseLink.java?rev=1533959&r1=1533958&r2=1533959&view=diff
==============================================================================
--- 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/domain/BaseLink.java
 (original)
+++ 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/domain/BaseLink.java
 Sun Oct 20 20:04:41 2013
@@ -13,29 +13,48 @@
   * See the License for the specific language governing permissions and
   * limitations under the License.
   */
-
  package opennlp.tools.entitylinker.domain;

/**

   * Stores a minimal tuple of information. Intended to be used with LinkedSpan
   *
-
+ *
   */
  public abstract class BaseLink {

+ private String itemParentID;

    private String itemID;
    private String itemName;
    private String itemType;
+  private Double score;
+  private Double fuzzyStringMatchingScore;

public BaseLink() {

- public BaseLink(String itemID, String itemName, String itemType) {

+  public BaseLink(String itemParentID, String itemID, String itemName, String 
itemType) {
+    this.itemParentID = itemParentID;
      this.itemID = itemID;
      this.itemName = itemName;
      this.itemType = itemType;
    }

+ public Double getScore() {

+    return score;
+  }
+
+  public void setScore(Double score) {
+    this.score = score;
+  }
+
+  public String getItemParentID() {
+    return itemParentID;
+  }
+
+  public void setItemParentID(String itemParentID) {
+    this.itemParentID = itemParentID;
+  }
+
    /**
     * returns the itemid
     *
@@ -93,10 +112,16 @@ public abstract class BaseLink {
      this.itemType = itemType;
    }

-

-
    @Override
    public String toString() {
      return "BaseLink{" + "itemID=" + itemID + ", itemName=" + itemName + ", 
itemType=" + itemType + '}';
    }
+
+  public Double getFuzzyStringMatchingScore() {
+    return fuzzyStringMatchingScore;
+  }
+
+  public void setFuzzyStringMatchingScore(Double fuzzyStringMatchingScore) {
+    this.fuzzyStringMatchingScore = fuzzyStringMatchingScore;
+  }
  }
\ No newline at end of file

Added: 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/ngram/NGramGenerator.java
URL: 
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/ngram/NGramGenerator.java?rev=1533959&view=auto
==============================================================================
--- 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/ngram/NGramGenerator.java
 (added)
+++ 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/ngram/NGramGenerator.java
 Sun Oct 20 20:04:41 2013
@@ -0,0 +1,75 @@
+/*
+ * Copyright 2013 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.ngram;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Generates n-grams with a sliding window over a list of tokens or an array
+ * of chars, optionally joining the members of each gram with a separator, and
+ * returns the grams as a list of strings.
+ */
+public class NGramGenerator {
+
+  /**
+   * Creates n-grams whose tokens are joined by the separator, i.e. a,b,c,d
+   * with n = 3 and separator = "-" would return a-b-c,b-c-d.
+   *
+   * @param input     the input tokens the output ngrams will be derived from
+   * @param n         the number of tokens in the sliding window
+   * @param separator placed between the tokens of each gram; pass in the
+   *                  empty string if no separator is desired
+   * @return the grams in input order; empty when n is less than 1 or larger
+   *         than the number of tokens
+   * @throws NullPointerException if separator is null
+   */
+  public static List<String> generate(List<String> input, int n, String separator) {
+    if (separator == null) {
+      throw new NullPointerException("separator must not be null");
+    }
+    List<String> outGrams = new ArrayList<String>();
+    if (n < 1) {
+      // A window of zero or negative width contains no grams.
+      return outGrams;
+    }
+    for (int start = 0; start + n <= input.size(); start++) {
+      StringBuilder gram = new StringBuilder();
+      for (int x = start; x < start + n; x++) {
+        // Separator goes between members only, so nothing has to be trimmed.
+        if (x > start) {
+          gram.append(separator);
+        }
+        gram.append(input.get(x));
+      }
+      outGrams.add(gram.toString());
+    }
+    return outGrams;
+  }
+
+  /**
+   * Creates n-grams from a char[] input, i.e. a,b,c with n = 2 and the empty
+   * string as separator would return ab,bc.
+   *
+   * @param input     the array of chars to convert to n-grams
+   * @param n         the number of chars that each output gram will consist of
+   * @param separator placed between the chars of each gram; pass in the empty
+   *                  string if no separator is desired
+   * @return the grams in input order; empty when n is less than 1 or larger
+   *         than the number of chars
+   * @throws NullPointerException if separator is null
+   */
+  public static List<String> generate(char[] input, int n, String separator) {
+    if (separator == null) {
+      throw new NullPointerException("separator must not be null");
+    }
+    List<String> outGrams = new ArrayList<String>();
+    if (n < 1) {
+      // A window of zero or negative width contains no grams.
+      return outGrams;
+    }
+    for (int start = 0; start + n <= input.length; start++) {
+      StringBuilder gram = new StringBuilder();
+      for (int x = start; x < start + n; x++) {
+        // Separator goes between members only, so nothing has to be trimmed.
+        if (x > start) {
+          gram.append(separator);
+        }
+        gram.append(input[x]);
+      }
+      outGrams.add(gram.toString());
+    }
+    return outGrams;
+  }
+}

Re: svn commit: r1533959 - in /opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools: entitylinker/ entitylinker/domain/ ngram/

Reply via email to