[2/4] opennlp-addons git commit: Fix checkstyle errors in geoentitylinker

joern Mon, 24 Apr 2017 06:20:43 -0700

http://git-wip-us.apache.org/repos/asf/opennlp-addons/blob/9adc2525/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/RegionProcessor.java
----------------------------------------------------------------------
diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/RegionProcessor.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/RegionProcessor.java
index f457822..027efc2 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/RegionProcessor.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/RegionProcessor.java
@@ -1,113 +1,115 @@
-/*
- * Copyright 2014 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package opennlp.addons.geoentitylinker.indexing;
-
-import java.io.BufferedReader;
-import java.io.File;
-import java.io.FileReader;
-import java.io.FileWriter;
-import java.util.ArrayList;
-import java.util.List;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.document.StringField;
-import org.apache.lucene.document.TextField;
-import org.apache.lucene.index.IndexWriter;
-
-public class RegionProcessor {
-
-  public static void main(String[] args) {
-    RegionProcessor.process(new File("C:\\temp\\gazetteers\\regions.txt"), new File("C:\\temp\\gazetteers\\testRegionContext.txt"), null);
-  }
-
-  /**
-   *
-   * @param regionsFile the file that stores Region references. the format of
-   * this file is tab delimitted text with index 0 as the name of the region,
-   * index 1 as the longitude, and index 2 as the latitude
-   * @param outputCountryContextfile this is the country context files shared by
-   * all indexing processors
-   * @param w
-   */
-  public static void process(File regionsFile, File outputCountryContextfile, IndexWriter w) {
-    try {
-      readFile(regionsFile, outputCountryContextfile, w);
-    } catch (Exception ex) {
-      ex.printStackTrace();
-    }
-  }
-
-  public static void readFile(File gazateerInputData, File outputCountryContextfile, IndexWriter w) throws Exception {
-    List<String> ccfileentries = new ArrayList<>();
-    BufferedReader reader = new BufferedReader(new FileReader(gazateerInputData));
-    List<String> fields = new ArrayList<>();
-    int counter = 0;
-    System.out.println("reading gazetteer data from Regions file...........");
-    String line = "";
-    while ((line = reader.readLine()) != null) {
-
-      String[] values = line.split("\t");
-      if (counter == 0) {
-
-      } else {
-        Document doc = new Document();
-        for (int i = 0; i < fields.size() - 1; i++) {
-          doc.add(new TextField(fields.get(i), values[i].trim(), Field.Store.YES));
-        }
-        String placeName = values[0];
-        String lat = values[2];
-        String lon = values[1];
-        String dsg = "region";
-        String id = "rg" + counter;
-
-        String hierarchy = placeName;
-
-        doc.add(new TextField("hierarchy", hierarchy, Field.Store.YES));
-        doc.add(new TextField("placename", placeName, Field.Store.YES));
-        doc.add(new StringField("latitude", lat, Field.Store.YES));
-        doc.add(new StringField("longitude", lon, Field.Store.YES));
-        doc.add(new StringField("loctype", dsg, Field.Store.YES));
-        doc.add(new StringField("admincode", "", Field.Store.YES));
-        doc.add(new StringField("countrycode", id, Field.Store.YES));
-        doc.add(new StringField("countycode", "", Field.Store.YES));
-
-        doc.add(new StringField("locid", id, Field.Store.YES));
-        doc.add(new StringField("gazsource", "region", Field.Store.YES));
-        //countrycontext file format
-        // US  KY      131     United States   Kentucky        Leslie
-
-        ccfileentries.add(id + "\t" + id + "\t" + id + "\t" + placeName + "\t" + "NO_DATA_FOUND" + "\t" + "NO_DATA_FOUND" + "\t" + "("
-            + placeName + ")" + "\t" + "NO_DATA_FOUND" + "\t" + "NO_DATA_FOUND" + "\n");
-        if (w != null) {
-          w.addDocument(doc);
-        }
-      }
-      counter++;
-
-    }
-    if (w != null) {
-      w.commit();
-    }
-    FileWriter writer = new FileWriter(outputCountryContextfile, true);
-    for (String string : ccfileentries) {
-      writer.write(string);
-    }
-    System.out.println("successfully wrote Region entries to country oontext file");
-    writer.close();
-    System.out.println("Completed indexing regions!");
-  }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.addons.geoentitylinker.indexing;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileReader;
+import java.io.FileWriter;
+import java.util.ArrayList;
+import java.util.List;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.StringField;
+import org.apache.lucene.document.TextField;
+import org.apache.lucene.index.IndexWriter;
+
+public class RegionProcessor {
+
+  public static void main(String[] args) {
+    RegionProcessor.process(new File("C:\\temp\\gazetteers\\regions.txt"), new File("C:\\temp\\gazetteers\\testRegionContext.txt"), null);
+  }
+
+  /**
+   *
+   * @param regionsFile the file that stores Region references. the format of
+   * this file is tab delimitted text with index 0 as the name of the region,
+   * index 1 as the longitude, and index 2 as the latitude
+   * @param outputCountryContextfile this is the country context files shared by
+   * all indexing processors
+   * @param w
+   */
+  public static void process(File regionsFile, File outputCountryContextfile, IndexWriter w) {
+    try {
+      readFile(regionsFile, outputCountryContextfile, w);
+    } catch (Exception ex) {
+      ex.printStackTrace();
+    }
+  }
+
+  public static void readFile(File gazateerInputData, File outputCountryContextfile, IndexWriter w) throws Exception {
+    List<String> ccfileentries = new ArrayList<>();
+    BufferedReader reader = new BufferedReader(new FileReader(gazateerInputData));
+    List<String> fields = new ArrayList<>();
+    int counter = 0;
+    System.out.println("reading gazetteer data from Regions file...........");
+    String line = "";
+    while ((line = reader.readLine()) != null) {
+
+      String[] values = line.split("\t");
+      if (counter == 0) {
+
+      } else {
+        Document doc = new Document();
+        for (int i = 0; i < fields.size() - 1; i++) {
+          doc.add(new TextField(fields.get(i), values[i].trim(), Field.Store.YES));
+        }
+        String placeName = values[0];
+        String lat = values[2];
+        String lon = values[1];
+        String dsg = "region";
+        String id = "rg" + counter;
+
+        String hierarchy = placeName;
+
+        doc.add(new TextField("hierarchy", hierarchy, Field.Store.YES));
+        doc.add(new TextField("placename", placeName, Field.Store.YES));
+        doc.add(new StringField("latitude", lat, Field.Store.YES));
+        doc.add(new StringField("longitude", lon, Field.Store.YES));
+        doc.add(new StringField("loctype", dsg, Field.Store.YES));
+        doc.add(new StringField("admincode", "", Field.Store.YES));
+        doc.add(new StringField("countrycode", id, Field.Store.YES));
+        doc.add(new StringField("countycode", "", Field.Store.YES));
+
+        doc.add(new StringField("locid", id, Field.Store.YES));
+        doc.add(new StringField("gazsource", "region", Field.Store.YES));
+        //countrycontext file format
+        // US  KY      131     United States   Kentucky        Leslie
+
+        ccfileentries.add(id + "\t" + id + "\t" + id + "\t" + placeName + "\t" + "NO_DATA_FOUND" + "\t" + "NO_DATA_FOUND" + "\t" + "("
+            + placeName + ")" + "\t" + "NO_DATA_FOUND" + "\t" + "NO_DATA_FOUND" + "\n");
+        if (w != null) {
+          w.addDocument(doc);
+        }
+      }
+      counter++;
+
+    }
+    if (w != null) {
+      w.commit();
+    }
+    FileWriter writer = new FileWriter(outputCountryContextfile, true);
+    for (String string : ccfileentries) {
+      writer.write(string);
+    }
+    System.out.println("successfully wrote Region entries to country oontext file");
+    writer.close();
+    System.out.println("Completed indexing regions!");
+  }
+
+}


http://git-wip-us.apache.org/repos/asf/opennlp-addons/blob/9adc2525/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/USGSProcessor.java
----------------------------------------------------------------------
diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/USGSProcessor.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/USGSProcessor.java
index fcd61c1..61b2120 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/USGSProcessor.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/USGSProcessor.java
@@ -1,251 +1,254 @@
-/*
- * Copyright 2014 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package opennlp.addons.geoentitylinker.indexing;
-
-import java.io.BufferedReader;
-import java.io.File;
-import java.io.FileReader;
-import java.io.FileWriter;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.logging.Level;
-import java.util.logging.Logger;
-import opennlp.addons.geoentitylinker.AdminBoundary;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.document.StringField;
-import org.apache.lucene.document.TextField;
-
-import org.apache.lucene.index.IndexWriter;
-
-public class USGSProcessor {
-
-  public static void main(String[] args) {
-    try {
-      Map<String, AdminBoundary> provData = getProvData(new File("C:\\temp\\gazetteers\\usgsdata\\GOVT_UNITS_20140601.txt"), GazetteerIndexer.GazType.USGS);
-      process(new File("C:\\temp\\gazetteers\\usgsdata\\GOVT_UNITS_20140601.txt"), new File("C:\\temp\\gazetteers\\usgsdata\\NationalFile_20140601.txt"), null, null);
-    } catch (Exception ex) {
-      Logger.getLogger(USGSProcessor.class.getName()).log(Level.SEVERE, null, ex);
-    }
-  }
-
-  public static void process(File lookupData, File usgsGazDataFile, File outputCountryContextfile, IndexWriter w) throws Exception {
-    Map<String, AdminBoundary> provData = getProvData(lookupData, GazetteerIndexer.GazType.USGS);
-    readFile(usgsGazDataFile, w, GazetteerIndexer.GazType.USGS, provData);
-    writeCountryContextFile(outputCountryContextfile, provData);
-  }
-
-  public static void readFile(File gazateerInputData, IndexWriter w, GazetteerIndexer.GazType type, Map<String, AdminBoundary> lookupMap) throws Exception {
-
-    Map<String, StateCentroid> states = new HashMap<>();
-    BufferedReader reader = new BufferedReader(new FileReader(gazateerInputData));
-    List<String> fields = new ArrayList<>();
-    int counter = 0;
-    System.out.println("reading gazetteer data from USGS file...........");
-    String line = "";
-    while ((line = reader.readLine()) != null) {
-
-      String[] values = line.split(type.getSeparator());
-      if (counter == 0) {
-        for (String columnName : values) {
-          fields.add(columnName.replace("Â»Â¿", "").trim());
-        }
-
-      } else {
-        Document doc = new Document();
-        for (int i = 0; i < fields.size() - 1; i++) {
-          doc.add(new TextField(fields.get(i), values[i].trim(), Field.Store.YES));
-        }
-        String placeName = values[1];
-        String lat = values[9];
-        String lon = values[10];
-        String dsg = values[2];
-        String id = values[0];
-
-        String ccode = values[6];
-        String admincode = values[3];
-        AdminBoundary get = lookupMap.get(admincode + "." + ccode);
-        String countyname = "";
-        if (get == null) {
-          System.out.println("null...continuing to index" + " ccode: " + ccode + " , admincode: " + admincode + " , placename: " + placeName);
-          continue;
-
-        }
-        String countyCode = get.getCountyCode();
-
-        if (!get.getCountyName().equals("NO_DATA_FOUND_VALUE")) {
-          countyname = get.getCountyName();
-        }
-        if (!get.getCountyCode().equals("NO_DATA_FOUND_VALUE")) {
-          countyCode = get.getCountyCode();
-        }
-        String hierarchy = get.getCountryName() + ", " + get.getProvinceName() + ", " + countyname + ", " + placeName;
-
-        if (states.containsKey(get.getProvinceName())) {
-          StateCentroid entry = states.get(get.getProvinceName());
-          entry.count++;
-          entry.latSum += Double.valueOf(lat);
-          entry.longSum += Double.valueOf(lon);
-        } else {
-          StateCentroid centroid = new StateCentroid();
-          centroid.statecode = get.getProvCode();
-          centroid.count = 1;
-          centroid.latSum = Double.valueOf(lat);
-          centroid.longSum = Double.valueOf(lon);
-          states.put(get.getProvinceName(), centroid);
-        }
-
-        doc.add(new TextField("hierarchy", hierarchy, Field.Store.YES));
-        doc.add(new TextField("placename", placeName, Field.Store.YES));
-        doc.add(new TextField("latitude", lat, Field.Store.YES));
-        doc.add(new TextField("longitude", lon, Field.Store.YES));
-        doc.add(new StringField("loctype", dsg, Field.Store.YES));
-        doc.add(new StringField("admincode", (get.getCountryCode() + "." + get.getProvCode()).toLowerCase(), Field.Store.YES));
-        doc.add(new StringField("countrycode", get.getCountryCode().toLowerCase(), Field.Store.YES));
-        doc.add(new StringField("countycode", (get.getCountryCode() + "." + get.getProvCode() + "." + countyCode).toLowerCase(), Field.Store.YES));
-
-        doc.add(new StringField("locid", id, Field.Store.YES));
-        doc.add(new StringField("gazsource", "usgs", Field.Store.YES));
-        w.addDocument(doc);
-      }
-      counter++;
-      if (counter % 100000 == 0) {
-        w.commit();
-        System.out.println(counter + " .........USGS entries committed to index..............");
-      }
-
-    }
-
-    for (String state : states.keySet()) {
-      StateCentroid get = states.get(state);
-      Document doc = new Document();
-      doc.add(new TextField("hierarchy", "united states, " + state, Field.Store.YES));
-      doc.add(new TextField("placename", state, Field.Store.YES));
-      //calculate a centroid for all the points that were in the state
-      doc.add(new TextField("latitude", (get.latSum / get.count) + "", Field.Store.YES));
-      doc.add(new TextField("longitude", (get.longSum / get.count) + "", Field.Store.YES));
-      doc.add(new StringField("loctype", "adm1", Field.Store.YES));
-      doc.add(new StringField("admincode", get.statecode, Field.Store.YES));
-      doc.add(new StringField("countrycode", "us", Field.Store.YES));
-      doc.add(new StringField("countycode", "", Field.Store.YES));
-
-      doc.add(new StringField("locid", "us_state:" + state, Field.Store.YES));
-      doc.add(new StringField("gazsource", "usgs", Field.Store.YES));
-      w.addDocument(doc);
-
-      // System.out.println(get.statecode + "," + (get.latSum / get.count) + "," + (get.longSum / get.count));
-    }
-    Document doc = new Document();
-    doc.add(new TextField("hierarchy", "united states", Field.Store.YES));
-    doc.add(new TextField("placename", "united states", Field.Store.YES));
-    //calculate a centroid for all the points that were in the state
-    doc.add(new TextField("latitude", 39.0 + "", Field.Store.YES));
-    doc.add(new TextField("longitude", -103.0 + "", Field.Store.YES));
-    doc.add(new StringField("loctype", "pcli", Field.Store.YES));
-    doc.add(new StringField("admincode", "", Field.Store.YES));
-    doc.add(new StringField("countrycode", "us", Field.Store.YES));
-    doc.add(new StringField("countycode", "", Field.Store.YES));
-
-    doc.add(new StringField("locid", "us_centroid" + "unitedstates", Field.Store.YES));
-    doc.add(new StringField("gazsource", "usgs", Field.Store.YES));
-    //System.out.println("uscentroid," + (sumofLatSums / sumOfCounts) + "," + (sumofLonSums / sumOfCounts));
-
-    w.addDocument(doc);
-    w.commit();
-
-    System.out.println("Completed indexing USGS gaz!");
-  }
-
-  private static class StateCentroid {
-
-    double latSum;
-    double longSum;
-    String statecode;
-    int count;
-  }
-
-  private static Map<String, AdminBoundary> getProvData(File govUnitsFile, GazetteerIndexer.GazType type) {
-    System.out.println("Attempting to read USGS province (State) data from: " + govUnitsFile.getPath());
-    Map<String, AdminBoundary> outmap = new HashMap<>();
-    BufferedReader reader;
-
-    try {
-
-      reader = new BufferedReader(new FileReader(govUnitsFile));
-      int i = 0;
-      String line = "";
-      String[] fields = null;
-      while ((line = reader.readLine()) != null) {
-
-        String[] values = line.split(type.getSeparator());
-        if (i == 0) {
-          fields = values;
-          i++;
-          continue;
-        }
-        i++;
-        // System.out.println(i);
-        String countyCode = values[2];
-        String countyName = values[3];
-        String stateCode = values[5];
-        String stateName = values[6];
-        String countryCode = values[7];
-        String countryName = values[8];
-        AdminBoundary adminBoundary = new AdminBoundary(countryCode, countryName, stateCode, stateName, countyCode, countyName, null, null, null);
-        outmap.put(stateCode + "." + countyCode, adminBoundary);
-        //  System.out.println(adminBoundary);
-
-      }
-      reader.close();
-    } catch (IOException ex) {
-      ex.printStackTrace();
-    }
-    System.out.println("Successfully read USGS province (State) data from: " + govUnitsFile.getPath());
-
-    return outmap;
-
-  }
-
-  public static void writeCountryContextFile(File outfile, Map<String, AdminBoundary> adms) {
-    // FileWriter writer = null;
-    try (FileWriter writer = new FileWriter(outfile, true)) {
-
-      for (String admkey : adms.keySet()) {
-        AdminBoundary adm = adms.get(admkey);
-        if (adm == null) {
-          continue;
-        }
-        String province = adm.getProvinceName();
-        String country = adm.getCountryName();
-        /**
-         * this is the standard format of the country context file... Geonames
-         * data will have an empty string for the county
-         */
-        String line = adm.getCountryCode() + "\t" + adm.getProvCode() + "\t" + adm.getCountyCode() + "\t" + country + "\t" + province + "\t" + adm.getCountyName() + "\t"
-            + "(U\\.S\\.[ $]|U\\.S\\.A\\.[ $]|United States|the US[ $]|a us[ $])" + "\t" + adm.getProvinceName() + "\t" + adm.getCountyName() + "\n";
-        writer.write(line);
-        ///  System.out.println(line);
-
-      }
-    } catch (IOException ex) {
-      Logger.getLogger(GeonamesProcessor.class.getName()).log(Level.SEVERE, null, ex);
-    }
-    System.out.println("successfully wrote USGS entries to country context file");
-  }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.addons.geoentitylinker.indexing;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileReader;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.StringField;
+import org.apache.lucene.document.TextField;
+import org.apache.lucene.index.IndexWriter;
+
+import opennlp.addons.geoentitylinker.AdminBoundary;
+
+public class USGSProcessor {
+
+  public static void main(String[] args) {
+    try {
+      Map<String, AdminBoundary> provData = getProvData(new File("C:\\temp\\gazetteers\\usgsdata\\GOVT_UNITS_20140601.txt"), GazetteerIndexer.GazType.USGS);
+      process(new File("C:\\temp\\gazetteers\\usgsdata\\GOVT_UNITS_20140601.txt"), new File("C:\\temp\\gazetteers\\usgsdata\\NationalFile_20140601.txt"), null, null);
+    } catch (Exception ex) {
+      Logger.getLogger(USGSProcessor.class.getName()).log(Level.SEVERE, null, ex);
+    }
+  }
+
+  public static void process(File lookupData, File usgsGazDataFile, File outputCountryContextfile, IndexWriter w) throws Exception {
+    Map<String, AdminBoundary> provData = getProvData(lookupData, GazetteerIndexer.GazType.USGS);
+    readFile(usgsGazDataFile, w, GazetteerIndexer.GazType.USGS, provData);
+    writeCountryContextFile(outputCountryContextfile, provData);
+  }
+
+  public static void readFile(File gazateerInputData, IndexWriter w, GazetteerIndexer.GazType type, Map<String, AdminBoundary> lookupMap) throws Exception {
+
+    Map<String, StateCentroid> states = new HashMap<>();
+    BufferedReader reader = new BufferedReader(new FileReader(gazateerInputData));
+    List<String> fields = new ArrayList<>();
+    int counter = 0;
+    System.out.println("reading gazetteer data from USGS file...........");
+    String line = "";
+    while ((line = reader.readLine()) != null) {
+
+      String[] values = line.split(type.getSeparator());
+      if (counter == 0) {
+        for (String columnName : values) {
+          fields.add(columnName.replace("Â»Â¿", "").trim());
+        }
+
+      } else {
+        Document doc = new Document();
+        for (int i = 0; i < fields.size() - 1; i++) {
+          doc.add(new TextField(fields.get(i), values[i].trim(), Field.Store.YES));
+        }
+        String placeName = values[1];
+        String lat = values[9];
+        String lon = values[10];
+        String dsg = values[2];
+        String id = values[0];
+
+        String ccode = values[6];
+        String admincode = values[3];
+        AdminBoundary get = lookupMap.get(admincode + "." + ccode);
+        String countyname = "";
+        if (get == null) {
+          System.out.println("null...continuing to index" + " ccode: " + ccode + " , admincode: " + admincode + " , placename: " + placeName);
+          continue;
+
+        }
+        String countyCode = get.getCountyCode();
+
+        if (!get.getCountyName().equals("NO_DATA_FOUND_VALUE")) {
+          countyname = get.getCountyName();
+        }
+        if (!get.getCountyCode().equals("NO_DATA_FOUND_VALUE")) {
+          countyCode = get.getCountyCode();
+        }
+        String hierarchy = get.getCountryName() + ", " + get.getProvinceName() + ", " + countyname + ", " + placeName;
+
+        if (states.containsKey(get.getProvinceName())) {
+          StateCentroid entry = states.get(get.getProvinceName());
+          entry.count++;
+          entry.latSum += Double.valueOf(lat);
+          entry.longSum += Double.valueOf(lon);
+        } else {
+          StateCentroid centroid = new StateCentroid();
+          centroid.statecode = get.getProvCode();
+          centroid.count = 1;
+          centroid.latSum = Double.valueOf(lat);
+          centroid.longSum = Double.valueOf(lon);
+          states.put(get.getProvinceName(), centroid);
+        }
+
+        doc.add(new TextField("hierarchy", hierarchy, Field.Store.YES));
+        doc.add(new TextField("placename", placeName, Field.Store.YES));
+        doc.add(new TextField("latitude", lat, Field.Store.YES));
+        doc.add(new TextField("longitude", lon, Field.Store.YES));
+        doc.add(new StringField("loctype", dsg, Field.Store.YES));
+        doc.add(new StringField("admincode", (get.getCountryCode() + "." + get.getProvCode()).toLowerCase(), Field.Store.YES));
+        doc.add(new StringField("countrycode", get.getCountryCode().toLowerCase(), Field.Store.YES));
+        doc.add(new StringField("countycode", (get.getCountryCode() + "." + get.getProvCode() + "." + countyCode).toLowerCase(), Field.Store.YES));
+
+        doc.add(new StringField("locid", id, Field.Store.YES));
+        doc.add(new StringField("gazsource", "usgs", Field.Store.YES));
+        w.addDocument(doc);
+      }
+      counter++;
+      if (counter % 100000 == 0) {
+        w.commit();
+        System.out.println(counter + " .........USGS entries committed to index..............");
+      }
+
+    }
+
+    for (String state : states.keySet()) {
+      StateCentroid get = states.get(state);
+      Document doc = new Document();
+      doc.add(new TextField("hierarchy", "united states, " + state, Field.Store.YES));
+      doc.add(new TextField("placename", state, Field.Store.YES));
+      //calculate a centroid for all the points that were in the state
+      doc.add(new TextField("latitude", (get.latSum / get.count) + "", Field.Store.YES));
+      doc.add(new TextField("longitude", (get.longSum / get.count) + "", Field.Store.YES));
+      doc.add(new StringField("loctype", "adm1", Field.Store.YES));
+      doc.add(new StringField("admincode", get.statecode, Field.Store.YES));
+      doc.add(new StringField("countrycode", "us", Field.Store.YES));
+      doc.add(new StringField("countycode", "", Field.Store.YES));
+
+      doc.add(new StringField("locid", "us_state:" + state, Field.Store.YES));
+      doc.add(new StringField("gazsource", "usgs", Field.Store.YES));
+      w.addDocument(doc);
+
+      // System.out.println(get.statecode + "," + (get.latSum / get.count) + "," + (get.longSum / get.count));
+    }
+    Document doc = new Document();
+    doc.add(new TextField("hierarchy", "united states", Field.Store.YES));
+    doc.add(new TextField("placename", "united states", Field.Store.YES));
+    //calculate a centroid for all the points that were in the state
+    doc.add(new TextField("latitude", 39.0 + "", Field.Store.YES));
+    doc.add(new TextField("longitude", -103.0 + "", Field.Store.YES));
+    doc.add(new StringField("loctype", "pcli", Field.Store.YES));
+    doc.add(new StringField("admincode", "", Field.Store.YES));
+    doc.add(new StringField("countrycode", "us", Field.Store.YES));
+    doc.add(new StringField("countycode", "", Field.Store.YES));
+
+    doc.add(new StringField("locid", "us_centroid" + "unitedstates", Field.Store.YES));
+    doc.add(new StringField("gazsource", "usgs", Field.Store.YES));
+    //System.out.println("uscentroid," + (sumofLatSums / sumOfCounts) + "," + (sumofLonSums / sumOfCounts));
+
+    w.addDocument(doc);
+    w.commit();
+
+    System.out.println("Completed indexing USGS gaz!");
+  }
+
+  private static class StateCentroid {
+
+    double latSum;
+    double longSum;
+    String statecode;
+    int count;
+  }
+
+  private static Map<String, AdminBoundary> getProvData(File govUnitsFile, GazetteerIndexer.GazType type) {
+    System.out.println("Attempting to read USGS province (State) data from: " + govUnitsFile.getPath());
+    Map<String, AdminBoundary> outmap = new HashMap<>();
+    BufferedReader reader;
+
+    try {
+
+      reader = new BufferedReader(new FileReader(govUnitsFile));
+      int i = 0;
+      String line = "";
+      String[] fields = null;
+      while ((line = reader.readLine()) != null) {
+
+        String[] values = line.split(type.getSeparator());
+        if (i == 0) {
+          fields = values;
+          i++;
+          continue;
+        }
+        i++;
+        // System.out.println(i);
+        String countyCode = values[2];
+        String countyName = values[3];
+        String stateCode = values[5];
+        String stateName = values[6];
+        String countryCode = values[7];
+        String countryName = values[8];
+        AdminBoundary adminBoundary = new AdminBoundary(countryCode, countryName, stateCode, stateName, countyCode, countyName, null, null, null);
+        outmap.put(stateCode + "." + countyCode, adminBoundary);
+        //  System.out.println(adminBoundary);
+
+      }
+      reader.close();
+    } catch (IOException ex) {
+      ex.printStackTrace();
+    }
+    System.out.println("Successfully read USGS province (State) data from: " + govUnitsFile.getPath());
+
+    return outmap;
+
+  }
+
+  public static void writeCountryContextFile(File outfile, Map<String, AdminBoundary> adms) {
+    // FileWriter writer = null;
+    try (FileWriter writer = new FileWriter(outfile, true)) {
+
+      for (String admkey : adms.keySet()) {
+        AdminBoundary adm = adms.get(admkey);
+        if (adm == null) {
+          continue;
+        }
+        String province = adm.getProvinceName();
+        String country = adm.getCountryName();
+        /**
+         * this is the standard format of the country context file... Geonames
+         * data will have an empty string for the county
+         */
+        String line = adm.getCountryCode() + "\t" + adm.getProvCode() + "\t" + adm.getCountyCode() + "\t" + country + "\t" + province + "\t" + adm.getCountyName() + "\t"
+            + "(U\\.S\\.[ $]|U\\.S\\.A\\.[ $]|United States|the US[ $]|a us[ $])" + "\t" + adm.getProvinceName() + "\t" + adm.getCountyName() + "\n";
+        writer.write(line);
+        ///  System.out.println(line);
+
+      }
+    } catch (IOException ex) {
+      Logger.getLogger(GeonamesProcessor.class.getName()).log(Level.SEVERE, null, ex);
+    }
+    System.out.println("successfully wrote USGS entries to country context file");
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp-addons/blob/9adc2525/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/CountryProximityScorer.java
----------------------------------------------------------------------
diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/CountryProximityScorer.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/CountryProximityScorer.java
index aea8f9b..98c9715 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/CountryProximityScorer.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/CountryProximityScorer.java
@@ -1,281 +1,283 @@
-/*
- * Copyright 2013 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package opennlp.addons.geoentitylinker.scoring;
-
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-import java.util.TreeSet;
-import java.util.regex.Pattern;
-import opennlp.addons.geoentitylinker.AdminBoundaryContext;
-import opennlp.tools.entitylinker.EntityLinkerProperties;
-import opennlp.tools.entitylinker.BaseLink;
-import opennlp.tools.entitylinker.LinkedSpan;
-import opennlp.tools.util.Span;
-
-/**
- * Scores toponyms based on their proximity to a country mention. Based on the
- * heuristic that toponymn mentions are more likely close to their parent
- * country mentions. For instance, if the toponym Berlin is mentioned near an
- * indicator of Germany, it is more likely to be Berlin Germany than Berlin
- * Connecticut (if Connecticut is mentioned further down in the article).
- *
- *
- */
-public class CountryProximityScorer implements 
LinkedEntityScorer<AdminBoundaryContext> {
-
-  private Map<String, Set<String>> nameCodesMap;
-  String dominantCode = "";
-  private Map<String, String> regexMap = new HashMap<>();
-
-  @Override
-  public void score(List<LinkedSpan> linkedSpans, String docText, Span[] 
sentenceSpans, EntityLinkerProperties properties, AdminBoundaryContext 
additionalContext) {
-
-    regexMap = additionalContext.getCountryRegexMap();
-    score(linkedSpans, additionalContext.getCountryMentions(), 
additionalContext.getNameCodesMap(), docText, sentenceSpans, 1000);
-
-  }
-
-  /**
-   * Assigns a score to each BaseLink in each linkedSpan's set of N best
-   * matches. Currently the scoring indicates the probability that the toponym
-   * is correct based on the country context in the document
-   *
-   * @param linkedData the linked spans, holds the Namefinder results, and the
-   * list of BaseLink for each
-   * @param countryHits all the country mentions in the document
-   * @param nameCodesMap maps a country indicator name to a country code. Used
-   * to determine if the namefinder found the same exact toponym the country
-   * context did. If so the score is boosted due to the high probability that
-   * the NameFinder actually "rediscovered" a country
-   * @param docText the full text of the document...not used in this default
-   * implementation
-   * @param sentences the sentences that correspond to the doc text.
-   * @param maxAllowedDist a constant that is used to determine which country
-   * mentions, based on proximity within the text, should be used to score the
-   * Named Entity.
-   * @return
-   */
-  public List<LinkedSpan> score(List<LinkedSpan> linkedData, Map<String, 
Set<Integer>> countryHits, Map<String, Set<String>> nameCodesMap, String 
docText, Span[] sentences, Integer maxAllowedDist) {
-    this.nameCodesMap = nameCodesMap;
-    setDominantCode(countryHits);
-    for (LinkedSpan<BaseLink> linkedspan : linkedData) {
-
-      linkedspan = simpleProximityAnalysis(sentences, countryHits, linkedspan, 
maxAllowedDist);
-    }
-    return linkedData;
-  }
-
-  /**
-   * sets class level variable to a code based on the number of mentions
-   *
-   * @param countryHits
-   */
-  private void setDominantCode(Map<String, Set<Integer>> countryHits) {
-    int hits = -1;
-    for (String code : countryHits.keySet()) {
-      if (countryHits.get(code).size() > hits) {
-        hits = countryHits.get(code).size();
-        dominantCode = code;
-      }
-    }
-  }
-
-  /**
-   * Generates distances from each country mention to the span's location in 
the
-   * doc text. Ultimately an attempt to ensure that ambiguously named toponyms
-   * are resolved to the correct country and coordinate.
-   *
-   * @param sentences
-   * @param countryHits
-   * @param span
-   * @return
-   */
-  private LinkedSpan<BaseLink> simpleProximityAnalysis(Span[] sentences, 
Map<String, Set<Integer>> countryHits, LinkedSpan<BaseLink> span, Integer 
maxAllowedDistance) {
-    Double score = 0.0;
-    /*
-     * get the index of the actual span, begining of sentence //should generate
-     * tokens from sentence and create a char offset... //could have large
-     * sentences due to poor sentence detection or wonky doc text
-     */
-    int sentenceIdx = span.getSentenceid();
-    int sentIndexInDoc = sentences[sentenceIdx].getStart();
-    /**
-     * create a map of all the span's proximal country mentions in the document
-     * Map< countrycode, set of <distances from this NamedEntity>>
-     */
-    Map<String, Set<Integer>> distancesFromCodeMap = new HashMap<String, 
Set<Integer>>();
-    //map = Map<countrycode, Set <of distances this span is from all the 
mentions of the code>>
-    for (String cCode : countryHits.keySet()) {
-//iterate over all the regex start values and calculate an offset
-      for (Integer cHit : countryHits.get(cCode)) {
-        Integer absDist = Math.abs(sentIndexInDoc - cHit);
-        //only include near mentions based on a heuristic
-        //TODO make this a property
-        //  if (absDist < maxAllowedDistance) {
-        if (distancesFromCodeMap.containsKey(cCode)) {
-          distancesFromCodeMap.get(cCode).add(absDist);
-        } else {
-          HashSet<Integer> newset = new HashSet<Integer>();
-          newset.add(absDist);
-          distancesFromCodeMap.put(cCode, newset);
-        }
-      }
-
-      //}
-    }
-    //we now know how far this named entity is from every country mention in 
the document
-
-    /**
-     * the gaz matches that have a country code that have mentions in the doc
-     * that are closest to the Named Entity should return the best score.
-     * Analyzemap generates a likelihood score that the toponym from the gaz is
-     * referring to one of the countries, i.e, Map<countrycode, prob that this
-     * span is referring to the toponym form this code key>
-     */
-    Map<String, Double> scoreMap = analyzeMap(distancesFromCodeMap, sentences, 
span);
-    for (BaseLink link : span.getLinkedEntries()) {
-      //getItemParentId is the country code
-      String spanCountryCode = link.getItemParentID();
-      if (scoreMap.containsKey(spanCountryCode)) {
-
-        score = scoreMap.get(spanCountryCode);
-        ///does the name extracted match a country name?
-        if (nameCodesMap.containsKey(link.getItemName().toLowerCase()) || 
regexMatch(link.getItemName(), link.getItemParentID())) {
-          //if so, is it the correct country code for that name?
-          if 
(nameCodesMap.get(link.getItemName().toLowerCase()).contains(link.getItemParentID()))
 {
-            //boost the score becuase it is likely that this is the location 
in the text, so add 50% to the score or set to 1
-            score = (score + .75) > 1.0 ? 1d : (score + .75);
-
-            if (link.getItemParentID().equals(dominantCode)) {
-              score = (score + .25) > 1.0 ? 1d : (score + .25);
-            }
-          }
-        }
-      }
-
-      link.getScoreMap().put("countrycontext", score);
-    }
-    return span;
-  }
-
-  /**
-   * takes a map of distances from the toponym to each country mention and
-   * generates a map of scores for each country code. The map is then 
correlated
-   * to the code of the BaseLink parentid for retrieval. Then the score is 
added
-   * to the overall list.
-   *
-   * @param distanceMap
-   * @param sentences
-   * @param span
-   * @return
-   */
-  private Map<String, Double> analyzeMap(Map<String, Set<Integer>> 
distanceMap, Span[] sentences, LinkedSpan<BaseLink> span) {
-
-    Map<String, Double> scoreMap = new HashMap<String, Double>();
-    if (distanceMap.isEmpty()) {
-      return scoreMap;
-    }
-    TreeSet<Integer> all = new TreeSet<Integer>();
-    for (String key : distanceMap.keySet()) {
-      all.addAll(distanceMap.get(key));
-    }
-    //get min max for normalization, this could be more efficient
-
-    Integer min = all.first();
-    Integer max = all.last();
-    if (min == max) {
-      min = 0;
-    }
-    for (String key : distanceMap.keySet()) {
-
-      TreeSet<Double> normalizedDistances = new TreeSet<Double>();
-      for (Integer i : distanceMap.get(key)) {
-        Double norm = normalize(i, min, max);
-        //reverse the normed distance so low numbers (closer) are better
-        //this could be improved with a "decaying " function using an 
imcreaseing negative exponent
-        Double reverse = Math.abs(norm - 1);
-        normalizedDistances.add(reverse);
-      }
-
-      List<Double> doubles = new ArrayList<Double>(normalizedDistances);
-      scoreMap.put(key, slidingDistanceAverage(doubles));
-    }
-    return scoreMap;
-  }
-
-  private boolean regexMatch(String placeName, String countryCode) {
-    if (regexMap.containsKey(countryCode)) {
-      String regexForCountry = regexMap.get(countryCode);
-
-      Pattern p = 
Pattern.compile(regexForCountry,Pattern.DOTALL|Pattern.CASE_INSENSITIVE);
-      return p.matcher(placeName.trim()).matches();
-    }
-    return false;
-  }
-
-  /**
-   * this method is an attempt to make closer clusters of mentions group
-   * together to smooth out the average, so one distant outlier does not kill
-   * the score for an obviously good hit. More elegant solution is possible
-   * using Math.pow, and making the score decay with distance by using an
-   * increasing negative exponent (I think)
-   *
-   * @param normDis the normalized and sorted set of distances as a list
-   * @return
-   */
-  private Double slidingDistanceAverage(List<Double> normDis) {
-    List<Double> windowOfAverages = new ArrayList<Double>();
-
-    if (normDis.size() < 3) {
-      windowOfAverages.addAll(normDis);
-    } else {
-
-      for (int i = 0; i < normDis.size() - 1; i++) {
-        double a = normDis.get(i);
-        double b = normDis.get(i + 1);
-        windowOfAverages.add((a + b) / 2);
-
-      }
-    }
-    double sum = 0d;
-    for (double d : windowOfAverages) {
-      sum += d;
-    }
-    double result = sum / windowOfAverages.size();
-    //TODO: ++ prob when large amounts of mentions for a code
-    //System.out.println("avg of window:" + result);
-    return result;
-  }
-
-  /**
-   * transposes a value within one range to a relative value in a different
-   * range. Used to normalize distances in this class.
-   *
-   * @param valueToNormalize the value to place within the new range
-   * @param minimum the min of the set to be transposed
-   * @param maximum the max of the set to be transposed
-   * @return
-   */
-  private Double normalize(int valueToNormalize, int minimum, int maximum) {
-    Double d = (double) ((1 - 0) * (valueToNormalize - minimum)) / (maximum - 
minimum) + 0;
-    d = d == null ? 0d : d;
-    return d;
-  }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.addons.geoentitylinker.scoring;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.TreeSet;
+import java.util.regex.Pattern;
+import opennlp.addons.geoentitylinker.AdminBoundaryContext;
+import opennlp.tools.entitylinker.EntityLinkerProperties;
+import opennlp.tools.entitylinker.BaseLink;
+import opennlp.tools.entitylinker.LinkedSpan;
+import opennlp.tools.util.Span;
+
+/**
+ * Scores toponyms based on their proximity to a country mention. Based on the
+ * heuristic that toponym mentions are more likely close to their parent
+ * country mentions. For instance, if the toponym Berlin is mentioned near an
+ * indicator of Germany, it is more likely to be Berlin Germany than Berlin
+ * Connecticut (if Connecticut is mentioned further down in the article).
+ * <p>
+ * The scorer keeps per-document state in instance fields, which are
+ * overwritten on every call to {@code score}.
+ */
+public class CountryProximityScorer implements LinkedEntityScorer<AdminBoundaryContext> {
+
+  private Map<String, Set<String>> nameCodesMap;
+  String dominantCode = "";
+  private Map<String, String> regexMap = new HashMap<>();
+
+  /**
+   * Scores every link of every linked span using the country mentions, the
+   * name-to-codes map and the country regex map taken from the context.
+   *
+   * @param linkedSpans the linked spans whose links receive a score
+   * @param docText the full text of the document
+   * @param sentenceSpans the sentence spans of the document
+   * @param properties not used by this scorer
+   * @param additionalContext supplies country mentions, name codes and regexes
+   */
+  @Override
+  public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans,
+      EntityLinkerProperties properties, AdminBoundaryContext additionalContext) {
+    regexMap = additionalContext.getCountryRegexMap();
+    score(linkedSpans, additionalContext.getCountryMentions(),
+        additionalContext.getNameCodesMap(), docText, sentenceSpans, 1000);
+  }
+
+  /**
+   * Assigns a score to each BaseLink in each linkedSpan's set of N best
+   * matches. Currently the scoring indicates the probability that the toponym
+   * is correct based on the country context in the document. The score is
+   * stored in each link's score map under the key "countrycontext".
+   *
+   * @param linkedData the linked spans, holds the Namefinder results, and the
+   * list of BaseLink for each
+   * @param countryHits all the country mentions in the document, as a map of
+   * country code to the offsets at which that country was mentioned
+   * @param nameCodesMap maps a country indicator name to its country codes.
+   * Used to determine if the namefinder found the same exact toponym the
+   * country context did. If so the score is boosted due to the high
+   * probability that the NameFinder actually "rediscovered" a country
+   * @param docText the full text of the document...not used in this default
+   * implementation
+   * @param sentences the sentences that correspond to the doc text.
+   * @param maxAllowedDist intended as a cut-off for which country mentions
+   * are considered; currently not applied
+   * @return the same list that was passed in, with its links scored
+   */
+  public List<LinkedSpan> score(List<LinkedSpan> linkedData,
+      Map<String, Set<Integer>> countryHits, Map<String, Set<String>> nameCodesMap,
+      String docText, Span[] sentences, Integer maxAllowedDist) {
+    this.nameCodesMap = nameCodesMap;
+    setDominantCode(countryHits);
+    for (LinkedSpan<BaseLink> linkedspan : linkedData) {
+      // links are scored in place, the returned span is the same instance
+      simpleProximityAnalysis(sentences, countryHits, linkedspan, maxAllowedDist);
+    }
+    return linkedData;
+  }
+
+  /**
+   * Sets the class level dominant code to the country code with the highest
+   * number of mentions, or to the empty string when there are no mentions.
+   *
+   * @param countryHits map of country code to mention offsets
+   */
+  private void setDominantCode(Map<String, Set<Integer>> countryHits) {
+    // reset first so the dominant code of a previously scored document
+    // cannot leak into a document without country mentions
+    dominantCode = "";
+    int hits = -1;
+    for (Map.Entry<String, Set<Integer>> entry : countryHits.entrySet()) {
+      int mentions = entry.getValue().size();
+      if (mentions > hits) {
+        hits = mentions;
+        dominantCode = entry.getKey();
+      }
+    }
+  }
+
+  /**
+   * Generates distances from each country mention to the span's location in
+   * the doc text and scores every link of the span with them. Ultimately an
+   * attempt to ensure that ambiguously named toponyms are resolved to the
+   * correct country and coordinate.
+   *
+   * @param sentences the sentence spans of the document
+   * @param countryHits map of country code to mention offsets
+   * @param span the linked span whose links are scored
+   * @param maxAllowedDistance currently not applied
+   * @return the span that was passed in
+   */
+  private LinkedSpan<BaseLink> simpleProximityAnalysis(Span[] sentences,
+      Map<String, Set<Integer>> countryHits, LinkedSpan<BaseLink> span,
+      Integer maxAllowedDistance) {
+    /*
+     * get the index of the actual span, beginning of sentence //should generate
+     * tokens from sentence and create a char offset... //could have large
+     * sentences due to poor sentence detection or wonky doc text
+     */
+    int sentenceIdx = span.getSentenceid();
+    int sentIndexInDoc = sentences[sentenceIdx].getStart();
+
+    // Map<countrycode, set of distances from this span to all the mentions of the code>
+    Map<String, Set<Integer>> distancesFromCodeMap = new HashMap<String, Set<Integer>>();
+    for (Map.Entry<String, Set<Integer>> hitsForCode : countryHits.entrySet()) {
+      String cCode = hitsForCode.getKey();
+      for (Integer cHit : hitsForCode.getValue()) {
+        //TODO make maxAllowedDistance a property and only include near mentions
+        Set<Integer> distances = distancesFromCodeMap.get(cCode);
+        if (distances == null) {
+          distances = new HashSet<Integer>();
+          distancesFromCodeMap.put(cCode, distances);
+        }
+        distances.add(Math.abs(sentIndexInDoc - cHit));
+      }
+    }
+    //we now know how far this named entity is from every country mention in the document
+
+    /*
+     * the gaz matches that have a country code that have mentions in the doc
+     * that are closest to the Named Entity should return the best score.
+     * analyzeMap generates a likelihood score that the toponym from the gaz is
+     * referring to one of the countries, i.e, Map<countrycode, prob that this
+     * span is referring to the toponym from this code key>
+     */
+    Map<String, Double> scoreMap = analyzeMap(distancesFromCodeMap, sentences, span);
+    for (BaseLink link : span.getLinkedEntries()) {
+      // start every link at zero so a link whose country has no mentions
+      // does not inherit the score of the previous link
+      Double score = 0.0;
+      //getItemParentId is the country code
+      String spanCountryCode = link.getItemParentID();
+      if (scoreMap.containsKey(spanCountryCode)) {
+        score = scoreMap.get(spanCountryCode);
+        String itemName = link.getItemName();
+        Set<String> codesForName = nameCodesMap.get(itemName.toLowerCase());
+        //does the name extracted match a country name with the correct code?
+        boolean isCountryMention;
+        if (codesForName != null) {
+          isCountryMention = codesForName.contains(spanCountryCode);
+        } else {
+          // the name is not a known indicator; fall back to the regex of the
+          // link's own country instead of dereferencing the missing entry
+          isCountryMention = regexMatch(itemName, spanCountryCode);
+        }
+        if (isCountryMention) {
+          //boost the score because it is likely that this is the location
+          //in the text, so add .75 to the score, capped at 1
+          score = Math.min(1d, score + .75);
+          if (spanCountryCode.equals(dominantCode)) {
+            score = Math.min(1d, score + .25);
+          }
+        }
+      }
+      link.getScoreMap().put("countrycontext", score);
+    }
+    return span;
+  }
+
+  /**
+   * Takes a map of distances from the toponym to each country mention and
+   * generates a map of scores for each country code. The map is then
+   * correlated to the code of the BaseLink parentid for retrieval.
+   *
+   * @param distanceMap map of country code to distances from the span
+   * @param sentences not used
+   * @param span not used
+   * @return map of country code to score; empty when distanceMap is empty
+   */
+  private Map<String, Double> analyzeMap(Map<String, Set<Integer>> distanceMap,
+      Span[] sentences, LinkedSpan<BaseLink> span) {
+    Map<String, Double> scoreMap = new HashMap<String, Double>();
+    if (distanceMap.isEmpty()) {
+      return scoreMap;
+    }
+    TreeSet<Integer> all = new TreeSet<Integer>();
+    for (Set<Integer> distances : distanceMap.values()) {
+      all.addAll(distances);
+    }
+    //get min max for normalization, this could be more efficient
+    // unboxed so the comparison below is by value
+    int min = all.first();
+    int max = all.last();
+    if (min == max) {
+      min = 0;
+    }
+    for (Map.Entry<String, Set<Integer>> entry : distanceMap.entrySet()) {
+      TreeSet<Double> normalizedDistances = new TreeSet<Double>();
+      for (Integer distance : entry.getValue()) {
+        double norm = normalize(distance, min, max);
+        //reverse the normed distance so low numbers (closer) are better
+        //this could be improved with a "decaying" function using an
+        //increasing negative exponent
+        normalizedDistances.add(Math.abs(norm - 1));
+      }
+      List<Double> doubles = new ArrayList<Double>(normalizedDistances);
+      scoreMap.put(entry.getKey(), slidingDistanceAverage(doubles));
+    }
+    return scoreMap;
+  }
+
+  /**
+   * Checks whether the place name fully matches the regex registered for the
+   * country code. Matching is case insensitive and uses DOTALL.
+   *
+   * @param placeName the name to test; it is trimmed before matching
+   * @param countryCode the code whose regex is looked up
+   * @return true on a full match, false when there is no regex for the code
+   */
+  private boolean regexMatch(String placeName, String countryCode) {
+    if (regexMap == null) {
+      return false;
+    }
+    String regexForCountry = regexMap.get(countryCode);
+    if (regexForCountry == null) {
+      return false;
+    }
+    Pattern p = Pattern.compile(regexForCountry, Pattern.DOTALL | Pattern.CASE_INSENSITIVE);
+    return p.matcher(placeName.trim()).matches();
+  }
+
+  /**
+   * this method is an attempt to make closer clusters of mentions group
+   * together to smooth out the average, so one distant outlier does not kill
+   * the score for an obviously good hit. More elegant solution is possible
+   * using Math.pow, and making the score decay with distance by using an
+   * increasing negative exponent (I think)
+   *
+   * @param normDis the normalized and sorted set of distances as a list
+   * @return the mean of the values for fewer than three entries, otherwise
+   * the mean of the averages of each adjacent pair; 0 for an empty list
+   */
+  private Double slidingDistanceAverage(List<Double> normDis) {
+    if (normDis.isEmpty()) {
+      // nothing to average; avoids a 0/0 result
+      return 0d;
+    }
+    List<Double> windowOfAverages = new ArrayList<Double>();
+    if (normDis.size() < 3) {
+      windowOfAverages.addAll(normDis);
+    } else {
+      for (int i = 0; i < normDis.size() - 1; i++) {
+        double a = normDis.get(i);
+        double b = normDis.get(i + 1);
+        windowOfAverages.add((a + b) / 2);
+      }
+    }
+    double sum = 0d;
+    for (double d : windowOfAverages) {
+      sum += d;
+    }
+    //TODO: ++ prob when large amounts of mentions for a code
+    return sum / windowOfAverages.size();
+  }
+
+  /**
+   * transposes a value within one range to a relative value in the range 0
+   * to 1. Used to normalize distances in this class.
+   *
+   * @param valueToNormalize the value to place within the new range
+   * @param minimum the min of the set to be transposed
+   * @param maximum the max of the set to be transposed
+   * @return the normalized value, or 0 when minimum equals maximum
+   */
+  private Double normalize(int valueToNormalize, int minimum, int maximum) {
+    int range = maximum - minimum;
+    if (range == 0) {
+      // an empty range would otherwise yield NaN (0.0 / 0)
+      return 0d;
+    }
+    return (double) (valueToNormalize - minimum) / range;
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp-addons/blob/9adc2525/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/FuzzyStringMatchScorer.java
----------------------------------------------------------------------
diff --git 
a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/FuzzyStringMatchScorer.java
 
b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/FuzzyStringMatchScorer.java
index e9634d9..abe5438 100644
--- 
a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/FuzzyStringMatchScorer.java
+++ 
b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/FuzzyStringMatchScorer.java
@@ -1,123 +1,125 @@
-/*
- * Copyright 2013 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package opennlp.addons.geoentitylinker.scoring;
-
-import java.util.ArrayList;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Set;
-import opennlp.addons.geoentitylinker.AdminBoundaryContext;
-import opennlp.addons.geoentitylinker.GazetteerEntry;
-import opennlp.tools.entitylinker.EntityLinkerProperties;
-import opennlp.tools.entitylinker.BaseLink;
-import opennlp.tools.entitylinker.LinkedSpan;
-import opennlp.tools.util.Span;
-
-/**
- *
- * Generates scores based on string comparisons levenstein and dice
- */
-public class FuzzyStringMatchScorer implements 
LinkedEntityScorer<AdminBoundaryContext> {
-
-  @Override
-  public void score(List<LinkedSpan> linkedSpans, String docText, Span[] 
sentenceSpans, EntityLinkerProperties properties, AdminBoundaryContext 
additionalContext) {
-    for (LinkedSpan<BaseLink> linkedSpan : linkedSpans) {
-      for (BaseLink link : linkedSpan.getLinkedEntries()) {
-        if (link instanceof GazetteerEntry) {
-          GazetteerEntry entry = (GazetteerEntry) link;
-          String hierarchy = entry.getHierarchy();
-          if (hierarchy != null) {
-            Double dice = 
getDiceCoefficient(linkedSpan.getSearchTerm().toLowerCase(), 
hierarchy.toLowerCase(), 2);
-            link.getScoreMap().put("hierarchydicecoef", dice);
-            Double ld = (double) 
getLevenshteinDistance(linkedSpan.getSearchTerm().toLowerCase(), 
hierarchy.toLowerCase());
-            link.getScoreMap().put("hierarchylevenshtein", ld);
-          }
-          String placename = entry.getItemName().toLowerCase();
-           if (placename != null) {
-            Double dice = 
getDiceCoefficient(linkedSpan.getSearchTerm().toLowerCase(), placename, 2);
-            link.getScoreMap().put("placenamedicecoef", dice);
-            
-          }
-        }
-      }
-    }
-
-  }
-
-  /**
-   * Generates a score based on an overlap of nGrams between two strings using
-   * the DiceCoefficient technique.
-   *
-   * @param s1 first string
-   * @param s2 second string
-   * @param nGrams number of chars in each gram
-   * @return
-   */
-  public double getDiceCoefficient(String s1, String s2, int nGrams) {
-    if (s1.isEmpty() || s2.isEmpty()) {
-      return 0d;
-    }
-    List<String> s1Grams = new ArrayList<>();
-    List<String> s2Grams = new ArrayList<>();
-    String[] split1 = s1.split("[ ,]");
-    for (String token : split1) {
-      if (token.trim().equals("")) {
-        continue;
-      }
-      s1Grams.add(token);
-    }
-    String[] split2 = s2.split("[ ,]");
-    for (String token : split2) {
-      if (token.trim().equals("")) {
-        continue;
-      }
-      s2Grams.add(token);
-    }
-
-    Set<String> overlap = new HashSet<String>(s1Grams);
-    overlap.retainAll(s2Grams);
-    double totcombigrams = overlap.size();
-
-    return (2 * totcombigrams) / (s1Grams.size() + s2Grams.size());
-  }
-
-  private int minimum(int a, int b, int c) {
-    return Math.min(Math.min(a, b), c);
-  }
-
-  public int getLevenshteinDistance(CharSequence str1,
-          CharSequence str2) {
-    int[][] distance = new int[str1.length() + 1][str2.length() + 1];
-
-    for (int i = 0; i <= str1.length(); i++) {
-      distance[i][0] = i;
-    }
-    for (int j = 1; j <= str2.length(); j++) {
-      distance[0][j] = j;
-    }
-
-    for (int i = 1; i <= str1.length(); i++) {
-      for (int j = 1; j <= str2.length(); j++) {
-        distance[i][j] = minimum(
-                distance[i - 1][j] + 1,
-                distance[i][j - 1] + 1,
-                distance[i - 1][j - 1] + ((str1.charAt(i - 1) == str2.charAt(j 
- 1)) ? 0 : 1));
-      }
-    }
-
-    return distance[str1.length()][str2.length()];
-  }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.addons.geoentitylinker.scoring;
+
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+import opennlp.addons.geoentitylinker.AdminBoundaryContext;
+import opennlp.addons.geoentitylinker.GazetteerEntry;
+import opennlp.tools.entitylinker.EntityLinkerProperties;
+import opennlp.tools.entitylinker.BaseLink;
+import opennlp.tools.entitylinker.LinkedSpan;
+import opennlp.tools.util.Span;
+
+/**
+ * Generates scores based on string comparisons: Levenshtein distance and a
+ * token based Dice coefficient.
+ */
+public class FuzzyStringMatchScorer implements LinkedEntityScorer<AdminBoundaryContext> {
+
+  /**
+   * Adds string similarity scores to every {@link GazetteerEntry} link. When
+   * the entry has a hierarchy, the keys "hierarchydicecoef" and
+   * "hierarchylevenshtein" are set; when it has an item name, the key
+   * "placenamedicecoef" is set. All comparisons are made on lower cased
+   * strings against the span's search term. Other link types are ignored.
+   *
+   * @param linkedSpans the linked spans whose links receive scores
+   * @param docText not used by this scorer
+   * @param sentenceSpans not used by this scorer
+   * @param properties not used by this scorer
+   * @param additionalContext not used by this scorer
+   */
+  @Override
+  public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans,
+      EntityLinkerProperties properties, AdminBoundaryContext additionalContext) {
+    for (LinkedSpan<BaseLink> linkedSpan : linkedSpans) {
+      for (BaseLink link : linkedSpan.getLinkedEntries()) {
+        if (!(link instanceof GazetteerEntry)) {
+          continue;
+        }
+        GazetteerEntry entry = (GazetteerEntry) link;
+
+        String hierarchy = entry.getHierarchy();
+        if (hierarchy != null) {
+          String searchTerm = linkedSpan.getSearchTerm().toLowerCase();
+          String lowerHierarchy = hierarchy.toLowerCase();
+          Double dice = getDiceCoefficient(searchTerm, lowerHierarchy, 2);
+          link.getScoreMap().put("hierarchydicecoef", dice);
+          Double ld = (double) getLevenshteinDistance(searchTerm, lowerHierarchy);
+          link.getScoreMap().put("hierarchylevenshtein", ld);
+        }
+
+        // check for null before lower casing, not after
+        String itemName = entry.getItemName();
+        if (itemName != null) {
+          Double dice = getDiceCoefficient(linkedSpan.getSearchTerm().toLowerCase(),
+              itemName.toLowerCase(), 2);
+          link.getScoreMap().put("placenamedicecoef", dice);
+        }
+      }
+    }
+  }
+
+  /**
+   * Generates a score based on the overlap of tokens between two strings
+   * using the Dice coefficient technique. The strings are split on spaces
+   * and commas and blank tokens are dropped.
+   *
+   * @param s1 first string
+   * @param s2 second string
+   * @param nGrams currently unused; the comparison works on whole tokens
+   * @return twice the number of distinct shared tokens divided by the total
+   * number of tokens in both strings, or 0 when either string is empty or
+   * neither string contains a token
+   */
+  public double getDiceCoefficient(String s1, String s2, int nGrams) {
+    if (s1.isEmpty() || s2.isEmpty()) {
+      return 0d;
+    }
+    List<String> s1Grams = tokenize(s1);
+    List<String> s2Grams = tokenize(s2);
+    int totalGrams = s1Grams.size() + s2Grams.size();
+    if (totalGrams == 0) {
+      // both inputs held only separators; avoid returning NaN from 0/0
+      return 0d;
+    }
+
+    Set<String> overlap = new HashSet<String>(s1Grams);
+    overlap.retainAll(s2Grams);
+    double totcombigrams = overlap.size();
+
+    return (2 * totcombigrams) / totalGrams;
+  }
+
+  /**
+   * Splits a string on spaces and commas, leaving out blank tokens.
+   */
+  private List<String> tokenize(String text) {
+    List<String> tokens = new ArrayList<>();
+    for (String token : text.split("[ ,]")) {
+      if (token.trim().equals("")) {
+        continue;
+      }
+      tokens.add(token);
+    }
+    return tokens;
+  }
+
+  private int minimum(int a, int b, int c) {
+    return Math.min(Math.min(a, b), c);
+  }
+
+  /**
+   * Computes the Levenshtein edit distance between two character sequences,
+   * counting insertions, deletions and substitutions as one edit each.
+   *
+   * @param str1 first sequence
+   * @param str2 second sequence
+   * @return the number of edits needed to turn str1 into str2
+   */
+  public int getLevenshteinDistance(CharSequence str1,
+          CharSequence str2) {
+    // distance[i][j] is the distance between the first i chars of str1 and
+    // the first j chars of str2
+    int[][] distance = new int[str1.length() + 1][str2.length() + 1];
+
+    for (int i = 0; i <= str1.length(); i++) {
+      distance[i][0] = i;
+    }
+    for (int j = 1; j <= str2.length(); j++) {
+      distance[0][j] = j;
+    }
+
+    for (int i = 1; i <= str1.length(); i++) {
+      for (int j = 1; j <= str2.length(); j++) {
+        int substitutionCost = (str1.charAt(i - 1) == str2.charAt(j - 1)) ? 0 : 1;
+        distance[i][j] = minimum(
+                distance[i - 1][j] + 1,
+                distance[i][j - 1] + 1,
+                distance[i - 1][j - 1] + substitutionCost);
+      }
+    }
+
+    return distance[str1.length()][str2.length()];
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp-addons/blob/9adc2525/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/GeoHashBinningScorer.java
----------------------------------------------------------------------
diff --git 
a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/GeoHashBinningScorer.java
 
b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/GeoHashBinningScorer.java
index d3494e0..98bad74 100644
--- 
a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/GeoHashBinningScorer.java
+++ 
b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/GeoHashBinningScorer.java
@@ -1,62 +1,64 @@
-/*
- * Copyright 2013 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package opennlp.addons.geoentitylinker.scoring;
-
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Map;
-import opennlp.addons.geoentitylinker.AdminBoundaryContext;
-import opennlp.addons.geoentitylinker.GazetteerEntry;
-import opennlp.tools.entitylinker.EntityLinkerProperties;
-import opennlp.tools.entitylinker.BaseLink;
-import opennlp.tools.entitylinker.LinkedSpan;
-import opennlp.tools.util.Span;
-
-/**
- * Scores toponymns based on geographic point binning. Based on the heuristic
- * that docs are generally about a small amount of locations, so one can detect
- * outliers by finding those points that are not near the majority
- *
- */
-public class GeoHashBinningScorer implements LinkedEntityScorer<AdminBoundaryContext> {
-
-  private final PointClustering CLUSTERER = new PointClustering();
-  private int PRECISION = 3;
-
-  @Override
-  public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties,  AdminBoundaryContext additionalContext) {
-     //Map<Double, Double> latLongs = new HashMap<Double, Double>();
-    List<GazetteerEntry> allGazEntries = new ArrayList<>();
-
-    /**
-     * collect all the gaz entry references
-     */
-    for (LinkedSpan<BaseLink> ls : linkedSpans) {
-      for (BaseLink bl : ls.getLinkedEntries()) {
-        if (bl instanceof GazetteerEntry) {
-          allGazEntries.add((GazetteerEntry) bl);
-        }
-      }
-    }
-    /**
-     * use the point clustering to score each hit
-     */
-    Map<String, List<GazetteerEntry>> cluster = CLUSTERER.cluster(allGazEntries, PRECISION);
-    CLUSTERER.scoreClusters(cluster);
-
-  }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.addons.geoentitylinker.scoring;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import opennlp.addons.geoentitylinker.AdminBoundaryContext;
+import opennlp.addons.geoentitylinker.GazetteerEntry;
+import opennlp.tools.entitylinker.EntityLinkerProperties;
+import opennlp.tools.entitylinker.BaseLink;
+import opennlp.tools.entitylinker.LinkedSpan;
+import opennlp.tools.util.Span;
+
+/**
+ * Scores toponymns based on geographic point binning. Based on the heuristic
+ * that docs are generally about a small amount of locations, so one can detect
+ * outliers by finding those points that are not near the majority
+ *
+ */
+public class GeoHashBinningScorer implements LinkedEntityScorer<AdminBoundaryContext> {
+
+  private final PointClustering CLUSTERER = new PointClustering();
+  private int PRECISION = 3;
+
+  @Override
+  public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties,  AdminBoundaryContext additionalContext) {
+     //Map<Double, Double> latLongs = new HashMap<Double, Double>();
+    List<GazetteerEntry> allGazEntries = new ArrayList<>();
+
+    /**
+     * collect all the gaz entry references
+     */
+    for (LinkedSpan<BaseLink> ls : linkedSpans) {
+      for (BaseLink bl : ls.getLinkedEntries()) {
+        if (bl instanceof GazetteerEntry) {
+          allGazEntries.add((GazetteerEntry) bl);
+        }
+      }
+    }
+    /**
+     * use the point clustering to score each hit
+     */
+    Map<String, List<GazetteerEntry>> cluster = CLUSTERER.cluster(allGazEntries, PRECISION);
+    CLUSTERER.scoreClusters(cluster);
+
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/opennlp-addons/blob/9adc2525/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/LinkedEntityScorer.java
----------------------------------------------------------------------
diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/LinkedEntityScorer.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/LinkedEntityScorer.java
index 5fb9c5d..843d9b8 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/LinkedEntityScorer.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/LinkedEntityScorer.java
@@ -1,40 +1,42 @@
-/*
- * Copyright 2013 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package opennlp.addons.geoentitylinker.scoring;
-
-import java.util.List;
-import opennlp.tools.entitylinker.EntityLinkerProperties;
-import opennlp.tools.entitylinker.LinkedSpan;
-import opennlp.tools.util.Span;
-
-/**
- * Structure for scoring linked entities. The Map logically represents a pair :
- * "Score type" to the "actual Score."
- * @param <T> a generic for providing additional context
- */
-public interface LinkedEntityScorer<T> {
-
-/**
- * Scores a collection of linked entities. Implementations should populate the scoreMap in the list of BaseLink for each linkedSpan
- * this method internally affects the reference to linkedSpans that was passed in
- * @param linkedSpans the spans that have been linked to some external source and have all the data they need to be scored
- * @param docText the full text of the document.
- * @param sentenceSpans the sentence spans the correspond to the document text
-   * @param properties the entitylinker properties config file
- * @param additionalContext any additional data required to perform the scoring operation
- */
-  void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, T additionalContext);
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.addons.geoentitylinker.scoring;
+
+import java.util.List;
+import opennlp.tools.entitylinker.EntityLinkerProperties;
+import opennlp.tools.entitylinker.LinkedSpan;
+import opennlp.tools.util.Span;
+
+/**
+ * Structure for scoring linked entities. The Map logically represents a pair :
+ * "Score type" to the "actual Score."
+ * @param <T> a generic for providing additional context
+ */
+public interface LinkedEntityScorer<T> {
+
+/**
+ * Scores a collection of linked entities. Implementations should populate the scoreMap in the list of BaseLink for each linkedSpan
+ * this method internally affects the reference to linkedSpans that was passed in
+ * @param linkedSpans the spans that have been linked to some external source and have all the data they need to be scored
+ * @param docText the full text of the document.
+ * @param sentenceSpans the sentence spans the correspond to the document text
+   * @param properties the entitylinker properties config file
+ * @param additionalContext any additional data required to perform the scoring operation
+ */
+  void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, T additionalContext);
+}

http://git-wip-us.apache.org/repos/asf/opennlp-addons/blob/9adc2525/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ModelBasedScorer.java
----------------------------------------------------------------------
diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ModelBasedScorer.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ModelBasedScorer.java
index 01b3269..034c526 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ModelBasedScorer.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ModelBasedScorer.java
@@ -1,160 +1,163 @@
-/*
- * Copyright 2013 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package opennlp.addons.geoentitylinker.scoring;
-
-import java.io.File;
-import java.io.FileNotFoundException;
-import java.io.IOException;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import opennlp.addons.geoentitylinker.AdminBoundaryContext;
-import opennlp.tools.doccat.DoccatModel;
-import opennlp.tools.doccat.DocumentCategorizerME;
-import opennlp.tools.entitylinker.EntityLinkerProperties;
-import opennlp.tools.entitylinker.BaseLink;
-import opennlp.tools.entitylinker.LinkedSpan;
-import opennlp.tools.util.Span;
-import org.apache.log4j.Logger;
-
-/**
- *
- * Utilizes a doccat model to score toponyms based on surrounding context
- */
-public class ModelBasedScorer implements LinkedEntityScorer<AdminBoundaryContext> {
-
-  private static final Logger LOGGER = Logger.getLogger(ModelBasedScorer.class);
-  DocumentCategorizerME documentCategorizerME;
-  DoccatModel doccatModel;
-  public static final int RADIUS = 200;
-  boolean modelexists = false;
-
-  @Override
-  public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, AdminBoundaryContext additionalContext) {
-    try {
-      if (doccatModel == null) {
-        String path = properties.getProperty("opennlp.geoentitylinker.modelbasedscorer.modelpath", "");
-        if (path.equals("")) {
-          return;
-        }
-        modelexists = true;
-        doccatModel = new DoccatModel(new File(path));
-        documentCategorizerME = new DocumentCategorizerME(doccatModel);
-      }
-      Map<Integer, String> proximalFeatures = generateProximalFeatures(linkedSpans, sentenceSpans, docText, RADIUS);
-      for (Map.Entry<Integer, String> entry : proximalFeatures.entrySet()) {
-        Map<String, Double> scores = this.getScore(entry.getValue());
-        for (BaseLink link : (List<BaseLink>) linkedSpans.get(entry.getKey()).getLinkedEntries()) {
-          double score = 0d;
-          if (scores.containsKey(link.getItemParentID())) {
-            score = scores.get(link.getItemParentID());
-          }
-          link.getScoreMap().put("countrymodel", score);
-        }
-      }
-
-    } catch (FileNotFoundException ex) {
-      LOGGER.error(ex);
-    } catch (IOException ex) {
-      LOGGER.error(ex);
-    } catch (Exception ex) {
-      LOGGER.error(ex);
-    }
-  }
-
-  /**
-   * generates features using a BagOfWordsfeatureGenerator that are within the
-   * radius of a mention within the doctext
-   *
-   * @param linkedSpans
-   * @param sentenceSpans
-   * @param docText
-   * @param radius
-   * @return a map of the index of the linked span to the string of surrounding
-   * text: Map&lt;indexofspan,surrounding text&gt;
-   */
-  public Map<Integer, String> generateProximalFeatures(List<LinkedSpan> linkedSpans, Span[] sentenceSpans, String docText, int radius) {
-    Map<Integer, String> featureBags = new HashMap<>();
-    Map<Integer, Integer> nameMentionMap = new HashMap<>();
-    /**
-     * iterator over the map that contains a mapping of every country code to
-     * all of its mentions in the document
-     */
-    for (int i = 0; i < linkedSpans.size(); i++) {
-      LinkedSpan span = linkedSpans.get(i);
-      if (span.getLinkedEntries().isEmpty()) {
-        //don't care about spans that did not get linked to anything at all; nothing to work with
-        continue;
-      }
-      /**
-       * get the sentence the name span was found in, the beginning of the
-       * sentence will suffice as a centroid for feature generation around the
-       * named entity
-       */
-      Integer mentionIdx = sentenceSpans[span.getSentenceid()].getStart();
-      nameMentionMap.put(i, mentionIdx);
-    }
-    /**
-     * now associate each span to a string that will be used for categorization
-     * against the model.
-     */
-    for (Map.Entry<Integer, Integer> entry : nameMentionMap.entrySet()) {
-      featureBags.put(entry.getKey(), getTextChunk(entry.getValue(), docText, radius));
-    }
-
-    return featureBags;
-  }
-
-  public String getTextChunk(int mentionIdx, String docText, int radius) {
-    int docSize = docText.length();
-    int left = 0, right = 0;
-    left = (mentionIdx - radius < 0) ? 0 : mentionIdx - radius;
-    right = (mentionIdx + radius > docSize) ? docSize : mentionIdx + radius;
-    String chunk = "";
-    if (right <= left) {
-      chunk = "";
-    } else {
-      /**
-       * don't want to chop any words in half, so take fron the first space to
-       * the last space in the chunk string
-       */
-      chunk = docText.substring(left, right);
-      if (left != 0) {
-        left = chunk.indexOf(" ");
-      }
-      right = chunk.lastIndexOf(" ");
-      /**
-       * now get the substring again with only whole words
-       */
-      if (left < right) {
-        chunk = chunk.substring(left, right);
-      }
-    }
-
-    return chunk;
-  }
-
-  private Map<String, Double> getScore(String text) throws Exception {
-    Map<String, Double> scoreMap = new HashMap<>();
-    double[] categorize = documentCategorizerME.categorize(text);
-    int catSize = documentCategorizerME.getNumberOfCategories();
-    for (int i = 0; i < catSize; i++) {
-      String category = documentCategorizerME.getCategory(i);
-      scoreMap.put(category, categorize[documentCategorizerME.getIndex(category)]);
-    }
-    return scoreMap;
-  }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.addons.geoentitylinker.scoring;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.log4j.Logger;
+
+import opennlp.addons.geoentitylinker.AdminBoundaryContext;
+import opennlp.tools.doccat.DoccatModel;
+import opennlp.tools.doccat.DocumentCategorizerME;
+import opennlp.tools.entitylinker.BaseLink;
+import opennlp.tools.entitylinker.EntityLinkerProperties;
+import opennlp.tools.entitylinker.LinkedSpan;
+import opennlp.tools.util.Span;
+
+/**
+ * Utilizes a doccat model to score toponyms based on surrounding context
+ */
+public class ModelBasedScorer implements LinkedEntityScorer<AdminBoundaryContext> {
+
+  private static final Logger LOGGER = Logger.getLogger(ModelBasedScorer.class);
+  DocumentCategorizerME documentCategorizerME;
+  DoccatModel doccatModel;
+  public static final int RADIUS = 200;
+  boolean modelexists = false;
+
+  @Override
+  public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, AdminBoundaryContext additionalContext) {
+    try {
+      if (doccatModel == null) {
+        String path = properties.getProperty("opennlp.geoentitylinker.modelbasedscorer.modelpath", "");
+        if (path.equals("")) {
+          return;
+        }
+        modelexists = true;
+        doccatModel = new DoccatModel(new File(path));
+        documentCategorizerME = new DocumentCategorizerME(doccatModel);
+      }
+      Map<Integer, String> proximalFeatures = generateProximalFeatures(linkedSpans, sentenceSpans, docText, RADIUS);
+      for (Map.Entry<Integer, String> entry : proximalFeatures.entrySet()) {
+        Map<String, Double> scores = this.getScore(entry.getValue());
+        for (BaseLink link : (List<BaseLink>) linkedSpans.get(entry.getKey()).getLinkedEntries()) {
+          double score = 0d;
+          if (scores.containsKey(link.getItemParentID())) {
+            score = scores.get(link.getItemParentID());
+          }
+          link.getScoreMap().put("countrymodel", score);
+        }
+      }
+
+    } catch (FileNotFoundException ex) {
+      LOGGER.error(ex);
+    } catch (IOException ex) {
+      LOGGER.error(ex);
+    } catch (Exception ex) {
+      LOGGER.error(ex);
+    }
+  }
+
+  /**
+   * generates features using a BagOfWordsfeatureGenerator that are within the
+   * radius of a mention within the doctext
+   *
+   * @param linkedSpans
+   * @param sentenceSpans
+   * @param docText
+   * @param radius
+   * @return a map of the index of the linked span to the string of surrounding
+   * text: Map&lt;indexofspan,surrounding text&gt;
+   */
+  public Map<Integer, String> generateProximalFeatures(List<LinkedSpan> linkedSpans, Span[] sentenceSpans, String docText, int radius) {
+    Map<Integer, String> featureBags = new HashMap<>();
+    Map<Integer, Integer> nameMentionMap = new HashMap<>();
+    /**
+     * iterator over the map that contains a mapping of every country code to
+     * all of its mentions in the document
+     */
+    for (int i = 0; i < linkedSpans.size(); i++) {
+      LinkedSpan span = linkedSpans.get(i);
+      if (span.getLinkedEntries().isEmpty()) {
+        //don't care about spans that did not get linked to anything at all; nothing to work with
+        continue;
+      }
+      /**
+       * get the sentence the name span was found in, the beginning of the
+       * sentence will suffice as a centroid for feature generation around the
+       * named entity
+       */
+      Integer mentionIdx = sentenceSpans[span.getSentenceid()].getStart();
+      nameMentionMap.put(i, mentionIdx);
+    }
+    /**
+     * now associate each span to a string that will be used for categorization
+     * against the model.
+     */
+    for (Map.Entry<Integer, Integer> entry : nameMentionMap.entrySet()) {
+      featureBags.put(entry.getKey(), getTextChunk(entry.getValue(), docText, radius));
+    }
+
+    return featureBags;
+  }
+
+  public String getTextChunk(int mentionIdx, String docText, int radius) {
+    int docSize = docText.length();
+    int left = 0, right = 0;
+    left = (mentionIdx - radius < 0) ? 0 : mentionIdx - radius;
+    right = (mentionIdx + radius > docSize) ? docSize : mentionIdx + radius;
+    String chunk = "";
+    if (right <= left) {
+      chunk = "";
+    } else {
+      /**
+       * don't want to chop any words in half, so take fron the first space to
+       * the last space in the chunk string
+       */
+      chunk = docText.substring(left, right);
+      if (left != 0) {
+        left = chunk.indexOf(" ");
+      }
+      right = chunk.lastIndexOf(" ");
+      /**
+       * now get the substring again with only whole words
+       */
+      if (left < right) {
+        chunk = chunk.substring(left, right);
+      }
+    }
+
+    return chunk;
+  }
+
+  private Map<String, Double> getScore(String text) throws Exception {
+    Map<String, Double> scoreMap = new HashMap<>();
+    double[] categorize = documentCategorizerME.categorize(text);
+    int catSize = documentCategorizerME.getNumberOfCategories();
+    for (int i = 0; i < catSize; i++) {
+      String category = documentCategorizerME.getCategory(i);
+      scoreMap.put(category, categorize[documentCategorizerME.getIndex(category)]);
+    }
+    return scoreMap;
+  }
+}

[2/4] opennlp-addons git commit: Fix checkstyle errors in geoentitylinker

Reply via email to