[3/4] opennlp-addons git commit: Fix checkstyle errors in geoentitylinker

joern Mon, 24 Apr 2017 06:21:08 -0700

http://git-wip-us.apache.org/repos/asf/opennlp-addons/blob/9adc2525/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java
----------------------------------------------------------------------
diff --git 
a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java
 
b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java
index e18253d..bb435e2 100644
--- 
a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java
+++ 
b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java
@@ -1,262 +1,264 @@
-/*
- * Copyright 2013 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package opennlp.addons.geoentitylinker;
-
-import java.io.File;
-import java.io.IOException;
-import java.nio.file.Paths;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.logging.Level;
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.standard.StandardAnalyzer;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.index.DirectoryReader;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.IndexableField;
-import org.apache.lucene.queryparser.classic.ParseException;
-
-import org.apache.lucene.queryparser.classic.QueryParser;
-import org.apache.lucene.search.IndexSearcher;
-import org.apache.lucene.search.Query;
-import org.apache.lucene.search.TopDocs;
-import org.apache.lucene.store.Directory;
-import org.apache.lucene.store.MMapDirectory;
-import org.apache.lucene.util.Version;
-import opennlp.tools.entitylinker.EntityLinkerProperties;
-import org.apache.log4j.Logger;
-import org.apache.lucene.analysis.core.KeywordAnalyzer;
-import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
-import org.apache.lucene.analysis.util.CharArraySet;
-
-/**
- *
- * Searches Gazetteers stored in a MMapDirectory Lucene index. The structure of
- * these indices are based on loading the indexes using the GazetteerIndexer
- *
- */
-public class GazetteerSearcher {
-
-  private final String REGEX_CLEAN = "[^\\p{L}\\p{Nd}]";
-  private static final Logger LOGGER = 
Logger.getLogger(GazetteerSearcher.class);
-  private double scoreCutoff = .70;
-  private boolean doubleQuoteAllSearchTerms = false;
-  private boolean useHierarchyField = false;
-
-  private EntityLinkerProperties properties;
-
-  private Directory opennlpIndex;//= new MMapDirectory(new File(indexloc));
-  private IndexReader opennlpReader;// = DirectoryReader.open(geonamesIndex);
-  private IndexSearcher opennlpSearcher;// = new IndexSearcher(geonamesReader);
-  private Analyzer opennlpAnalyzer;
-
-  public static void main(String[] args) {
-    try {
-      boolean b = Boolean.valueOf("true");
-
-      new GazetteerSearcher(new EntityLinkerProperties(new 
File("c:\\temp\\entitylinker.properties"))).find("alabama", 5, " countrycode:us 
AND gazsource:usgs");
-    } catch (IOException ex) {
-      
java.util.logging.Logger.getLogger(GazetteerSearcher.class.getName()).log(Level.SEVERE,
 null, ex);
-    } catch (Exception ex) {
-      
java.util.logging.Logger.getLogger(GazetteerSearcher.class.getName()).log(Level.SEVERE,
 null, ex);
-    }
-  }
-
-  public GazetteerSearcher(EntityLinkerProperties properties) throws 
IOException {
-    this.properties = properties;
-    init();
-  }
-
-  /**
-   * Searches the single lucene index that includes the location hierarchy.
-   *
-   * @param searchString the location name to search for
-   * @param rowsReturned how many index entries to return (top N...)
-   * @param whereClause the conditional statement that defines the index type
-   * and the country oode.
-   * @return
-   */
-  public ArrayList<GazetteerEntry> find(String searchString, int rowsReturned, 
String whereClause) {
-    ArrayList<GazetteerEntry> linkedData = new ArrayList<>();
-    searchString = cleanInput(searchString);
-    if (searchString.isEmpty()) {
-      return linkedData;
-    }
-    try {
-      /**
-       * build the search string Sometimes no country context is found. In this
-       * case the code variables will be empty strings
-       */
-      String placeNameQueryString = "placename:(" + searchString.toLowerCase() 
+ ") " + "AND " + whereClause;
-      if (searchString.trim().contains(" ") && useHierarchyField) {
-        placeNameQueryString = "(placename:(" + searchString.toLowerCase() + 
") AND hierarchy:(" + formatForHierarchy(searchString) + "))"
-            + " AND " + whereClause;
-      }
-
-      /**
-       * check the cache and go no further if the records already exist
-       */
-      ArrayList<GazetteerEntry> get = 
GazetteerSearchCache.get(placeNameQueryString);
-      if (get != null) {
-
-        return get;
-      }
-      /**
-       * search the placename
-       */
-      QueryParser parser = new QueryParser(placeNameQueryString, 
opennlpAnalyzer);
-      Query q = parser.parse(placeNameQueryString);
-      //Filter filter = new QueryWrapperFilter(new 
QueryParser(Version.LUCENE_48, whereClause, 
opennlpAnalyzer).parse(whereClause));      
-
-      TopDocs bestDocs = opennlpSearcher.search(q, rowsReturned);
-      Double maxscore = 0d;
-      for (int i = 0; i < bestDocs.scoreDocs.length; ++i) {
-        GazetteerEntry entry = new GazetteerEntry();
-        int docId = bestDocs.scoreDocs[i].doc;
-        double sc = bestDocs.scoreDocs[i].score;
-        if (maxscore.compareTo(sc) < 0) {
-          maxscore = sc;
-        }
-        entry.getScoreMap().put("lucene", sc);
-        entry.setIndexID(docId + "");
-
-        Document d = opennlpSearcher.doc(docId);
-
-        List<IndexableField> fields = d.getFields();
-
-        String lat = d.get("latitude");
-        String lon = d.get("longitude");
-        String placename = d.get("placename");
-        String parentid = d.get("countrycode").toLowerCase();
-        String provid = d.get("admincode");
-        String itemtype = d.get("loctype");
-        String source = d.get("gazsource");
-        String hier = d.get("hierarchy");
-        entry.setSource(source);
-
-        entry.setItemID(docId + "");
-        entry.setLatitude(Double.valueOf(lat));
-        entry.setLongitude(Double.valueOf(lon));
-        entry.setItemType(itemtype);
-        entry.setItemParentID(parentid);
-        entry.setProvinceCode(provid);
-        entry.setCountryCode(parentid);
-        entry.setItemName(placename);
-        entry.setHierarchy(hier);
-        for (int idx = 0; idx < fields.size(); idx++) {
-          entry.getIndexData().put(fields.get(idx).name(), 
d.get(fields.get(idx).name()));
-        }
-
-        /**
-         * only want hits above the levenstein thresh. This should be a low
-         * thresh due to the use of the hierarchy field in the index
-         */
-        // if (normLev > scoreCutoff) {
-        if 
(entry.getItemParentID().toLowerCase().equals(parentid.toLowerCase()) || 
parentid.toLowerCase().equals("")) {
-          //make sure we don't produce a duplicate
-          if (!linkedData.contains(entry)) {
-            linkedData.add(entry);
-            /**
-             * add the records to the cache for this query
-             */
-            GazetteerSearchCache.put(placeNameQueryString, linkedData);
-          }
-        }
-        //}
-      }
-
-    } catch (IOException | ParseException ex) {
-      LOGGER.error(ex);
-    }
-
-    return linkedData;
-  }
-
-  /**
-   * Replaces any noise chars with a space, and depending on configuration adds
-   * double quotes to the string
-   *
-   * @param input
-   * @return
-   */
-  private String cleanInput(String input) {
-    String output = input.replaceAll(REGEX_CLEAN, " ").trim();
-    output = output.replace("  ", " ");
-    if (doubleQuoteAllSearchTerms) {
-      return "\"" + output + "\"";
-    } else {
-      return output;
-    }
-
-  }
-
-  private void init() throws IOException {
-
-    if (opennlpIndex == null) {
-      String indexloc = properties.getProperty("opennlp.geoentitylinker.gaz", 
"");
-      if (indexloc.equals("")) {
-        LOGGER.error(new Exception("Opennlp combined Gaz directory location 
not found"));
-
-      }
-
-      opennlpIndex = new MMapDirectory(Paths.get(indexloc));
-      opennlpReader = DirectoryReader.open(opennlpIndex);
-      opennlpSearcher = new IndexSearcher(opennlpReader);
-      opennlpAnalyzer
-          = //new StandardAnalyzer(Version.LUCENE_48, new 
CharArraySet(Version.LUCENE_48, new ArrayList(), true));
-          new StandardAnalyzer(new CharArraySet(new ArrayList(), true));
-      Map<String, Analyzer> analyMap = new HashMap<>();
-
-      analyMap.put("countrycode", new KeywordAnalyzer());
-      analyMap.put("admincode", new KeywordAnalyzer());
-      analyMap.put("loctype", new KeywordAnalyzer());
-      analyMap.put("countycode", new KeywordAnalyzer());
-      analyMap.put("gazsource", new KeywordAnalyzer());
-
-      opennlpAnalyzer
-          = new PerFieldAnalyzerWrapper(opennlpAnalyzer, analyMap);
-
-      String cutoff = 
properties.getProperty("opennlp.geoentitylinker.gaz.lucenescore.min", 
String.valueOf(scoreCutoff));
-      String usehierarchy = 
properties.getProperty("opennlp.geoentitylinker.gaz.hierarchyfield", 
String.valueOf("0"));
-      if (cutoff != null && !cutoff.isEmpty()) {
-        scoreCutoff = Double.valueOf(cutoff);
-      }
-      if (usehierarchy != null && !usehierarchy.isEmpty()) {
-        useHierarchyField = Boolean.valueOf(usehierarchy);
-      }
-      //  opennlp.geoentitylinker.gaz.doublequote=false
-      //opennlp.geoentitylinker.gaz.hierarchyfield=false
-
-    }
-  }
-
-  private String formatForHierarchy(String searchTerm) {
-    String[] parts = cleanInput(searchTerm).split(" ");
-    String out = "";
-    if (parts.length != 0) {
-      for (String string : parts) {
-        out += string + " AND ";
-      }
-      out = out.substring(0, out.lastIndexOf(" AND "));
-    } else {
-      out = cleanInput(searchTerm);
-    }
-    return out;
-  }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.addons.geoentitylinker;
+
+import java.io.File;
+import java.io.IOException;
+import java.nio.file.Paths;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.logging.Level;
+
+import org.apache.log4j.Logger;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.core.KeywordAnalyzer;
+import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexableField;
+import org.apache.lucene.queryparser.classic.ParseException;
+import org.apache.lucene.queryparser.classic.QueryParser;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.MMapDirectory;
+
+import opennlp.tools.entitylinker.EntityLinkerProperties;
+
+/**
+ *
+ * Searches Gazetteers stored in a MMapDirectory Lucene index. The structure of
+ * these indices is based on loading the indexes using the GazetteerIndexer
+ *
+ */
+public class GazetteerSearcher {
+
+  private final String REGEX_CLEAN = "[^\\p{L}\\p{Nd}]";
+  private static final Logger LOGGER = 
Logger.getLogger(GazetteerSearcher.class);
+  private double scoreCutoff = .70;
+  private boolean doubleQuoteAllSearchTerms = false;
+  private boolean useHierarchyField = false;
+
+  private EntityLinkerProperties properties;
+
+  private Directory opennlpIndex;//= new MMapDirectory(new File(indexloc));
+  private IndexReader opennlpReader;// = DirectoryReader.open(geonamesIndex);
+  private IndexSearcher opennlpSearcher;// = new IndexSearcher(geonamesReader);
+  private Analyzer opennlpAnalyzer;
+
+  public static void main(String[] args) {
+    try {
+      boolean b = Boolean.valueOf("true");
+
+      new GazetteerSearcher(new EntityLinkerProperties(new 
File("c:\\temp\\entitylinker.properties"))).find("alabama", 5, " countrycode:us 
AND gazsource:usgs");
+    } catch (IOException ex) {
+      
java.util.logging.Logger.getLogger(GazetteerSearcher.class.getName()).log(Level.SEVERE,
 null, ex);
+    } catch (Exception ex) {
+      
java.util.logging.Logger.getLogger(GazetteerSearcher.class.getName()).log(Level.SEVERE,
 null, ex);
+    }
+  }
+
+  public GazetteerSearcher(EntityLinkerProperties properties) throws 
IOException {
+    this.properties = properties;
+    init();
+  }
+
+  /**
+   * Searches the single lucene index that includes the location hierarchy.
+   *
+   * @param searchString the location name to search for
+   * @param rowsReturned how many index entries to return (top N...)
+   * @param whereClause the conditional statement that defines the index type
+   * and the country code.
+   * @return
+   */
+  public ArrayList<GazetteerEntry> find(String searchString, int rowsReturned, 
String whereClause) {
+    ArrayList<GazetteerEntry> linkedData = new ArrayList<>();
+    searchString = cleanInput(searchString);
+    if (searchString.isEmpty()) {
+      return linkedData;
+    }
+    try {
+      /**
+       * build the search string. Sometimes no country context is found. In this
+       * case the code variables will be empty strings
+       */
+      String placeNameQueryString = "placename:(" + searchString.toLowerCase() 
+ ") " + "AND " + whereClause;
+      if (searchString.trim().contains(" ") && useHierarchyField) {
+        placeNameQueryString = "(placename:(" + searchString.toLowerCase() + 
") AND hierarchy:(" + formatForHierarchy(searchString) + "))"
+            + " AND " + whereClause;
+      }
+
+      /**
+       * check the cache and go no further if the records already exist
+       */
+      ArrayList<GazetteerEntry> get = 
GazetteerSearchCache.get(placeNameQueryString);
+      if (get != null) {
+
+        return get;
+      }
+      /**
+       * search the placename
+       */
+      QueryParser parser = new QueryParser(placeNameQueryString, 
opennlpAnalyzer);
+      Query q = parser.parse(placeNameQueryString);
+      //Filter filter = new QueryWrapperFilter(new 
QueryParser(Version.LUCENE_48, whereClause, 
opennlpAnalyzer).parse(whereClause));      
+
+      TopDocs bestDocs = opennlpSearcher.search(q, rowsReturned);
+      Double maxscore = 0d;
+      for (int i = 0; i < bestDocs.scoreDocs.length; ++i) {
+        GazetteerEntry entry = new GazetteerEntry();
+        int docId = bestDocs.scoreDocs[i].doc;
+        double sc = bestDocs.scoreDocs[i].score;
+        if (maxscore.compareTo(sc) < 0) {
+          maxscore = sc;
+        }
+        entry.getScoreMap().put("lucene", sc);
+        entry.setIndexID(docId + "");
+
+        Document d = opennlpSearcher.doc(docId);
+
+        List<IndexableField> fields = d.getFields();
+
+        String lat = d.get("latitude");
+        String lon = d.get("longitude");
+        String placename = d.get("placename");
+        String parentid = d.get("countrycode").toLowerCase();
+        String provid = d.get("admincode");
+        String itemtype = d.get("loctype");
+        String source = d.get("gazsource");
+        String hier = d.get("hierarchy");
+        entry.setSource(source);
+
+        entry.setItemID(docId + "");
+        entry.setLatitude(Double.valueOf(lat));
+        entry.setLongitude(Double.valueOf(lon));
+        entry.setItemType(itemtype);
+        entry.setItemParentID(parentid);
+        entry.setProvinceCode(provid);
+        entry.setCountryCode(parentid);
+        entry.setItemName(placename);
+        entry.setHierarchy(hier);
+        for (int idx = 0; idx < fields.size(); idx++) {
+          entry.getIndexData().put(fields.get(idx).name(), 
d.get(fields.get(idx).name()));
+        }
+
+        /**
+         * only want hits above the Levenshtein threshold. This should be a low
+         * threshold due to the use of the hierarchy field in the index
+         */
+        // if (normLev > scoreCutoff) {
+        if 
(entry.getItemParentID().toLowerCase().equals(parentid.toLowerCase()) || 
parentid.toLowerCase().equals("")) {
+          //make sure we don't produce a duplicate
+          if (!linkedData.contains(entry)) {
+            linkedData.add(entry);
+            /**
+             * add the records to the cache for this query
+             */
+            GazetteerSearchCache.put(placeNameQueryString, linkedData);
+          }
+        }
+        //}
+      }
+
+    } catch (IOException | ParseException ex) {
+      LOGGER.error(ex);
+    }
+
+    return linkedData;
+  }
+
+  /**
+   * Replaces any noise chars with a space, and depending on configuration adds
+   * double quotes to the string
+   *
+   * @param input
+   * @return
+   */
+  private String cleanInput(String input) {
+    String output = input.replaceAll(REGEX_CLEAN, " ").trim();
+    output = output.replace("  ", " ");
+    if (doubleQuoteAllSearchTerms) {
+      return "\"" + output + "\"";
+    } else {
+      return output;
+    }
+
+  }
+
+  private void init() throws IOException {
+
+    if (opennlpIndex == null) {
+      String indexloc = properties.getProperty("opennlp.geoentitylinker.gaz", 
"");
+      if (indexloc.equals("")) {
+        LOGGER.error(new Exception("Opennlp combined Gaz directory location 
not found"));
+
+      }
+
+      opennlpIndex = new MMapDirectory(Paths.get(indexloc));
+      opennlpReader = DirectoryReader.open(opennlpIndex);
+      opennlpSearcher = new IndexSearcher(opennlpReader);
+      opennlpAnalyzer
+          = //new StandardAnalyzer(Version.LUCENE_48, new 
CharArraySet(Version.LUCENE_48, new ArrayList(), true));
+          new StandardAnalyzer(new CharArraySet(new ArrayList(), true));
+      Map<String, Analyzer> analyMap = new HashMap<>();
+
+      analyMap.put("countrycode", new KeywordAnalyzer());
+      analyMap.put("admincode", new KeywordAnalyzer());
+      analyMap.put("loctype", new KeywordAnalyzer());
+      analyMap.put("countycode", new KeywordAnalyzer());
+      analyMap.put("gazsource", new KeywordAnalyzer());
+
+      opennlpAnalyzer
+          = new PerFieldAnalyzerWrapper(opennlpAnalyzer, analyMap);
+
+      String cutoff = 
properties.getProperty("opennlp.geoentitylinker.gaz.lucenescore.min", 
String.valueOf(scoreCutoff));
+      String usehierarchy = 
properties.getProperty("opennlp.geoentitylinker.gaz.hierarchyfield", 
String.valueOf("0"));
+      if (cutoff != null && !cutoff.isEmpty()) {
+        scoreCutoff = Double.valueOf(cutoff);
+      }
+      if (usehierarchy != null && !usehierarchy.isEmpty()) {
+        useHierarchyField = Boolean.valueOf(usehierarchy);
+      }
+      //  opennlp.geoentitylinker.gaz.doublequote=false
+      //opennlp.geoentitylinker.gaz.hierarchyfield=false
+
+    }
+  }
+
+  private String formatForHierarchy(String searchTerm) {
+    String[] parts = cleanInput(searchTerm).split(" ");
+    String out = "";
+    if (parts.length != 0) {
+      for (String string : parts) {
+        out += string + " AND ";
+      }
+      out = out.substring(0, out.lastIndexOf(" AND "));
+    } else {
+      out = cleanInput(searchTerm);
+    }
+    return out;
+  }
+
+}


http://git-wip-us.apache.org/repos/asf/opennlp-addons/blob/9adc2525/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
----------------------------------------------------------------------
diff --git 
a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
 
b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
index 43be5d5..f2f3d0f 100644
--- 
a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
+++ 
b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
@@ -1,223 +1,228 @@
-/*
- * Copyright 2013 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package opennlp.addons.geoentitylinker;
-
-import java.io.IOException;
-import opennlp.addons.geoentitylinker.scoring.ModelBasedScorer;
-import opennlp.addons.geoentitylinker.scoring.LinkedEntityScorer;
-import opennlp.addons.geoentitylinker.scoring.CountryProximityScorer;
-import opennlp.addons.geoentitylinker.scoring.GeoHashBinningScorer;
-import opennlp.addons.geoentitylinker.scoring.FuzzyStringMatchScorer;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.HashMap;
-import java.util.Iterator;
-import java.util.List;
-import opennlp.addons.geoentitylinker.scoring.PlacetypeScorer;
-import opennlp.addons.geoentitylinker.scoring.ProvinceProximityScorer;
-import opennlp.tools.entitylinker.BaseLink;
-import opennlp.tools.entitylinker.LinkedSpan;
-import opennlp.tools.util.Span;
-import opennlp.tools.entitylinker.EntityLinkerProperties;
-import opennlp.tools.entitylinker.EntityLinker;
-
-/**
- * Links location entities to the USGS and GeoNames gazetteers, and uses 
several
- * scoring techniques to enable resolution. The gazetteers are stored in lucene
- * indexes. The indexes can be built using the GeoEntityLinkerSetupUtils class
- * in this same package.
- */
-public class GeoEntityLinker implements EntityLinker<LinkedSpan> {
-
-  private static Integer topN = 2;
-  private AdminBoundaryContextGenerator countryContext;
-  private EntityLinkerProperties linkerProperties;
-  private GazetteerSearcher gazateerSearcher;
-  private List<LinkedEntityScorer<AdminBoundaryContext>> scorers = new 
ArrayList<>();
-
-  @Override
-  public List<LinkedSpan> find(String doctext, Span[] sentences, Span[][] 
tokensBySentence, Span[][] namesBySentence) {
-    ArrayList<LinkedSpan> spans = new ArrayList<LinkedSpan>();
-
-    if (linkerProperties == null) {
-      throw new IllegalArgumentException("EntityLinkerProperties cannot be 
null");
-    }
-    //countryMentions = countryContext.regexfind(doctext);
-    AdminBoundaryContext context = countryContext.getContext(doctext);
-    for (int s = 0; s < sentences.length; s++) {
-      Span[] names = namesBySentence[s];
-
-      Span[] tokenSpans = tokensBySentence[s];
-      String[] tokens = Span.spansToStrings(tokenSpans, 
sentences[s].getCoveredText(doctext));
-
-      String[] matches = Span.spansToStrings(names, tokens);
-
-      for (int i = 0; i < matches.length; i++) {
-
-        ArrayList<BaseLink> geoNamesEntries = new ArrayList<>();
-        if (!context.getWhereClauses().isEmpty()) {
-          for (String whereclause : context.getWhereClauses()) {
-            ArrayList<GazetteerEntry> find = gazateerSearcher.find(matches[i], 
topN, whereclause);
-            for (GazetteerEntry gazetteerEntry : find) {
-              if (!geoNamesEntries.contains(gazetteerEntry)) {
-                geoNamesEntries.add(gazetteerEntry);
-              }
-            }
-
-          }
-        } else {//this means there were no where clauses generated so the 
where clause will default to look at the entire index
-          ArrayList<GazetteerEntry> find = gazateerSearcher.find(matches[i], 
topN, " gaztype:usgs geonames regions ");
-          for (GazetteerEntry gazetteerEntry : find) {
-            if (!geoNamesEntries.contains(gazetteerEntry)) {
-              geoNamesEntries.add(gazetteerEntry);
-            }
-          }
-        }
-        if (geoNamesEntries.isEmpty()) {
-          continue;
-        }
-        /**
-         * Normalize the returned scores for this name... this will assist the
-         * sort
-         */
-        if (!spans.isEmpty()) {
-
-          Double maxscore = 0d;
-          for (BaseLink gazetteerEntry : geoNamesEntries) {
-            Double deNormScore = gazetteerEntry.getScoreMap().get("lucene");
-            if (deNormScore.compareTo(maxscore) > 0) {
-              maxscore = deNormScore;
-            }
-          }
-          for (BaseLink gazetteerEntry : geoNamesEntries) {
-            Double deNormScore = gazetteerEntry.getScoreMap().get("lucene");
-            Double normalize = normalize(deNormScore, 0d, maxscore);
-            gazetteerEntry.getScoreMap().put("normlucene", normalize);
-          }
-        }
-        LinkedSpan newspan = new LinkedSpan(geoNamesEntries, names[i], 0);
-        newspan.setSearchTerm(matches[i]);
-        newspan.setLinkedEntries(geoNamesEntries);
-        newspan.setSentenceid(s);
-        spans.add(newspan);
-      }
-
-    }
-
-    if (!scorers.isEmpty()) {
-      for (LinkedEntityScorer scorer : scorers) {
-        scorer.score(spans, doctext, sentences, linkerProperties, context);
-      }
-    }
-    /**
-     * sort the data with the best score on top based on the sum of the scores
-     * below from the score map for each baselink object
-     */
-    for (LinkedSpan<BaseLink> s : spans) {
-      ArrayList<BaseLink> linkedData = s.getLinkedEntries();
-      Collections.sort(linkedData, Collections.reverseOrder(new 
Comparator<BaseLink>() {
-        @Override
-        public int compare(BaseLink o1, BaseLink o2) {
-          HashMap<String, Double> o1scoreMap = o1.getScoreMap();
-          HashMap<String, Double> o2scoreMap = o2.getScoreMap();
-          if (o1scoreMap.size() != o2scoreMap.size()) {
-            return 0;
-          }
-          double sumo1 = 0d;
-          double sumo2 = 0d;
-          for (String object : o1scoreMap.keySet()) {
-            if (object.equals("typescore")
-                || object.equals("countrycontext")
-                || object.equals("placenamedicecoef")
-                || object.equals("provincecontext")
-                || object.equals("geohashbin")
-                || object.equals("normlucene")) {
-              sumo1 += o1scoreMap.get(object);
-              sumo2 += o2scoreMap.get(object);
-            }
-          }
-
-          return Double.compare(sumo1,
-              sumo2);
-        }
-      }));
-      //prune the list to topN
-      Iterator iterator = linkedData.iterator();
-      int n = 0;
-      while (iterator.hasNext()) {
-        if (n >= topN) {
-          iterator.remove();
-        }
-        iterator.next();
-        n++;
-      }
-    }
-
-    return spans;
-  }
-
-  /**
-   * transposes a value within one range to a relative value in a different
-   * range. Used to normalize distances in this class.
-   *
-   * @param valueToNormalize the value to place within the new range
-   * @param minimum the min of the set to be transposed
-   * @param maximum the max of the set to be transposed
-   * @return
-   */
-  private Double normalize(Double valueToNormalize, Double minimum, Double 
maximum) {
-    Double d = (double) ((1 - 0) * (valueToNormalize - minimum)) / (maximum - 
minimum) + 0;
-    d = d == Double.NaN ? 0d : d;
-    return d;
-  }
-
-  private void loadScorers() {
-    if (scorers.isEmpty()) {
-      scorers.add(new ProvinceProximityScorer());
-      scorers.add(new GeoHashBinningScorer());
-      scorers.add(new CountryProximityScorer());
-      scorers.add(new ModelBasedScorer());
-      scorers.add(new FuzzyStringMatchScorer());
-      scorers.add(new PlacetypeScorer());
-    }
-  }
-
-  @Override
-  public void init(EntityLinkerProperties properties) throws IOException {
-
-    this.linkerProperties = properties;
-    countryContext = new AdminBoundaryContextGenerator(this.linkerProperties);
-    gazateerSearcher = new GazetteerSearcher(this.linkerProperties);
-    String rowsRetStr = 
this.linkerProperties.getProperty("opennlp.geoentitylinker.gaz.rowsreturned", 
"2");
-    Integer rws = 2;
-    try {
-      rws = Integer.valueOf(rowsRetStr);
-    } catch (NumberFormatException e) {
-      rws = 2;
-    }
-    topN = rws;
-    loadScorers();
-
-  }
-
-  @Override
-  public List<LinkedSpan> find(String doctext, Span[] sentences, Span[][] 
tokensBySentence,
-      Span[][] namesBySentence, int sentenceIndex) {
-    throw new UnsupportedOperationException("The GeoEntityLinker requires the 
entire document "
-        + "for proper scoring. This method is unsupported");
-  }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.addons.geoentitylinker;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+
+import opennlp.addons.geoentitylinker.scoring.CountryProximityScorer;
+import opennlp.addons.geoentitylinker.scoring.FuzzyStringMatchScorer;
+import opennlp.addons.geoentitylinker.scoring.GeoHashBinningScorer;
+import opennlp.addons.geoentitylinker.scoring.LinkedEntityScorer;
+import opennlp.addons.geoentitylinker.scoring.ModelBasedScorer;
+import opennlp.addons.geoentitylinker.scoring.PlacetypeScorer;
+import opennlp.addons.geoentitylinker.scoring.ProvinceProximityScorer;
+import opennlp.tools.entitylinker.BaseLink;
+import opennlp.tools.entitylinker.EntityLinker;
+import opennlp.tools.entitylinker.EntityLinkerProperties;
+import opennlp.tools.entitylinker.LinkedSpan;
+import opennlp.tools.util.Span;
+
+/**
+ * Links location entities to the USGS and GeoNames gazetteers, and uses several
+ * scoring techniques to enable resolution. The gazetteers are stored in a lucene
+ * index, which can be built with the GazetteerIndexer class in the indexing
+ * sub-package.
+ */
+public class GeoEntityLinker implements EntityLinker<LinkedSpan> {
+
+  // Number of gazetteer candidates requested per lookup and kept per linked span.
+  // NOTE(review): static, yet overwritten by every init() call, so all instances share
+  // the value configured last -- confirm whether this should be an instance field.
+  private static Integer topN = 2;
+  private AdminBoundaryContextGenerator countryContext;
+  private EntityLinkerProperties linkerProperties;
+  private GazetteerSearcher gazateerSearcher;
+  private List<LinkedEntityScorer<AdminBoundaryContext>> scorers = new ArrayList<>();
+
+  /**
+   * Links every name span of the document to candidate gazetteer entries, runs the
+   * loaded scorers over the result and keeps at most topN candidates per name, best
+   * summed score first.
+   *
+   * @param doctext the full document text
+   * @param sentences the sentence spans of the document
+   * @param tokensBySentence the token spans of each sentence, relative to the sentence text
+   * @param namesBySentence the name spans of each sentence, expressed in token positions
+   * @return one LinkedSpan per name for which the gazetteer returned at least one entry
+   * @throws IllegalArgumentException if no linker properties have been set via init
+   */
+  @Override
+  public List<LinkedSpan> find(String doctext, Span[] sentences, Span[][] tokensBySentence,
+                               Span[][] namesBySentence) {
+    ArrayList<LinkedSpan> spans = new ArrayList<LinkedSpan>();
+
+    if (linkerProperties == null) {
+      throw new IllegalArgumentException("EntityLinkerProperties cannot be null");
+    }
+    AdminBoundaryContext context = countryContext.getContext(doctext);
+    for (int s = 0; s < sentences.length; s++) {
+      Span[] names = namesBySentence[s];
+
+      Span[] tokenSpans = tokensBySentence[s];
+      String[] tokens = Span.spansToStrings(tokenSpans, sentences[s].getCoveredText(doctext));
+
+      String[] matches = Span.spansToStrings(names, tokens);
+
+      for (int i = 0; i < matches.length; i++) {
+
+        ArrayList<BaseLink> geoNamesEntries = new ArrayList<>();
+        if (!context.getWhereClauses().isEmpty()) {
+          for (String whereclause : context.getWhereClauses()) {
+            addMissingEntries(geoNamesEntries,
+                gazateerSearcher.find(matches[i], topN, whereclause));
+          }
+        } else {
+          // no where clauses were generated, so the where clause defaults to
+          // looking at the entire index
+          addMissingEntries(geoNamesEntries,
+              gazateerSearcher.find(matches[i], topN, " gaztype:usgs geonames regions "));
+        }
+        if (geoNamesEntries.isEmpty()) {
+          continue;
+        }
+
+        // Normalize the returned scores for this name... this will assist the sort
+        // NOTE(review): the guard tests 'spans', not 'geoNamesEntries', so the first
+        // linked name of a document never receives a "normlucene" score -- confirm
+        // whether that is intended before changing it.
+        if (!spans.isEmpty()) {
+
+          Double maxscore = 0d;
+          for (BaseLink gazetteerEntry : geoNamesEntries) {
+            Double deNormScore = gazetteerEntry.getScoreMap().get("lucene");
+            if (deNormScore.compareTo(maxscore) > 0) {
+              maxscore = deNormScore;
+            }
+          }
+          for (BaseLink gazetteerEntry : geoNamesEntries) {
+            Double deNormScore = gazetteerEntry.getScoreMap().get("lucene");
+            Double normalize = normalize(deNormScore, 0d, maxscore);
+            gazetteerEntry.getScoreMap().put("normlucene", normalize);
+          }
+        }
+        LinkedSpan newspan = new LinkedSpan(geoNamesEntries, names[i], 0);
+        newspan.setSearchTerm(matches[i]);
+        newspan.setLinkedEntries(geoNamesEntries);
+        newspan.setSentenceid(s);
+        spans.add(newspan);
+      }
+
+    }
+
+    if (!scorers.isEmpty()) {
+      for (LinkedEntityScorer scorer : scorers) {
+        scorer.score(spans, doctext, sentences, linkerProperties, context);
+      }
+    }
+    // sort the data with the best score on top based on the sum of the scores
+    // below from the score map for each baselink object
+    for (LinkedSpan<BaseLink> s : spans) {
+      ArrayList<BaseLink> linkedData = s.getLinkedEntries();
+      Collections.sort(linkedData, Collections.reverseOrder(new Comparator<BaseLink>() {
+        @Override
+        public int compare(BaseLink o1, BaseLink o2) {
+          HashMap<String, Double> o1scoreMap = o1.getScoreMap();
+          HashMap<String, Double> o2scoreMap = o2.getScoreMap();
+          // entries scored by a different number of scorers are treated as equal
+          if (o1scoreMap.size() != o2scoreMap.size()) {
+            return 0;
+          }
+          double sumo1 = 0d;
+          double sumo2 = 0d;
+          for (String object : o1scoreMap.keySet()) {
+            if (object.equals("typescore")
+                || object.equals("countrycontext")
+                || object.equals("placenamedicecoef")
+                || object.equals("provincecontext")
+                || object.equals("geohashbin")
+                || object.equals("normlucene")) {
+              sumo1 += o1scoreMap.get(object);
+              sumo2 += o2scoreMap.get(object);
+            }
+          }
+
+          return Double.compare(sumo1, sumo2);
+        }
+      }));
+      // prune the list to topN: advance first, then drop everything past the
+      // first topN elements (Iterator.remove() deletes the element last returned)
+      Iterator<BaseLink> iterator = linkedData.iterator();
+      int n = 0;
+      while (iterator.hasNext()) {
+        iterator.next();
+        if (n >= topN) {
+          iterator.remove();
+        }
+        n++;
+      }
+    }
+
+    return spans;
+  }
+
+  /**
+   * Appends to target every found entry that target does not already contain.
+   *
+   * @param target the candidate list collected so far for one name
+   * @param found the entries returned by one gazetteer lookup
+   */
+  private static void addMissingEntries(List<BaseLink> target,
+                                        List<? extends BaseLink> found) {
+    for (BaseLink entry : found) {
+      if (!target.contains(entry)) {
+        target.add(entry);
+      }
+    }
+  }
+
+  /**
+   * transposes a value within one range to a relative value in the range 0..1.
+   * Used to normalize the lucene scores in this class.
+   *
+   * @param valueToNormalize the value to place within the new range
+   * @param minimum the min of the set to be transposed
+   * @param maximum the max of the set to be transposed
+   * @return the rescaled value, or 0 when the result is not a number (0 / 0)
+   */
+  private Double normalize(Double valueToNormalize, Double minimum, Double maximum) {
+    double scaled = (valueToNormalize - minimum) / (maximum - minimum);
+    // NaN never compares equal to anything, itself included, so it has to be
+    // detected with Double.isNaN rather than with ==
+    return Double.isNaN(scaled) ? 0d : scaled;
+  }
+
+  /**
+   * Registers the default set of scorers, unless scorers are already present.
+   */
+  private void loadScorers() {
+    if (scorers.isEmpty()) {
+      scorers.add(new ProvinceProximityScorer());
+      scorers.add(new GeoHashBinningScorer());
+      scorers.add(new CountryProximityScorer());
+      scorers.add(new ModelBasedScorer());
+      scorers.add(new FuzzyStringMatchScorer());
+      scorers.add(new PlacetypeScorer());
+    }
+  }
+
+  /**
+   * Stores the properties, creates the context generator and the gazetteer searcher,
+   * reads the number of rows to keep per name and loads the scorers.
+   *
+   * @param properties the linker configuration; the key
+   *     "opennlp.geoentitylinker.gaz.rowsreturned" sets topN and falls back to 2
+   *     when it is absent or not an integer
+   * @throws IOException declared by the EntityLinker contract
+   */
+  @Override
+  public void init(EntityLinkerProperties properties) throws IOException {
+
+    this.linkerProperties = properties;
+    countryContext = new AdminBoundaryContextGenerator(this.linkerProperties);
+    gazateerSearcher = new GazetteerSearcher(this.linkerProperties);
+    String rowsRetStr =
+        this.linkerProperties.getProperty("opennlp.geoentitylinker.gaz.rowsreturned", "2");
+    Integer rws;
+    try {
+      rws = Integer.valueOf(rowsRetStr);
+    } catch (NumberFormatException e) {
+      rws = 2;
+    }
+    topN = rws;
+    loadScorers();
+
+  }
+
+  /**
+   * Unsupported: linking a single sentence is not possible with this linker.
+   *
+   * @throws UnsupportedOperationException always
+   */
+  @Override
+  public List<LinkedSpan> find(String doctext, Span[] sentences, Span[][] tokensBySentence,
+      Span[][] namesBySentence, int sentenceIndex) {
+    throw new UnsupportedOperationException("The GeoEntityLinker requires the entire document "
+        + "for proper scoring. This method is unsupported");
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp-addons/blob/9adc2525/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GazetteerIndexer.java
----------------------------------------------------------------------
diff --git 
a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GazetteerIndexer.java
 
b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GazetteerIndexer.java
index 88ca56f..7d362b7 100644
--- 
a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GazetteerIndexer.java
+++ 
b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GazetteerIndexer.java
@@ -1,206 +1,208 @@
-/*
- * Copyright 2013 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package opennlp.addons.geoentitylinker.indexing;
-
-import java.io.File;
-import java.io.FileNotFoundException;
-import java.io.FileWriter;
-import java.nio.file.Paths;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.Map;
-
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.core.KeywordAnalyzer;
-import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
-import org.apache.lucene.analysis.standard.StandardAnalyzer;
-import org.apache.lucene.analysis.util.CharArraySet;
-import org.apache.lucene.index.IndexWriter;
-import org.apache.lucene.index.IndexWriterConfig;
-import org.apache.lucene.store.Directory;
-import org.apache.lucene.store.MMapDirectory;
-
-/**
- * Creates two lucene indexes, geonames and usgs for use in GeoEntityLinker.
- */
-public class GazetteerIndexer {
-
-  public static void main(String[] args) {
-
-    if (args.length != 8) {
-      System.out.println("Usage: GazetteerIndexer geonamesData 
geoNamesCountryInfo geonamesAdmin1CodesASCII "
-          + "usgsDataFile usgsGovUnitsFile outputIndexDir 
outputCountryContextFile regionsFile");
-      System.out.println();
-      System.out.println("The GazetteerIndexer.index methods javadoc explains 
how to retrieve the data files.");
-      return;
-    }
-
-    File geonamesData = new File(args[0]);
-    File geoNamesCountryInfo = new File(args[1]);
-    File geonamesAdmin1CodesASCII = new File(args[2]);
-    File usgsDataFile = new File(args[3]);
-    File usgsGovUnitsFile = new File(args[4]);
-    File outputIndexDir = new File(args[5]);
-    File outputCountryContextFile = new File(args[6]);
-    File regionsFile = new File(args[7]);
-
-    try {
-      GazetteerIndexer i = new GazetteerIndexer();
-      i.index(geonamesData,
-          geoNamesCountryInfo,
-          geonamesAdmin1CodesASCII,
-          usgsDataFile,
-          usgsGovUnitsFile,
-          outputIndexDir,
-          outputCountryContextFile,
-          regionsFile);
-    } catch (Exception ex) {
-      ex.printStackTrace();
-    }
-  }
-
-  public GazetteerIndexer() {
-
-  }
-
-  public static interface Separable {
-
-    String getSeparator();
-  }
-
-  public enum GazType implements Separable {
-
-    GEONAMES {
-          @Override
-          public String toString() {
-            return "/opennlp_geoentitylinker_geonames_idx";
-          }
-
-          @Override
-          public String getSeparator() {
-            return "\t";
-          }
-        },
-    USGS {
-          @Override
-          public String toString() {
-            return "/opennlp_geoentitylinker_usgsgaz_idx";
-          }
-
-          @Override
-          public String getSeparator() {
-            return "\\|";
-          }
-        }
-  }
-
-  /**
-   *
-   * @param geonamesData the actual Geonames gazetteer data downloaded from
-   * here: http://download.geonames.org/export/dump/ then click on this link
-   * 'allCountries.zip'
-   * @param geoNamesCountryInfo the countryinfo lookup table that can be
-   * downloaded from here
-   * http://download.geonames.org/export/dump/countryInfo.txt You'll need to
-   * copy the page into a file or scrape it
-   * @param geonamesAdmin1CodesASCII The lookup data for the province names for
-   * each place found here:
-   * http://download.geonames.org/export/dump/admin1CodesASCII.txt highlight 
the
-   * table view, and copy results into a text file. Make sure the tab 
delimitted
-   * format is maintained.
-   * @param usgsDataFile the actual USGS gazetteer downloaded from here:
-   * http://geonames.usgs.gov/domestic/download_data.htm click on the
-   * national_file####.zip link to get all the most recent features
-   *
-   * @param usgsGovUnitsFile go to here:
-   * http://geonames.usgs.gov/domestic/download_data.htm in the section titled
-   * "Topical Gazetteers -- File Format" click on the drop down list and select
-   * "Government Units". The downloaded file is what you need for this param.
-   * @param outputIndexDir where you want the final index. Must be a directory,
-   * not an actual file.
-   * @param outputCountryContextFile The output countrycontext file. THis is a
-   * very important file used inside the GeoEntityLinker to assist in toponym
-   * resolution.
-   * @param regionsFile this file contains a list of regions in the following
-   * format: tab delimitted text with index 0 as the name of the region, index 
1
-   * as the longitude, and index 2 as the latitude
-   * @throws Exception
-   */
-  public void index(File geonamesData, File geoNamesCountryInfo, File 
geonamesAdmin1CodesASCII,
-      File usgsDataFile, File usgsGovUnitsFile, File outputIndexDir, File 
outputCountryContextFile, File regionsFile) throws Exception {
-    if (!outputIndexDir.isDirectory()) {
-      throw new IllegalArgumentException("outputIndexDir must be a 
directory.");
-    }
-    if (!geonamesData.exists()) {
-      throw new FileNotFoundException("geonames data file does not exist");
-    }
-    if (!geoNamesCountryInfo.exists()) {
-      throw new FileNotFoundException("geoNamesCountryCodes data file does not 
exist");
-    }
-    if (!geonamesAdmin1CodesASCII.exists()) {
-      throw new FileNotFoundException("geonamesAdmin1CodesASCII data file does 
not exist");
-    }
-
-    if (!usgsDataFile.exists()) {
-      throw new FileNotFoundException("usgsDataFile data file does not exist");
-    }
-    if (!usgsGovUnitsFile.exists()) {
-      throw new FileNotFoundException("usgsGovUnitsFile data file does not 
exist");
-    }
-    if (!outputIndexDir.exists()) {
-      throw new FileNotFoundException("outputIndexDir data file does not 
exist");
-    }
-    if (!regionsFile.exists()) {
-      throw new FileNotFoundException("regionsFile data file does not exist");
-    }
-
-    String indexloc = outputIndexDir.getPath() + 
"/opennlp_geoentitylinker_gazetteer";
-    Directory index = new MMapDirectory(Paths.get(indexloc));
-    Analyzer a = new StandardAnalyzer(new CharArraySet(new ArrayList(), true));
-    Map<String, Analyzer> analyMap = new HashMap<>();
-
-    analyMap.put("countrycode", new KeywordAnalyzer());
-    analyMap.put("admincode", new KeywordAnalyzer());
-    analyMap.put("loctype", new KeywordAnalyzer());
-    analyMap.put("countycode", new KeywordAnalyzer());
-    analyMap.put("gazsource", new KeywordAnalyzer());
-
-    PerFieldAnalyzerWrapper aWrapper
-        = new PerFieldAnalyzerWrapper(a, analyMap);
-
-    IndexWriterConfig config = new IndexWriterConfig(aWrapper);
-
-    IndexWriter w = new IndexWriter(index, config);
-    
-    //write the column headers for the countryContextFile 
-    FileWriter countryContextFileWriter = new 
FileWriter(outputCountryContextFile, false);
-    String colNamesForCountryContextFile = 
"countrycode\tprovcode\tcountycode\tcountryname\tprovincename\tcountyname\tcountryregex\tprovregex\tcountyregex\n";
-    countryContextFileWriter.write(colNamesForCountryContextFile);
-    countryContextFileWriter.flush();
-    countryContextFileWriter.close();
-    
-    
-    USGSProcessor.process(usgsGovUnitsFile, usgsDataFile, 
outputCountryContextFile, w);
-
-    GeonamesProcessor.process(geoNamesCountryInfo, geonamesAdmin1CodesASCII, 
geonamesData, outputCountryContextFile, w);
-
-    RegionProcessor.process(regionsFile, outputCountryContextFile, w);
-    w.commit();
-    w.close();
-    System.out.println("\nIndexing complete. Be sure to add '" + indexloc + "' 
and context file '" + outputCountryContextFile.getPath() + "' to 
entitylinker.properties file");
-  }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.addons.geoentitylinker.indexing;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileWriter;
+import java.nio.file.Paths;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.core.KeywordAnalyzer;
+import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.MMapDirectory;
+
+/**
+ * Builds the lucene gazetteer index for use in GeoEntityLinker from the USGS,
+ * GeoNames and regions data files, and writes the country context file alongside it.
+ */
+public class GazetteerIndexer {
+
+  /**
+   * Command line entry point; expects exactly eight file arguments and prints a
+   * usage message otherwise. Any failure during indexing is printed, not rethrown.
+   *
+   * @param args the eight paths, in the order of the parameters of index
+   */
+  public static void main(String[] args) {
+
+    if (args.length != 8) {
+      System.out.println("Usage: GazetteerIndexer geonamesData geoNamesCountryInfo geonamesAdmin1CodesASCII "
+          + "usgsDataFile usgsGovUnitsFile outputIndexDir outputCountryContextFile regionsFile");
+      System.out.println();
+      System.out.println("The GazetteerIndexer.index methods javadoc explains how to retrieve the data files.");
+      return;
+    }
+
+    File geonamesData = new File(args[0]);
+    File geoNamesCountryInfo = new File(args[1]);
+    File geonamesAdmin1CodesASCII = new File(args[2]);
+    File usgsDataFile = new File(args[3]);
+    File usgsGovUnitsFile = new File(args[4]);
+    File outputIndexDir = new File(args[5]);
+    File outputCountryContextFile = new File(args[6]);
+    File regionsFile = new File(args[7]);
+
+    try {
+      GazetteerIndexer i = new GazetteerIndexer();
+      i.index(geonamesData,
+          geoNamesCountryInfo,
+          geonamesAdmin1CodesASCII,
+          usgsDataFile,
+          usgsGovUnitsFile,
+          outputIndexDir,
+          outputCountryContextFile,
+          regionsFile);
+    } catch (Exception ex) {
+      ex.printStackTrace();
+    }
+  }
+
+  public GazetteerIndexer() {
+
+  }
+
+  /**
+   * Implemented by gazetteer types that know the field separator of their data file.
+   */
+  public interface Separable {
+
+    String getSeparator();
+  }
+
+  /**
+   * The supported gazetteer sources. Each constant overrides toString() with a
+   * path-like name and supplies the separator string of its data file.
+   */
+  public enum GazType implements Separable {
+
+    GEONAMES {
+      @Override
+      public String toString() {
+        return "/opennlp_geoentitylinker_geonames_idx";
+      }
+
+      @Override
+      public String getSeparator() {
+        return "\t";
+      }
+    },
+    USGS {
+      @Override
+      public String toString() {
+        return "/opennlp_geoentitylinker_usgsgaz_idx";
+      }
+
+      // presumably consumed as a regular expression (escaped pipe) -- verify against callers
+      @Override
+      public String getSeparator() {
+        return "\\|";
+      }
+    }
+  }
+
+  /**
+   * Validates the inputs, creates the index under
+   * outputIndexDir/opennlp_geoentitylinker_gazetteer, (re)writes the header row of
+   * the country context file and then runs the USGS, GeoNames and region processors
+   * against the same index writer.
+   *
+   * @param geonamesData the actual Geonames gazetteer data downloaded from
+   * here: http://download.geonames.org/export/dump/ then click on this link
+   * 'allCountries.zip'
+   * @param geoNamesCountryInfo the countryinfo lookup table that can be
+   * downloaded from here
+   * http://download.geonames.org/export/dump/countryInfo.txt You'll need to
+   * copy the page into a file or scrape it
+   * @param geonamesAdmin1CodesASCII The lookup data for the province names for
+   * each place found here:
+   * http://download.geonames.org/export/dump/admin1CodesASCII.txt highlight the
+   * table view, and copy results into a text file. Make sure the tab delimited
+   * format is maintained.
+   * @param usgsDataFile the actual USGS gazetteer downloaded from here:
+   * http://geonames.usgs.gov/domestic/download_data.htm click on the
+   * national_file####.zip link to get all the most recent features
+   *
+   * @param usgsGovUnitsFile go to here:
+   * http://geonames.usgs.gov/domestic/download_data.htm in the section titled
+   * "Topical Gazetteers -- File Format" click on the drop down list and select
+   * "Government Units". The downloaded file is what you need for this param.
+   * @param outputIndexDir where you want the final index. Must be an existing
+   * directory, not an actual file.
+   * @param outputCountryContextFile The output countrycontext file. This is a
+   * very important file used inside the GeoEntityLinker to assist in toponym
+   * resolution. Any existing content is overwritten.
+   * @param regionsFile this file contains a list of regions in the following
+   * format: tab delimited text with index 0 as the name of the region, index 1
+   * as the longitude, and index 2 as the latitude
+   * @throws IllegalArgumentException if outputIndexDir is not an existing directory
+   * @throws FileNotFoundException if one of the input files does not exist
+   * @throws Exception if writing the context file or the index fails
+   */
+  public void index(File geonamesData, File geoNamesCountryInfo, File geonamesAdmin1CodesASCII,
+      File usgsDataFile, File usgsGovUnitsFile, File outputIndexDir,
+      File outputCountryContextFile, File regionsFile) throws Exception {
+    // isDirectory() is false for a missing path as well, so no separate exists() check
+    if (!outputIndexDir.isDirectory()) {
+      throw new IllegalArgumentException("outputIndexDir must be a directory.");
+    }
+    if (!geonamesData.exists()) {
+      throw new FileNotFoundException("geonames data file does not exist");
+    }
+    if (!geoNamesCountryInfo.exists()) {
+      throw new FileNotFoundException("geoNamesCountryCodes data file does not exist");
+    }
+    if (!geonamesAdmin1CodesASCII.exists()) {
+      throw new FileNotFoundException("geonamesAdmin1CodesASCII data file does not exist");
+    }
+
+    if (!usgsDataFile.exists()) {
+      throw new FileNotFoundException("usgsDataFile data file does not exist");
+    }
+    if (!usgsGovUnitsFile.exists()) {
+      throw new FileNotFoundException("usgsGovUnitsFile data file does not exist");
+    }
+    if (!regionsFile.exists()) {
+      throw new FileNotFoundException("regionsFile data file does not exist");
+    }
+
+    String indexloc = outputIndexDir.getPath() + "/opennlp_geoentitylinker_gazetteer";
+    Directory index = new MMapDirectory(Paths.get(indexloc));
+    // default analyzer with an empty stop word set
+    Analyzer a = new StandardAnalyzer(new CharArraySet(new ArrayList<String>(), true));
+    Map<String, Analyzer> analyMap = new HashMap<>();
+
+    // code-like fields use a KeywordAnalyzer instead of the default analyzer
+    analyMap.put("countrycode", new KeywordAnalyzer());
+    analyMap.put("admincode", new KeywordAnalyzer());
+    analyMap.put("loctype", new KeywordAnalyzer());
+    analyMap.put("countycode", new KeywordAnalyzer());
+    analyMap.put("gazsource", new KeywordAnalyzer());
+
+    PerFieldAnalyzerWrapper aWrapper
+        = new PerFieldAnalyzerWrapper(a, analyMap);
+
+    IndexWriterConfig config = new IndexWriterConfig(aWrapper);
+
+    // NOTE(review): the writer is not closed when a processor throws -- confirm
+    // whether a rollback/close on failure is wanted.
+    IndexWriter w = new IndexWriter(index, config);
+
+    // write the column headers for the countryContextFile; the writer is closed
+    // even if the write fails
+    String colNamesForCountryContextFile =
+        "countrycode\tprovcode\tcountycode\tcountryname\tprovincename\tcountyname"
+            + "\tcountryregex\tprovregex\tcountyregex\n";
+    try (FileWriter countryContextFileWriter = new FileWriter(outputCountryContextFile, false)) {
+      countryContextFileWriter.write(colNamesForCountryContextFile);
+    }
+
+    USGSProcessor.process(usgsGovUnitsFile, usgsDataFile, outputCountryContextFile, w);
+
+    GeonamesProcessor.process(geoNamesCountryInfo, geonamesAdmin1CodesASCII,
+        geonamesData, outputCountryContextFile, w);
+
+    RegionProcessor.process(regionsFile, outputCountryContextFile, w);
+    w.commit();
+    w.close();
+    System.out.println("\nIndexing complete. Be sure to add '" + indexloc
+        + "' and context file '" + outputCountryContextFile.getPath()
+        + "' to entitylinker.properties file");
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/opennlp-addons/blob/9adc2525/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeonamesFileDownloader.java
----------------------------------------------------------------------
diff --git 
a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeonamesFileDownloader.java
 
b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeonamesFileDownloader.java
index 4afd96d..fec3310 100644
--- 
a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeonamesFileDownloader.java
+++ 
b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeonamesFileDownloader.java
@@ -1,139 +1,141 @@
-/*
- * Copyright 2014 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package opennlp.addons.geoentitylinker.indexing;
-
-import java.io.BufferedOutputStream;
-import java.io.File;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.net.URL;
-import java.net.URLConnection;
-import java.util.Enumeration;
-import java.util.zip.ZipEntry;
-import java.util.zip.ZipFile;
-
-public class GeonamesFileDownloader {
-
-  final static int size = 1024;
-  private static final String ALL_COUNTRIES = 
"http://download.geonames.org/export/dump/ZM.zip";;
-  private static final String COUNTRY_INFO = "";
-  private static final String ADM1_LOOKUP = "";
-
-  public static void main(String[] args) {
-    downloadGeonamesFiles(COUNTRY_INFO, "c:\\temp\\gazetteers");
-  }
-
-  public static void downloadGeonamesFiles(String outputFileName, String 
outputDir) {
-    String fileDownload = fileDownload(ALL_COUNTRIES, outputDir);
-
-    unzipMyZip(fileDownload, outputDir);
-
-    fileDownload(COUNTRY_INFO, outputDir);
-    fileDownload(ADM1_LOOKUP, outputDir);
-
-  }
-
-  public static final void writeFile(InputStream in, OutputStream out)
-          throws IOException {
-    byte[] buffer = new byte[1024];
-    int len;
-
-    while ((len = in.read(buffer)) != 0) {
-      out.write(buffer, 0, len);
-    }
-
-    in.close();
-    out.close();
-  }
-
-  public static void unzipMyZip(String zipFileName,
-          String directoryToExtractTo) {
-    Enumeration entriesEnum;
-    ZipFile zip;
-    try {
-      zip = new ZipFile(zipFileName);
-      entriesEnum = zip.entries();
-      while (entriesEnum.hasMoreElements()) {
-        ZipEntry entry = (ZipEntry) entriesEnum.nextElement();
-        InputStream is = zip.getInputStream(entry); // get the input stream
-        OutputStream os = new java.io.FileOutputStream(new 
File(zipFileName.replace("\\.zip", ".txt")));
-        byte[] buf = new byte[4096];
-        int r;
-        while ((r = is.read(buf)) != -1) {
-          os.write(buf, 0, r);
-        }
-        os.close();
-        is.close();
-      }
-    } catch (IOException ioe) {
-      System.err.println("Some Exception Occurred:");
-      ioe.printStackTrace();
-      return;
-    }
-  }
-
-  public static String fileUrl(String fAddress, String localFileName, String 
destinationDir) {
-    OutputStream outStream = null;
-    URLConnection uCon = null;
-    String filename = destinationDir + "\\" + localFileName;
-    InputStream is = null;
-    try {
-      URL Url;
-      byte[] buf;
-      int ByteRead, ByteWritten = 0;
-      Url = new URL(fAddress);
-      outStream = new BufferedOutputStream(new FileOutputStream(destinationDir 
+ "\\" + localFileName));
-
-      uCon = Url.openConnection();
-      is = uCon.getInputStream();
-      buf = new byte[size];
-      while ((ByteRead = is.read(buf)) != -1) {
-        outStream.write(buf, 0, ByteRead);
-        ByteWritten += ByteRead;
-      }
-      System.out.println("Downloaded Successfully.");
-      System.out.println("File name:\"" + localFileName + "\"\nNo ofbytes :" + 
ByteWritten);
-    } catch (Exception e) {
-      e.printStackTrace();
-    } finally {
-      try {
-        is.close();
-        outStream.close();
-      } catch (IOException e) {
-        e.printStackTrace();
-      }
-    }
-    return filename;
-  }
-
-  public static String fileDownload(String fAddress, String destinationDir) {
-    int slashIndex = fAddress.lastIndexOf('/');
-    int periodIndex = fAddress.lastIndexOf('.');
-
-    String fileName = fAddress.substring(slashIndex + 1);
-    String retFileName = "";
-    if (periodIndex >= 1 && slashIndex >= 0
-            && slashIndex < fAddress.length() - 1) {
-      retFileName = fileUrl(fAddress, fileName, destinationDir);
-    } else {
-      System.err.println("path or file name.");
-    }
-    return retFileName;
-  }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.addons.geoentitylinker.indexing;
+
+import java.io.BufferedOutputStream;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.net.URL;
+import java.net.URLConnection;
+import java.util.Enumeration;
+import java.util.zip.ZipEntry;
+import java.util.zip.ZipFile;
+
+/**
+ * Downloads GeoNames dump files over a URL connection and unpacks the downloaded
+ * zip archive.
+ */
+public class GeonamesFileDownloader {
+
+  // copy buffer size in bytes
+  static final int size = 1024;
+  // NOTE(review): despite the name this points at the ZM.zip country dump
+  private static final String ALL_COUNTRIES = "http://download.geonames.org/export/dump/ZM.zip";
+  private static final String COUNTRY_INFO = "";
+  private static final String ADM1_LOOKUP = "";
+
+  public static void main(String[] args) {
+    downloadGeonamesFiles(COUNTRY_INFO, "c:\\temp\\gazetteers");
+  }
+
+  /**
+   * Downloads and unzips the gazetteer archive, then downloads the two lookup files.
+   *
+   * @param outputFileName currently unused
+   * @param outputDir the directory that receives the downloaded and extracted files
+   */
+  public static void downloadGeonamesFiles(String outputFileName, String outputDir) {
+    String fileDownload = fileDownload(ALL_COUNTRIES, outputDir);
+
+    unzipMyZip(fileDownload, outputDir);
+
+    fileDownload(COUNTRY_INFO, outputDir);
+    fileDownload(ADM1_LOOKUP, outputDir);
+
+  }
+
+  /**
+   * Copies everything from in to out and closes both streams, also on failure.
+   *
+   * @param in the stream to read until end of stream
+   * @param out the stream that receives the bytes
+   * @throws IOException if reading, writing or closing fails
+   */
+  public static final void writeFile(InputStream in, OutputStream out)
+          throws IOException {
+    try (InputStream source = in; OutputStream sink = out) {
+      byte[] buffer = new byte[size];
+      int len;
+
+      // read() signals end of stream with -1, not 0
+      while ((len = source.read(buffer)) != -1) {
+        sink.write(buffer, 0, len);
+      }
+    }
+  }
+
+  /**
+   * Extracts every entry of the zip file below directoryToExtractTo, keeping the
+   * entry names. I/O errors are reported on System.err and end the extraction.
+   *
+   * @param zipFileName the path of the zip archive to read
+   * @param directoryToExtractTo the directory that receives the extracted entries
+   */
+  public static void unzipMyZip(String zipFileName,
+          String directoryToExtractTo) {
+    File targetDir = new File(directoryToExtractTo);
+    try (ZipFile zip = new ZipFile(zipFileName)) {
+      String targetRoot = targetDir.getCanonicalPath() + File.separator;
+      Enumeration<? extends ZipEntry> entriesEnum = zip.entries();
+      while (entriesEnum.hasMoreElements()) {
+        ZipEntry entry = entriesEnum.nextElement();
+        File target = new File(targetDir, entry.getName());
+        // refuse entry names such as "../x" that would land outside the target directory
+        if (!target.getCanonicalPath().startsWith(targetRoot)) {
+          throw new IOException("Zip entry outside of target directory: " + entry.getName());
+        }
+        if (entry.isDirectory()) {
+          target.mkdirs();
+          continue;
+        }
+        File parent = target.getParentFile();
+        if (parent != null) {
+          parent.mkdirs();
+        }
+        try (InputStream is = zip.getInputStream(entry);
+             OutputStream os = new FileOutputStream(target)) {
+          byte[] buf = new byte[4096];
+          int r;
+          while ((r = is.read(buf)) != -1) {
+            os.write(buf, 0, r);
+          }
+        }
+      }
+    } catch (IOException ioe) {
+      System.err.println("Some Exception Occurred:");
+      ioe.printStackTrace();
+    }
+  }
+
+  /**
+   * Downloads the content behind fAddress into destinationDir/localFileName.
+   * Failures are printed and not rethrown.
+   *
+   * @param fAddress the URL to download
+   * @param localFileName the name of the file to create
+   * @param destinationDir the directory in which the file is created
+   * @return the path of the local file, whether or not the download succeeded
+   */
+  public static String fileUrl(String fAddress, String localFileName, String destinationDir) {
+    // File joins with the platform separator instead of a hard-coded backslash
+    String filename = new File(destinationDir, localFileName).getPath();
+    try {
+      URL url = new URL(fAddress);
+      URLConnection uCon = url.openConnection();
+      int byteWritten = 0;
+      // both streams are closed here, including when opening the second one fails
+      try (OutputStream outStream = new BufferedOutputStream(new FileOutputStream(filename));
+           InputStream is = uCon.getInputStream()) {
+        byte[] buf = new byte[size];
+        int byteRead;
+        while ((byteRead = is.read(buf)) != -1) {
+          outStream.write(buf, 0, byteRead);
+          byteWritten += byteRead;
+        }
+      }
+      System.out.println("Downloaded Successfully.");
+      System.out.println("File name:\"" + localFileName + "\"\nNo ofbytes :" + byteWritten);
+    } catch (Exception e) {
+      e.printStackTrace();
+    }
+    return filename;
+  }
+
+  /**
+   * Derives the local file name from the last path segment of fAddress and
+   * downloads the file into destinationDir.
+   *
+   * @param fAddress the URL to download; needs a '/' followed by a name and a '.'
+   * @param destinationDir the directory in which the file is created
+   * @return the local file path, or an empty string when fAddress has no usable file name
+   */
+  public static String fileDownload(String fAddress, String destinationDir) {
+    int slashIndex = fAddress.lastIndexOf('/');
+    int periodIndex = fAddress.lastIndexOf('.');
+
+    String fileName = fAddress.substring(slashIndex + 1);
+    String retFileName = "";
+    if (periodIndex >= 1 && slashIndex >= 0
+            && slashIndex < fAddress.length() - 1) {
+      retFileName = fileUrl(fAddress, fileName, destinationDir);
+    } else {
+      System.err.println("path or file name.");
+    }
+    return retFileName;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/opennlp-addons/blob/9adc2525/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeonamesProcessor.java
----------------------------------------------------------------------
diff --git 
a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeonamesProcessor.java
 
b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeonamesProcessor.java
index 8b57aaa..30cdea5 100644
--- 
a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeonamesProcessor.java
+++ 
b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeonamesProcessor.java
@@ -1,294 +1,298 @@
-/*
- * Copyright 2014 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package opennlp.addons.geoentitylinker.indexing;
-
-import java.io.BufferedReader;
-import java.io.File;
-import java.io.FileReader;
-import java.io.FileWriter;
-import java.io.IOException;
-import java.util.Arrays;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-import opennlp.addons.geoentitylinker.AdminBoundary;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.document.StringField;
-import org.apache.lucene.document.TextField;
-import org.apache.lucene.index.IndexWriter;
-
-public class GeonamesProcessor {
-
-  public static void process(File countryCodesLookupFile, File 
adm1CodesLookupFile, File geonamesGazetteerFile, File outputCountryContextFile, 
IndexWriter w) throws Exception {
-    Map<String, String> countryCodes = getCountryCodes(countryCodesLookupFile);
-
-    Map<String, AdminBoundary> adm1s = getProvData(adm1CodesLookupFile, 
countryCodes);
-    //  List<AdminBoundary> adm2s = getCountryContextFromFile(new 
File("C:\\temp\\gazetteers\\geonamesdata\\admin2Codes.txt"));
-    //admin2Codes.txt
-
-    readFile(geonamesGazetteerFile, GazetteerIndexer.GazType.GEONAMES, adm1s, 
countryCodes, w);
-    //now append to the coutnry context file
-    writeCountryContextFile(outputCountryContextFile, adm1s);
-
-  }
-
-  public GeonamesProcessor() {
-  }
-
-  private static Map<String, AdminBoundary> getProvData(File 
adm1CodesLookupFile, Map<String, String> ccodes) {
-    System.out.println("Attempting to read geonames province data from: " + 
adm1CodesLookupFile.getPath());
-
-    Map<String, AdminBoundary> outmap = new HashMap<>();
-    BufferedReader reader;
-    Set<String> nullcodes = new HashSet<>();
-    try {
-
-      reader = new BufferedReader(new FileReader(adm1CodesLookupFile));
-      int i = 0;
-      String line = "";
-      while ((line = reader.readLine()) != null) {
-
-        // String line = reader.readLine();
-        String[] values = line.split("\t");
-        if (values.length != 4) {
-          throw new IOException("improperly formatted province lookup file");
-        }
-        String ccode = values[0].toLowerCase();
-
-        String[] split = ccode.split("\\.");
-        String pcode = "";
-        if (split.length == 2) {
-          //System.out.println(split);
-          ccode = split[0];
-          pcode = split[1];
-        }
-
-        String pname = values[2];
-
-        if (ccode.matches("[0-9].*")) {
-          String code = ccode;
-          ccode = pcode;
-          pcode = code;
-        }
-
-        String cname = ccodes.get(ccode);
-
-        if (cname == null) {
-          nullcodes.add(ccode);
-        }
-        AdminBoundary data = new AdminBoundary(ccode, cname, pcode, pname, 
"NO_DATA_FOUND", "NO_DATA_FOUND", cname, pname, "NO_DATA_FOUND");
-        //  System.out.println(data);
-        outmap.put(ccode + "." + pcode, data);
-
-      }
-      System.out.println("INFO: there were " + nullcodes.size() + " null prov 
codes. This is due to inconsistencies in reference data.");
-      reader.close();
-    } catch (IOException ex) {
-      ex.printStackTrace();
-    }
-    System.out.println("Successfully read geonames province data from: " + 
adm1CodesLookupFile.getPath());
-
-    return outmap;
-
-  }
-
-  private static Map<String, String> getCountryCodes(File countryContextFile) {
-    Map<String, String> ccs = new HashMap<>();
-    BufferedReader reader;
-    try {
-
-      reader = new BufferedReader(new FileReader(countryContextFile));
-      int i = 0;
-      String line = "";
-      boolean start = false;
-      while ((line = reader.readLine()) != null) {
-        if (!line.toLowerCase().startsWith("#iso\t") && !start) {
-
-          continue;
-        } else {
-          start = true;
-        }
-        String[] values = line.split("\t");
-
-        String ccode = values[0].toLowerCase();//this is the 2 digit ISO code
-        String cname = values[4].toLowerCase();
-        if (!ccode.equals("")) {
-          ccs.put(ccode, cname);
-        }
-
-      }
-      reader.close();
-    } catch (IOException ex) {
-      ex.printStackTrace();
-    }
-    ccs.put("SS", "South Sudan");
-    ccs.put("CS", "Kosovo");
-    return ccs;
-
-  }
-
-  public static void writeCountryContextFile(File outfile, Map<String, 
AdminBoundary> adms) {
-    // FileWriter writer = null;
-    try (FileWriter writer = new FileWriter(outfile, true)) {
-
-      for (String admKey : adms.keySet()) {
-        AdminBoundary adm = adms.get(admKey);
-        if (adm == null) {
-          continue;
-        }
-        String province = adm.getProvinceName();
-        String country = adm.getCountryName();
-
-        String line = adm.getCountryCode() + "\t" + adm.getProvCode() + "\t" + 
"" + "\t" + country + "\t" + province + "\t" + "" + "\t" + "(" + country + ")" 
+ "\t"
-            + adm.getProvinceName() + "\t" + adm.getCountyName() + "\n";
-        writer.write(line);
-        // System.out.println(line);
-
-      }
-
-    } catch (IOException ex) {
-      ex.printStackTrace();
-    }
-    System.out.println("successfully wrote Geonames entries to country oontext 
file");
-  }
-
-  /**
-   *
-   * @param gazateerInputData the Geonames allCounties.txt file
-   * @param type the types of gaz entry, usgs, geonames, or regions
-   * @param adms the province info
-   * @param countrycodes the country code info
-   * @param w the lucene index writer
-   * @throws Exception
-   */
-  public static void readFile(File gazateerInputData, GazetteerIndexer.GazType 
type, Map<String, AdminBoundary> adms, Map<String, String> countrycodes, 
IndexWriter w) throws Exception {
-
-    BufferedReader reader = new BufferedReader(new 
FileReader(gazateerInputData));
-    String[] boosts = "ADM1 ADM1H ADM2 ADM2H ADM3 ADM3H ADM4 ADM4H ADM5 ADMD 
ADMDH PCLD PCLH PCLI PCLIX TERR PCLIX PPL PPLA PPLA2 PPLA3 PPLA4 PPLC PPLCH 
PPLF PPLG PPLH PPLL PPLQ PPLR PPLS PPLX STLMT".split(" ");
-    Map<String, Float> boostMap = new HashMap<>();
-    for (String boost : boosts) {
-      boostMap.put(boost.toLowerCase(), 10f);
-    }
-    String[] fieldStrings = new String[]{
-      "geonameid",
-      "name",
-      "asciiname",
-      "alternatenames",
-      "latitude",
-      "longitude",
-      "feature_class",
-      "feature_code",
-      "country code",
-      "cc2",
-      "admin1_code",
-      "admin2_code",
-      "admin3_code",
-      "admin4_code",
-      "population",
-      "elevation",
-      "dem ",
-      "timezone",
-      "modification_date"};
-
-    List<String> fields = Arrays.asList(fieldStrings);
-    int counter = 0;
-    System.out.println("reading gazetteer data from file...........");
-    String line = "";
-    while ((line = reader.readLine()) != null) {
-      String[] values = line.split(type.getSeparator());
-
-      Document doc = new Document();
-      String admincode = values[10].toLowerCase();
-      String ccode = values[8].toLowerCase();
-      if (ccode.contains(",")) {
-        String[] codes = ccode.split(",");
-        if (codes.length > 0) {
-          ccode = codes[0];
-        }
-      }
-      AdminBoundary adm = adms.get(ccode + "." + admincode);
-
-      String placeName = values[2];
-      String lat = values[4];
-      String lon = values[5];
-      String dsg = values[7].toLowerCase();
-
-      String id = values[0];
-      String concatIndexEntry = "";
-      String countryname = "";
-      if (adm != null) {
-        concatIndexEntry = adm.getCountryName() + ", " + adm.getProvinceName() 
+ ", " + placeName;
-        countryname = adm.getCountryName();
-      } else {
-        //there is no admin info, but we can still use the countrycode to 
concat the country name
-        String n = countrycodes.get(ccode);
-        countryname = n;
-        if (n != null) {
-          concatIndexEntry = n + ", " + placeName;
-        } else {
-          ///don't want a single token hierarchy entry.
-          concatIndexEntry = "";
-        }
-      }
-      if (ccode == null) {
-        System.out.println("naughty country code");
-      }
-      for (int i = 0; i < fields.size() - 1; i++) {
-        doc.add(new TextField(fields.get(i), values[i].trim(), 
Field.Store.YES));
-
-      }
-      if (dsg.equals("pcli")) {
-        System.out.println("placename: " + placeName + " RESET TO: " + 
countryname);
-        placeName = countryname;
-      }
-      /**
-       * add standard fields to the index
-       */
-      doc.add(new TextField("hierarchy", concatIndexEntry, Field.Store.YES));
-      doc.add(new TextField("placename", placeName, Field.Store.YES));
-      // doc.add(new TextField("countryname", countryname, Field.Store.YES));
-      //System.out.println(placeName);
-
-      doc.add(new TextField("latitude", lat, Field.Store.YES));
-      doc.add(new TextField("longitude", lon, Field.Store.YES));
-      doc.add(new StringField("loctype", dsg, Field.Store.YES));
-      doc.add(new StringField("admincode", (ccode + "." + 
admincode).toLowerCase(), Field.Store.YES));
-      doc.add(new StringField("countrycode", ccode.toLowerCase(), 
Field.Store.YES));
-      doc.add(new StringField("countycode", "", Field.Store.YES));
-      doc.add(new StringField("locid", id, Field.Store.YES));
-      placeName = placeName.replace("republic of", "").replace("federative", 
"");
-      if (id.equals("3175395")) {
-        System.out.println(placeName);
-      }
-      doc.add(new StringField("gazsource", "geonames", Field.Store.YES));
-
-      w.addDocument(doc);
-
-      counter++;
-      if (counter % 100000 == 0) {
-        w.commit();
-        System.out.println(counter + " .........Geonames entries committed to 
index..............");
-      }
-
-    }
-
-    System.out.println("Completed indexing geonames gaz! index name is: " + 
type.toString());
-  }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.addons.geoentitylinker.indexing;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileReader;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.StringField;
+import org.apache.lucene.document.TextField;
+import org.apache.lucene.index.IndexWriter;
+
+import opennlp.addons.geoentitylinker.AdminBoundary;
+
+public class GeonamesProcessor {
+
+  public static void process(File countryCodesLookupFile, File 
adm1CodesLookupFile, File geonamesGazetteerFile, File outputCountryContextFile, 
IndexWriter w) throws Exception {
+    Map<String, String> countryCodes = getCountryCodes(countryCodesLookupFile);
+
+    Map<String, AdminBoundary> adm1s = getProvData(adm1CodesLookupFile, 
countryCodes);
+    //  List<AdminBoundary> adm2s = getCountryContextFromFile(new 
File("C:\\temp\\gazetteers\\geonamesdata\\admin2Codes.txt"));
+    //admin2Codes.txt
+
+    readFile(geonamesGazetteerFile, GazetteerIndexer.GazType.GEONAMES, adm1s, 
countryCodes, w);
+    //now append to the coutnry context file
+    writeCountryContextFile(outputCountryContextFile, adm1s);
+
+  }
+
+  public GeonamesProcessor() {
+  }
+
+  private static Map<String, AdminBoundary> getProvData(File 
adm1CodesLookupFile, Map<String, String> ccodes) {
+    System.out.println("Attempting to read geonames province data from: " + 
adm1CodesLookupFile.getPath());
+
+    Map<String, AdminBoundary> outmap = new HashMap<>();
+    BufferedReader reader;
+    Set<String> nullcodes = new HashSet<>();
+    try {
+
+      reader = new BufferedReader(new FileReader(adm1CodesLookupFile));
+      int i = 0;
+      String line = "";
+      while ((line = reader.readLine()) != null) {
+
+        // String line = reader.readLine();
+        String[] values = line.split("\t");
+        if (values.length != 4) {
+          throw new IOException("improperly formatted province lookup file");
+        }
+        String ccode = values[0].toLowerCase();
+
+        String[] split = ccode.split("\\.");
+        String pcode = "";
+        if (split.length == 2) {
+          //System.out.println(split);
+          ccode = split[0];
+          pcode = split[1];
+        }
+
+        String pname = values[2];
+
+        if (ccode.matches("[0-9].*")) {
+          String code = ccode;
+          ccode = pcode;
+          pcode = code;
+        }
+
+        String cname = ccodes.get(ccode);
+
+        if (cname == null) {
+          nullcodes.add(ccode);
+        }
+        AdminBoundary data = new AdminBoundary(ccode, cname, pcode, pname, 
"NO_DATA_FOUND", "NO_DATA_FOUND", cname, pname, "NO_DATA_FOUND");
+        //  System.out.println(data);
+        outmap.put(ccode + "." + pcode, data);
+
+      }
+      System.out.println("INFO: there were " + nullcodes.size() + " null prov 
codes. This is due to inconsistencies in reference data.");
+      reader.close();
+    } catch (IOException ex) {
+      ex.printStackTrace();
+    }
+    System.out.println("Successfully read geonames province data from: " + 
adm1CodesLookupFile.getPath());
+
+    return outmap;
+
+  }
+
+  private static Map<String, String> getCountryCodes(File countryContextFile) {
+    Map<String, String> ccs = new HashMap<>();
+    BufferedReader reader;
+    try {
+
+      reader = new BufferedReader(new FileReader(countryContextFile));
+      int i = 0;
+      String line = "";
+      boolean start = false;
+      while ((line = reader.readLine()) != null) {
+        if (!line.toLowerCase().startsWith("#iso\t") && !start) {
+
+          continue;
+        } else {
+          start = true;
+        }
+        String[] values = line.split("\t");
+
+        String ccode = values[0].toLowerCase();//this is the 2 digit ISO code
+        String cname = values[4].toLowerCase();
+        if (!ccode.equals("")) {
+          ccs.put(ccode, cname);
+        }
+
+      }
+      reader.close();
+    } catch (IOException ex) {
+      ex.printStackTrace();
+    }
+    ccs.put("SS", "South Sudan");
+    ccs.put("CS", "Kosovo");
+    return ccs;
+
+  }
+
+  public static void writeCountryContextFile(File outfile, Map<String, 
AdminBoundary> adms) {
+    // FileWriter writer = null;
+    try (FileWriter writer = new FileWriter(outfile, true)) {
+
+      for (String admKey : adms.keySet()) {
+        AdminBoundary adm = adms.get(admKey);
+        if (adm == null) {
+          continue;
+        }
+        String province = adm.getProvinceName();
+        String country = adm.getCountryName();
+
+        String line = adm.getCountryCode() + "\t" + adm.getProvCode() + "\t" + 
"" + "\t" + country + "\t" + province + "\t" + "" + "\t" + "(" + country + ")" 
+ "\t"
+            + adm.getProvinceName() + "\t" + adm.getCountyName() + "\n";
+        writer.write(line);
+        // System.out.println(line);
+
+      }
+
+    } catch (IOException ex) {
+      ex.printStackTrace();
+    }
+    System.out.println("successfully wrote Geonames entries to country oontext 
file");
+  }
+
+  /**
+   *
+   * @param gazateerInputData the Geonames allCounties.txt file
+   * @param type the types of gaz entry, usgs, geonames, or regions
+   * @param adms the province info
+   * @param countrycodes the country code info
+   * @param w the lucene index writer
+   * @throws Exception
+   */
+  public static void readFile(File gazateerInputData, GazetteerIndexer.GazType 
type, Map<String, AdminBoundary> adms, Map<String, String> countrycodes, 
IndexWriter w) throws Exception {
+
+    BufferedReader reader = new BufferedReader(new 
FileReader(gazateerInputData));
+    String[] boosts = "ADM1 ADM1H ADM2 ADM2H ADM3 ADM3H ADM4 ADM4H ADM5 ADMD 
ADMDH PCLD PCLH PCLI PCLIX TERR PCLIX PPL PPLA PPLA2 PPLA3 PPLA4 PPLC PPLCH 
PPLF PPLG PPLH PPLL PPLQ PPLR PPLS PPLX STLMT".split(" ");
+    Map<String, Float> boostMap = new HashMap<>();
+    for (String boost : boosts) {
+      boostMap.put(boost.toLowerCase(), 10f);
+    }
+    String[] fieldStrings = new String[]{
+      "geonameid",
+      "name",
+      "asciiname",
+      "alternatenames",
+      "latitude",
+      "longitude",
+      "feature_class",
+      "feature_code",
+      "country code",
+      "cc2",
+      "admin1_code",
+      "admin2_code",
+      "admin3_code",
+      "admin4_code",
+      "population",
+      "elevation",
+      "dem ",
+      "timezone",
+      "modification_date"};
+
+    List<String> fields = Arrays.asList(fieldStrings);
+    int counter = 0;
+    System.out.println("reading gazetteer data from file...........");
+    String line = "";
+    while ((line = reader.readLine()) != null) {
+      String[] values = line.split(type.getSeparator());
+
+      Document doc = new Document();
+      String admincode = values[10].toLowerCase();
+      String ccode = values[8].toLowerCase();
+      if (ccode.contains(",")) {
+        String[] codes = ccode.split(",");
+        if (codes.length > 0) {
+          ccode = codes[0];
+        }
+      }
+      AdminBoundary adm = adms.get(ccode + "." + admincode);
+
+      String placeName = values[2];
+      String lat = values[4];
+      String lon = values[5];
+      String dsg = values[7].toLowerCase();
+
+      String id = values[0];
+      String concatIndexEntry = "";
+      String countryname = "";
+      if (adm != null) {
+        concatIndexEntry = adm.getCountryName() + ", " + adm.getProvinceName() 
+ ", " + placeName;
+        countryname = adm.getCountryName();
+      } else {
+        //there is no admin info, but we can still use the countrycode to 
concat the country name
+        String n = countrycodes.get(ccode);
+        countryname = n;
+        if (n != null) {
+          concatIndexEntry = n + ", " + placeName;
+        } else {
+          ///don't want a single token hierarchy entry.
+          concatIndexEntry = "";
+        }
+      }
+      if (ccode == null) {
+        System.out.println("naughty country code");
+      }
+      for (int i = 0; i < fields.size() - 1; i++) {
+        doc.add(new TextField(fields.get(i), values[i].trim(), 
Field.Store.YES));
+
+      }
+      if (dsg.equals("pcli")) {
+        System.out.println("placename: " + placeName + " RESET TO: " + 
countryname);
+        placeName = countryname;
+      }
+      /**
+       * add standard fields to the index
+       */
+      doc.add(new TextField("hierarchy", concatIndexEntry, Field.Store.YES));
+      doc.add(new TextField("placename", placeName, Field.Store.YES));
+      // doc.add(new TextField("countryname", countryname, Field.Store.YES));
+      //System.out.println(placeName);
+
+      doc.add(new TextField("latitude", lat, Field.Store.YES));
+      doc.add(new TextField("longitude", lon, Field.Store.YES));
+      doc.add(new StringField("loctype", dsg, Field.Store.YES));
+      doc.add(new StringField("admincode", (ccode + "." + 
admincode).toLowerCase(), Field.Store.YES));
+      doc.add(new StringField("countrycode", ccode.toLowerCase(), 
Field.Store.YES));
+      doc.add(new StringField("countycode", "", Field.Store.YES));
+      doc.add(new StringField("locid", id, Field.Store.YES));
+      placeName = placeName.replace("republic of", "").replace("federative", 
"");
+      if (id.equals("3175395")) {
+        System.out.println(placeName);
+      }
+      doc.add(new StringField("gazsource", "geonames", Field.Store.YES));
+
+      w.addDocument(doc);
+
+      counter++;
+      if (counter % 100000 == 0) {
+        w.commit();
+        System.out.println(counter + " .........Geonames entries committed to 
index..............");
+      }
+
+    }
+
+    System.out.println("Completed indexing geonames gaz! index name is: " + 
type.toString());
+  }
+
+}

[3/4] opennlp-addons git commit: Fix checkstyle errors in geoentitylinker

Reply via email to