This is an automated email from the ASF dual-hosted git repository. mawiesne pushed a commit to branch OPENNLP-1700-Modernize-code-of-geolinker-addon in repository https://gitbox.apache.org/repos/asf/opennlp-addons.git
commit 9a12bdf411391fbea8f78ce3931456f6e90c4495 Author: Martin Wiesner <[email protected]> AuthorDate: Thu Jan 23 10:28:29 2025 +0100 OPENNLP-1700: Modernize code of geolinker addon - fixes use of raw types - fixes use of readers/writers without try-with-resources - adds constants where applicable - fixes indentations where necessary - adds JavaDoc where useful - removes superfluous JavaDoc --- .../addons/geoentitylinker/AdminBoundary.java | 105 +-------- .../geoentitylinker/AdminBoundaryContext.java | 27 +-- .../AdminBoundaryContextGenerator.java | 95 ++++---- .../geoentitylinker/CountryContextEntry.java | 101 +-------- .../addons/geoentitylinker/GazetteerEntry.java | 77 +++---- .../geoentitylinker/GazetteerSearchCache.java | 26 +-- .../addons/geoentitylinker/GazetteerSearcher.java | 20 +- .../addons/geoentitylinker/GeoEntityLinker.java | 71 +++--- .../geoentitylinker/indexing/GazetteerIndexer.java | 85 ++++--- .../indexing/GeonamesFileDownloader.java | 14 -- .../indexing/GeonamesProcessor.java | 251 ++++++++++----------- .../geoentitylinker/indexing/RegionProcessor.java | 97 ++++---- .../geoentitylinker/indexing/USGSProcessor.java | 49 ++-- .../scoring/CountryProximityScorer.java | 13 +- .../scoring/FuzzyStringMatchScorer.java | 23 +- .../scoring/GeoHashBinningScorer.java | 19 +- .../scoring/LinkedEntityScorer.java | 29 ++- .../geoentitylinker/scoring/ModelBasedScorer.java | 21 +- .../geoentitylinker/scoring/PlacetypeScorer.java | 13 +- .../geoentitylinker/scoring/PointClustering.java | 13 +- .../scoring/ProvinceProximityScorer.java | 27 ++- 21 files changed, 474 insertions(+), 702 deletions(-) diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundary.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundary.java index 15859c1..1eb02ee 100644 --- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundary.java +++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundary.java @@ -15,124 +15,39 @@ */ package opennlp.addons.geoentitylinker; -import java.util.Objects; - /** * Stores an admin boundary down to the US county level. Only US places from the * USGS Gazetteer will have county level info */ -public class AdminBoundary { +public record AdminBoundary(String countryCode, String countryName, String provinceCode, String provinceName, + String countyCode, String countyName, String countryRegex, String provinceRegex, + String countyRegex) { private static final String NO_DATA_FOUND_VALUE = "NO_DATA_FOUND"; - private final String countryCode; - private final String provinceCode; - private final String provinceName; - private final String countryName; - private final String countyName; - private final String countyCode; - private final String countryRegex; - private final String provinceRegex; - private final String countyRegex; - public AdminBoundary(String countryCode, String countryName, String provinceCode, String provinceName, String countyCode, String countyName, - String countryRegex, String provinceRegex, String countyRegex) { + String countryRegex, String provinceRegex, String countyRegex) { this.countryCode = countryCode; this.provinceCode = provinceCode; this.provinceName = provinceName; this.countryName = countryName; - this.countyName = countyName.equals("") ? NO_DATA_FOUND_VALUE : countyName; - this.countyCode = countyCode.equals("") ? NO_DATA_FOUND_VALUE : countyCode; + this.countyName = countyName.isEmpty() ? NO_DATA_FOUND_VALUE : countyName; + this.countyCode = countyCode.isEmpty() ? NO_DATA_FOUND_VALUE : countyCode; this.countryRegex = countryRegex; this.provinceRegex = provinceRegex; this.countyRegex = countyRegex; } - public String getCountryCode() { - return countryCode; - } - public String getProvCode() { return provinceCode; } - public String getProvinceName() { - return provinceName; - } - - public String getCountryName() { - return countryName; - } - - public String getCountyName() { - return countyName; - } - - public String getCountyCode() { - return countyCode; - } - @Override public String toString() { - return "AdminBoundary{" + "countryCode=" + countryCode + ", provinceCode=" + provinceCode + ", provinceName=" + provinceName + ", countryName=" + countryName + ", countyName=" + countyName + ", countyCode=" + countyCode + '}'; - } - - @Override - public int hashCode() { - int hash = 7; - hash = 11 * hash + Objects.hashCode(this.countryCode); - hash = 11 * hash + Objects.hashCode(this.provinceCode); - hash = 11 * hash + Objects.hashCode(this.provinceName); - hash = 11 * hash + Objects.hashCode(this.countryName); - hash = 11 * hash + Objects.hashCode(this.countyName); - hash = 11 * hash + Objects.hashCode(this.countyCode); - return hash; - } - - @Override - public boolean equals(Object obj) { - if (obj == null) { - return false; - } - if (getClass() != obj.getClass()) { - return false; - } - final AdminBoundary other = (AdminBoundary) obj; - if (!Objects.equals(this.countryCode, other.countryCode)) { - return false; - } - if (!Objects.equals(this.provinceCode, other.provinceCode)) { - return false; - } - if (!Objects.equals(this.provinceName, other.provinceName)) { - return false; - } - if (!Objects.equals(this.countryName, other.countryName)) { - return false; - } - if (!Objects.equals(this.countyName, other.countyName)) { - return false; - } - if (!Objects.equals(this.countyCode, other.countyCode)) { - return false; - } - return true; - } - - public String getProvinceCode() { - return provinceCode; - } - - public String getCountryRegex() { - return countryRegex; - } - - public String getProvinceRegex() { - return provinceRegex; - } - - public String getCountyRegex() { - return countyRegex; + return "AdminBoundary{" + "countryCode=" + countryCode + "," + + "provinceCode=" + provinceCode + ", provinceName=" + provinceName + ", " + + "countryName=" + countryName + ", countyName=" + countyName + ", " + + "countyCode=" + countyCode + '}'; } } diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContext.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContext.java index 85553c3..a78478e 100644 --- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContext.java +++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContext.java @@ -22,6 +22,7 @@ import java.util.Set; public class AdminBoundaryContext { + private static final String REGEX = ".*rg[0-9].*"; private final Map<String, Set<Integer>> countryMentions; private final Map<String, Set<Integer>> provMentions; private final Map<String, Set<Integer>> countyMentions; @@ -103,6 +104,18 @@ public class AdminBoundaryContext { return countyRefMap; } + public Map<String, String> getCountryRegexMap() { + return countryRegexMap; + } + + public Map<String, String> getCountyRegexMap() { + return countyRegexMap; + } + + public Map<String, String> getProvinceRegexMap() { + return provinceRegexMap; + } + public Set<String> getWhereClauses() { return whereClauses; } @@ -111,7 +124,7 @@ public class AdminBoundaryContext { Set<String> clauses = new HashSet<>(); for (String countryCode : this.getCountryHits()) { String gazType = countryCode.equalsIgnoreCase("us") ? " AND gazsource:usgs" : " AND gazsource:geonames"; - if (countryCode.toLowerCase().matches(".*rg[0-9].*")) { + if (countryCode.toLowerCase().matches(REGEX)) { gazType = " AND gazsource:region"; } Map<String, String> provsForCountry = this.getProvRefMap().get(countryCode); @@ -138,16 +151,4 @@ public class AdminBoundaryContext { return clauses; } - public Map<String, String> getCountryRegexMap() { - return countryRegexMap; - } - - public Map<String, String> getCountyRegexMap() { - return countyRegexMap; - } - - public Map<String, String> getProvinceRegexMap() { - return provinceRegexMap; - } - } diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContextGenerator.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContextGenerator.java index 166cfca..71e00a5 100644 --- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContextGenerator.java +++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContextGenerator.java @@ -27,17 +27,17 @@ import java.util.List; import java.util.Map; import java.util.Set; import java.util.logging.Level; - import java.util.regex.Matcher; import java.util.regex.Pattern; -import opennlp.tools.entitylinker.EntityLinkerProperties; + import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import opennlp.tools.entitylinker.EntityLinkerProperties; + /** * Finds instances of country mentions in a String, typically a document text. * Used to boost or degrade scoring of linked geo entities - * */ public class AdminBoundaryContextGenerator { @@ -46,23 +46,13 @@ public class AdminBoundaryContextGenerator { private Map<String, Set<String>> nameCodesMap = new HashMap<>(); private final Map<String, Set<Integer>> countryMentions = new HashMap<>(); - Map<String, String> countryRegexMap = new HashMap<>(); - Map<String, String> provinceRegexMap = new HashMap<>(); - Map<String, String> countyRegexMap = new HashMap<>(); + private final Map<String, String> countryRegexMap = new HashMap<>(); + private final Map<String, String> provinceRegexMap = new HashMap<>(); + private Map<String, String> countyRegexMap = new HashMap<>(); private final Set<CountryContextEntry> countryHits = new HashSet<>(); - private final EntityLinkerProperties properties; private final List<AdminBoundary> adminBoundaryData= new ArrayList<>(); private final Set<AdminBoundary> adminBoundaryHits = new HashSet<>(); - private AdminBoundaryContext context; - - public AdminBoundaryContext getContext(String text) { - context = null; - nameCodesMap.clear(); - context = process(text); - - return context; - } private final Set<String> countryHitSet = new HashSet<>(); private final Map<String, String> countryMap = new HashMap<>(); @@ -77,22 +67,31 @@ public class AdminBoundaryContextGenerator { public static void main(String[] args) { try { - AdminBoundaryContextGenerator countryContext - = new AdminBoundaryContextGenerator(new EntityLinkerProperties(new File("C:\\Temp\\gaz_data\\newCountryContextfile.txt"))); + AdminBoundaryContextGenerator countryContext = new AdminBoundaryContextGenerator( + new EntityLinkerProperties(new File("C:\\Temp\\gaz_data\\newCountryContextfile.txt"))); - AdminBoundaryContext c = countryContext.process("This artcle is about fairfax county virginia in the north of florida in the united states. It is also about Moscow and atlanta. Hillsborough county florida is a nice place. Eastern Africa people are cool."); + AdminBoundaryContext c = countryContext.process( + "This article is about fairfax county virginia in the north of florida in the united states. " + + "It is also about Moscow and atlanta. Hillsborough county florida is a nice place. " + + "Eastern Africa people are cool."); System.out.println(c); } catch (Exception ex) { java.util.logging.Logger.getLogger(AdminBoundaryContextGenerator.class.getName()).log(Level.SEVERE, null, ex); } } + public AdminBoundaryContext getContext(String text) { + nameCodesMap.clear(); + return process(text); + } + public AdminBoundaryContextGenerator(EntityLinkerProperties properties) throws IOException { - this.properties = properties; if (countrydata == null) { - String path = this.properties.getProperty("opennlp.geoentitylinker.countrycontext.filepath", ""); + String path = properties.getProperty("opennlp.geoentitylinker.countrycontext.filepath", ""); if (path == null || path.trim().isEmpty()) { - throw new IOException("missing country context data configuration. Property opennlp.geoentitylinker.countrycontext.filepath must have a valid path value in entitylinker properties file"); + throw new IOException("missing country context data configuration. " + + "Property opennlp.geoentitylinker.countrycontext.filepath " + + "must have a valid path value in entitylinker properties file"); } File countryContextFile = new File(path); if (countryContextFile == null || !countryContextFile.exists()) { @@ -264,7 +263,7 @@ public class AdminBoundaryContextGenerator { newset.add(start); mentions.put(code, newset); } - if (!hit.equals("")) { + if (!hit.isEmpty()) { if (this.nameCodesMap.containsKey(hit)) { nameCodesMap.get(hit).add(code); } else { @@ -333,40 +332,39 @@ public class AdminBoundaryContextGenerator { private void loadMaps(List<AdminBoundary> boundaries) { for (AdminBoundary adm : boundaries) { - if (!adm.getCountryCode().equals("null")) { - countryMap.put(adm.getCountryCode(), adm.getCountryName()); - if (countryRegexMap.containsKey(adm.getCountryCode())) { - String currentRegex = countryRegexMap.get(adm.getCountryCode()); - if (currentRegex.length() > adm.getCountryRegex().length()) { + if (!adm.countryCode().equals("null")) { + countryMap.put(adm.countryCode(), adm.countryName()); + if (countryRegexMap.containsKey(adm.countryCode())) { + String currentRegex = countryRegexMap.get(adm.countryCode()); + if (currentRegex.length() > adm.countryRegex().length()) { // the longest one wins if they are not all the same for each entry in the file - countryRegexMap.put(adm.getCountryCode(), currentRegex); + countryRegexMap.put(adm.countryCode(), currentRegex); }//else do nothing } else { - countryRegexMap.put(adm.getCountryCode(), adm.getCountryRegex()); + countryRegexMap.put(adm.countryCode(), adm.countryRegex()); } if (!adm.getProvCode().equals("null")) { - Map<String, String> provs = provMap.get(adm.getCountryCode()); + Map<String, String> provs = provMap.get(adm.countryCode()); if (provs == null) { provs = new HashMap<>(); } //if (!provs.containsKey(adm.getProvCode())) { - provs.put(adm.getCountryCode() + "." + adm.getProvCode(), adm.getProvinceName()); - provMap.put(adm.getCountryCode(), provs); + provs.put(adm.countryCode() + "." + adm.getProvCode(), adm.provinceName()); + provMap.put(adm.countryCode(), provs); // } - if (!adm.getCountyCode().equalsIgnoreCase("no_data_found") && !adm.getCountyName().equalsIgnoreCase("no_data_found")) { - Map<String, String> counties = countyMap.get(adm.getCountryCode() + "." + adm.getProvCode()); + if (!adm.countyCode().equalsIgnoreCase("no_data_found") && !adm.countyName().equalsIgnoreCase("no_data_found")) { + Map<String, String> counties = countyMap.get(adm.countryCode() + "." + adm.getProvCode()); if (counties == null) { counties = new HashMap<>(); - } // if (!counties.containsKey(adm.getCountyCode())) { - String countyid = adm.getCountryCode() + "." + adm.getProvCode() + "." + adm.getCountyCode(); - counties.put(countyid, adm.getCountyName()); - countyMap.put(adm.getCountryCode() + "." + adm.getProvCode(), counties); + } + // if (!counties.containsKey(adm.getCountyCode())) { + String countyid = adm.countryCode() + "." + adm.getProvCode() + "." + adm.countyCode(); + counties.put(countyid, adm.countyName()); + countyMap.put(adm.countryCode() + "." + adm.getProvCode(), counties); // } - } - } } } @@ -375,18 +373,17 @@ public class AdminBoundaryContextGenerator { } private void fillProvRegexMap() { - this.provinceRegexMap = new HashMap<>(); // this.adminBoundaryData for (AdminBoundary adm : adminBoundaryData) { if (provinceRegexMap.containsKey(adm.getProvCode())) { String currentRegex = provinceRegexMap.get(adm.getProvCode()); - if (currentRegex.length() > adm.getProvinceRegex().length()) { + if (currentRegex.length() > adm.provinceRegex().length()) { // the longest one wins if they are not all the same for each entry in the file provinceRegexMap.put(adm.getProvCode(), currentRegex); }//else do nothing } else { - provinceRegexMap.put(adm.getProvCode(), adm.getProvinceRegex()); + provinceRegexMap.put(adm.getProvCode(), adm.provinceRegex()); } } } @@ -396,14 +393,14 @@ public class AdminBoundaryContextGenerator { // this.adminBoundaryData for (AdminBoundary adm : adminBoundaryData) { - if (countyRegexMap.containsKey(adm.getCountyCode())) { - String currentRegex = countyRegexMap.get(adm.getCountyCode()); - if (currentRegex.length() > adm.getCountyRegex().length()) { + if (countyRegexMap.containsKey(adm.countyCode())) { + String currentRegex = countyRegexMap.get(adm.countyCode()); + if (currentRegex.length() > adm.countyRegex().length()) { // the longest one wins if they are not all the same for each entry in the file - countyRegexMap.put(adm.getCountyCode(), currentRegex); + countyRegexMap.put(adm.countyCode(), currentRegex); }//else do nothing } else { - countyRegexMap.put(adm.getCountyCode(), adm.getCountyRegex()); + countyRegexMap.put(adm.countyCode(), adm.countyRegex()); } } diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryContextEntry.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryContextEntry.java index 04039e9..d04073d 100644 --- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryContextEntry.java +++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryContextEntry.java @@ -15,104 +15,11 @@ */ package opennlp.addons.geoentitylinker; -import java.util.Objects; - /** - *Stores a tuple from the opennlp.geoentitylinker.countrycontext.txt file, which is used to find country mentions in document text. - * + * Stores a tuple from the {@code opennlp.geoentitylinker.countrycontext.txt} file. + * It is used to find country mentions in document text. */ -public class CountryContextEntry { - /* - * rc,cc1, full_name_nd_ro,dsg - */ - - private String rc; - private String cc1; - private String full_name_nd_ro; - private String dsg; - private String provCode; - public CountryContextEntry() { - } - - public CountryContextEntry(String rc, String cc1, String full_name_nd_ro, String dsg) { - this.rc = rc; - this.cc1 = cc1; - this.full_name_nd_ro = full_name_nd_ro; - this.dsg = dsg; - } - - public String getProvCode() { - return provCode; - } - - public void setProvCode(String provCode) { - this.provCode = provCode; - } - - public String getRc() { - return rc; - } - - public void setRc(String rc) { - this.rc = rc; - } - - public String getCc1() { - return cc1; - } - - public void setCc1(String cc1) { - this.cc1 = cc1; - } - - public String getFull_name_nd_ro() { - return full_name_nd_ro; - } - - public void setFull_name_nd_ro(String full_name_nd_ro) { - this.full_name_nd_ro = full_name_nd_ro; - } - - public String getDsg() { - return dsg; - } - - public void setDsg(String dsg) { - this.dsg = dsg; - } - - @Override - public int hashCode() { - int hash = 7; - hash = 17 * hash + Objects.hashCode(this.rc); - hash = 17 * hash + Objects.hashCode(this.cc1); - hash = 17 * hash + Objects.hashCode(this.full_name_nd_ro); - hash = 17 * hash + Objects.hashCode(this.dsg); - return hash; - } +public record CountryContextEntry(String rc, String cc1, String full_name_nd_ro, + String dsg, String provCode) { - @Override - public boolean equals(Object obj) { - if (obj == null) { - return false; - } - if (getClass() != obj.getClass()) { - return false; - } - final CountryContextEntry other = (CountryContextEntry) obj; - if (!Objects.equals(this.rc, other.rc)) { - return false; - } - if (!Objects.equals(this.cc1, other.cc1)) { - return false; - } - if (!Objects.equals(this.full_name_nd_ro, other.full_name_nd_ro)) { - return false; - } - if (!Objects.equals(this.dsg, other.dsg)) { - return false; - } - return true; - } - } diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerEntry.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerEntry.java index b7ef387..86fc0ea 100644 --- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerEntry.java +++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerEntry.java @@ -18,10 +18,11 @@ package opennlp.addons.geoentitylinker; import java.util.HashMap; import java.util.Map; import java.util.Objects; + import opennlp.tools.entitylinker.BaseLink; /** - * Stores a minimal amount of information from a geographic placenames gazetteer. + * Stores a minimal amount of information from a geographic place names gazetteer. */ public class GazetteerEntry extends BaseLink { @@ -29,10 +30,11 @@ public class GazetteerEntry extends BaseLink { private Double longitude; private String source; private String indexID; - private Map<String, String> indexData = new HashMap<>(); private String countryCode; private String provinceCode; private String hierarchy; + + private Map<String, String> indexData = new HashMap<>(); public GazetteerEntry(String parentID, String itemID, String itemName, String itemType) { super(parentID, itemID, itemName, itemType); @@ -44,9 +46,7 @@ public class GazetteerEntry extends BaseLink { public String getIndexID() { return indexID; } - /** - * sets the id from the lucene document - */ + public void setIndexID(String indexID) { this.indexID = indexID; } @@ -58,9 +58,6 @@ public class GazetteerEntry extends BaseLink { return latitude; } - /** - * sets the latitude from the gazetteer - */ public void setLatitude(Double latitude) { this.latitude = latitude; } @@ -72,11 +69,6 @@ public class GazetteerEntry extends BaseLink { return longitude; } - /** - * sets the longitude from the gaz - * - * @param longitude - */ public void setLongitude(Double longitude) { this.longitude = longitude; } @@ -88,11 +80,6 @@ public class GazetteerEntry extends BaseLink { return source; } - /** - * sets the source (the source of the gazetteer data) - * - * @param source - */ public void setSource(String source) { this.source = source; } @@ -113,10 +100,36 @@ public class GazetteerEntry extends BaseLink { this.indexData = indexData; } + public String getCountryCode() { + return countryCode; + } + + public void setCountryCode(String countryCode) { + this.countryCode = countryCode; + } + + public String getProvinceCode() { + return provinceCode; + } + + public void setProvinceCode(String provinceCode) { + this.provinceCode = provinceCode; + } + + public String getHierarchy() { + return hierarchy; + } + + public void setHierarchy(String hierarchy) { + this.hierarchy = hierarchy; + } + @Override public String toString() { - return super.toString() + "\n\t\tGazateerEntry\n" + "\t\tlatitude=" + latitude + ", \n\t\tlongitude=" + longitude + ", \n\t\tsource=" + source + ", \n\t\tindexID=" + indexID + ",\n\t\tindexData=" + indexData + "\n"; + return super.toString() + "\n\t\tGazateerEntry\n" + "\t\tlatitude=" + + latitude + ", \n\t\tlongitude=" + longitude + ", \n\t\tsource=" + + source + ", \n\t\tindexID=" + indexID + ",\n\t\tindexData=" + indexData + "\n"; } @Override @@ -144,30 +157,4 @@ public class GazetteerEntry extends BaseLink { } return true; } - - - public String getCountryCode() { - return countryCode; - } - - public void setCountryCode(String countryCode) { - this.countryCode = countryCode; - } - - public String getProvinceCode() { - return provinceCode; - } - - public void setProvinceCode(String provinceCode) { - this.provinceCode = provinceCode; - } - - public String getHierarchy() { - return hierarchy; - } - - public void setHierarchy(String hierarchy) { - this.hierarchy = hierarchy; - } - } diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearchCache.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearchCache.java index 7d13c81..9cdca41 100644 --- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearchCache.java +++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearchCache.java @@ -20,30 +20,28 @@ import java.util.HashMap; import java.util.Map; /** - * - * Caches gazateer query results statically. Clears itself if more than 10000 results are cached. + * Caches {@link GazetteerEntry gazetteer} query results statically. + * Clears itself if more than 10000 results are cached. */ public class GazetteerSearchCache { - private static Map<String, ArrayList<GazetteerEntry>> gazCache = new HashMap<>(); + private static final Map<String, ArrayList<GazetteerEntry>> CACHE = new HashMap<>(); -/** - * returns the cached entries. Returns null if the query does not exist in the cache - * @param searchString - * @return - */ + /** + * @param searchString The search string to check for matching entries. + * @return Retrieves cached entries. Returns {@code null} if the query does not exist in the cache. + */ public static synchronized ArrayList<GazetteerEntry> get(String searchString) { - return gazCache.get(searchString); + return CACHE.get(searchString); } public static synchronized void put(String searchString, ArrayList<GazetteerEntry> hits) { - if (gazCache.size() > 10000) { - gazCache.clear(); + if (CACHE.size() > 10000) { + CACHE.clear(); } - if (!gazCache.containsKey(searchString)) { - gazCache.put(searchString, hits); + if (!CACHE.containsKey(searchString)) { + CACHE.put(searchString, hits); } } - } diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java index 4f5ad5d..804432a 100644 --- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java +++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java @@ -52,8 +52,8 @@ import org.slf4j.LoggerFactory; */ public class GazetteerSearcher { - private final String REGEX_CLEAN = "[^\\p{L}\\p{Nd}]"; private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + private static final String REGEX_CLEAN = "[^\\p{L}\\p{Nd}]"; private double scoreCutoff = .70; private final boolean doubleQuoteAllSearchTerms = false; private boolean useHierarchyField = false; @@ -61,14 +61,13 @@ public class GazetteerSearcher { private final EntityLinkerProperties properties; private Directory opennlpIndex;//= new MMapDirectory(new File(indexloc)); - private IndexReader opennlpReader;// = DirectoryReader.open(geonamesIndex); private IndexSearcher opennlpSearcher;// = new IndexSearcher(geonamesReader); private Analyzer opennlpAnalyzer; public static void main(String[] args) { try { - boolean b = true; - new GazetteerSearcher(new EntityLinkerProperties(new File("c:\\temp\\entitylinker.properties"))).find("alabama", 5, " countrycode:us AND gazsource:usgs"); + new GazetteerSearcher(new EntityLinkerProperties(new File("c:\\temp\\entitylinker.properties"))) + .find("alabama", 5, " countrycode:us AND gazsource:usgs"); } catch (IOException ex) { LOG.error(ex.getLocalizedMessage(), ex); } @@ -84,11 +83,11 @@ public class GazetteerSearcher { * * @param searchString the location name to search for * @param rowsReturned how many index entries to return (top N...) - * @param whereClause the conditional statement that defines the index type - * and the country oode. + * @param whereClause the conditional statement that defines the i + * ndex type and the country code. * @return */ - public ArrayList<GazetteerEntry> find(String searchString, int rowsReturned, String whereClause) { + public List<GazetteerEntry> find(String searchString, int rowsReturned, String whereClause) { ArrayList<GazetteerEntry> linkedData = new ArrayList<>(); searchString = cleanInput(searchString); if (searchString.isEmpty()) { @@ -199,12 +198,13 @@ public class GazetteerSearcher { if (opennlpIndex == null) { String indexloc = properties.getProperty("opennlp.geoentitylinker.gaz", ""); - if (indexloc.equals("")) { + if (indexloc.isEmpty()) { LOG.error("Opennlp combined Gaz directory location not found!"); } opennlpIndex = new MMapDirectory(Paths.get(indexloc)); - opennlpReader = DirectoryReader.open(opennlpIndex); + // = DirectoryReader.open(geonamesIndex); + IndexReader opennlpReader = DirectoryReader.open(opennlpIndex); opennlpSearcher = new IndexSearcher(opennlpReader); opennlpAnalyzer = //new StandardAnalyzer(Version.LUCENE_48, new CharArraySet(Version.LUCENE_48, new ArrayList(), true)); @@ -221,7 +221,7 @@ public class GazetteerSearcher { = new PerFieldAnalyzerWrapper(opennlpAnalyzer, analyMap); String cutoff = properties.getProperty("opennlp.geoentitylinker.gaz.lucenescore.min", String.valueOf(scoreCutoff)); - String usehierarchy = properties.getProperty("opennlp.geoentitylinker.gaz.hierarchyfield", String.valueOf("0")); + String usehierarchy = properties.getProperty("opennlp.geoentitylinker.gaz.hierarchyfield", "0"); if (cutoff != null && !cutoff.isEmpty()) { scoreCutoff = Double.parseDouble(cutoff); } diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java index 7fa49be..769bbbc 100644 --- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java +++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java @@ -16,17 +16,17 @@ package opennlp.addons.geoentitylinker; import java.io.IOException; -import opennlp.addons.geoentitylinker.scoring.ModelBasedScorer; -import opennlp.addons.geoentitylinker.scoring.LinkedEntityScorer; -import opennlp.addons.geoentitylinker.scoring.CountryProximityScorer; -import opennlp.addons.geoentitylinker.scoring.GeoHashBinningScorer; -import opennlp.addons.geoentitylinker.scoring.FuzzyStringMatchScorer; import java.util.ArrayList; import java.util.Collections; import java.util.Iterator; import java.util.List; import java.util.Map; +import opennlp.addons.geoentitylinker.scoring.CountryProximityScorer; +import opennlp.addons.geoentitylinker.scoring.FuzzyStringMatchScorer; +import opennlp.addons.geoentitylinker.scoring.GeoHashBinningScorer; +import opennlp.addons.geoentitylinker.scoring.LinkedEntityScorer; +import opennlp.addons.geoentitylinker.scoring.ModelBasedScorer; import opennlp.addons.geoentitylinker.scoring.PlacetypeScorer; import opennlp.addons.geoentitylinker.scoring.ProvinceProximityScorer; import opennlp.tools.entitylinker.BaseLink; @@ -41,29 +41,24 @@ import opennlp.tools.entitylinker.EntityLinker; * indexes. The indexes can be built using the GeoEntityLinkerSetupUtils class * in this same package. */ -public class GeoEntityLinker implements EntityLinker<LinkedSpan> { +public class GeoEntityLinker implements EntityLinker<LinkedSpan<BaseLink>> { private static Integer topN = 2; private AdminBoundaryContextGenerator countryContext; private EntityLinkerProperties linkerProperties; - private GazetteerSearcher gazateerSearcher; - private final List<LinkedEntityScorer<AdminBoundaryContext>> scorers = new ArrayList<>(); + private GazetteerSearcher gazetteerSearcher; + private final List<LinkedEntityScorer<? extends BaseLink, AdminBoundaryContext>> scorers = new ArrayList<>(); @Override - public List<LinkedSpan> find(String doctext, Span[] sentences, Span[][] tokensBySentence, Span[][] namesBySentence) { - ArrayList<LinkedSpan> spans = new ArrayList<>(); - - if (linkerProperties == null) { - throw new IllegalArgumentException("EntityLinkerProperties cannot be null"); - } + public List<LinkedSpan<BaseLink>> find(String doctext, Span[] sentences, + Span[][] tokensBySentence, Span[][] namesBySentence) { + List<LinkedSpan<BaseLink>> spans = new ArrayList<>(); //countryMentions = countryContext.regexfind(doctext); AdminBoundaryContext context = countryContext.getContext(doctext); for (int s = 0; s < sentences.length; s++) { Span[] names = namesBySentence[s]; - Span[] tokenSpans = tokensBySentence[s]; String[] tokens = Span.spansToStrings(tokenSpans, sentences[s].getCoveredText(doctext)); - String[] matches = Span.spansToStrings(names, tokens); for (int i = 0; i < matches.length; i++) { @@ -71,16 +66,15 @@ public class GeoEntityLinker implements EntityLinker<LinkedSpan> { ArrayList<BaseLink> geoNamesEntries = new ArrayList<>(); if (!context.getWhereClauses().isEmpty()) { for (String whereclause : context.getWhereClauses()) { - ArrayList<GazetteerEntry> find = gazateerSearcher.find(matches[i], topN, whereclause); + List<GazetteerEntry> find = gazetteerSearcher.find(matches[i], topN, whereclause); for (GazetteerEntry gazetteerEntry : find) { if (!geoNamesEntries.contains(gazetteerEntry)) { geoNamesEntries.add(gazetteerEntry); } } - } - } else {//this means there were no where clauses generated so the where clause will default to look at the entire index - ArrayList<GazetteerEntry> find = gazateerSearcher.find(matches[i], topN, " gaztype:usgs geonames regions "); + } else { //this means there were no where clauses generated so the where clause will default to look at the entire index + List<GazetteerEntry> find = gazetteerSearcher.find(matches[i], topN, " gaztype:usgs geonames regions "); for (GazetteerEntry gazetteerEntry : find) { if (!geoNamesEntries.contains(gazetteerEntry)) { geoNamesEntries.add(gazetteerEntry); @@ -96,7 +90,7 @@ public class GeoEntityLinker implements EntityLinker<LinkedSpan> { */ if (!spans.isEmpty()) { - Double maxscore = 0d; + double maxscore = 0d; for (BaseLink gazetteerEntry : geoNamesEntries) { Double deNormScore = gazetteerEntry.getScoreMap().get("lucene"); if (deNormScore.compareTo(maxscore) > 0) { @@ -115,20 +109,22 @@ public class GeoEntityLinker implements EntityLinker<LinkedSpan> { newspan.setSentenceid(s); spans.add(newspan); } - } if (!scorers.isEmpty()) { - for (LinkedEntityScorer scorer : scorers) { - scorer.score(spans, doctext, sentences, linkerProperties, context); + for (LinkedEntityScorer<? extends BaseLink, AdminBoundaryContext> scorer : scorers) { + @SuppressWarnings("rawtypes") + LinkedEntityScorer<BaseLink, AdminBoundaryContext> s = (LinkedEntityScorer) scorer; + s.score(spans, doctext, sentences, linkerProperties, context); } } + /* * sort the data with the best score on top based on the sum of the scores - * below from the score map for each baselink object + * below from the score map for each BaseLink object */ - for (LinkedSpan<BaseLink> s : spans) { - ArrayList<BaseLink> linkedData = s.getLinkedEntries(); + for (LinkedSpan<? extends BaseLink> s : spans) { + ArrayList<? extends BaseLink> linkedData = s.getLinkedEntries(); linkedData.sort(Collections.reverseOrder((o1, o2) -> { Map<String, Double> o1scoreMap = o1.getScoreMap(); Map<String, Double> o2scoreMap = o2.getScoreMap(); @@ -149,11 +145,10 @@ public class GeoEntityLinker implements EntityLinker<LinkedSpan> { } } - return Double.compare(sumo1, - sumo2); + return Double.compare(sumo1, sumo2); })); //prune the list to topN - Iterator<BaseLink> iterator = linkedData.iterator(); + Iterator<? extends BaseLink> iterator = linkedData.iterator(); int n = 0; while (iterator.hasNext()) { if (n >= topN) { @@ -174,12 +169,11 @@ public class GeoEntityLinker implements EntityLinker<LinkedSpan> { * @param valueToNormalize the value to place within the new range * @param minimum the min of the set to be transposed * @param maximum the max of the set to be transposed - * @return + * @return The value of the normalized distance. */ private Double normalize(Double valueToNormalize, double minimum, double maximum) { double d = ((1 - 0) * (valueToNormalize - minimum)) / (maximum - minimum) + 0; - d = Double.isNaN(d) ? 0d : d; - return d; + return Double.isNaN(d) ? 0d : d; } private void loadScorers() { @@ -195,10 +189,12 @@ public class GeoEntityLinker implements EntityLinker<LinkedSpan> { @Override public void init(EntityLinkerProperties properties) throws IOException { - + if (properties == null) { + throw new IllegalArgumentException("EntityLinkerProperties cannot be null"); + } this.linkerProperties = properties; countryContext = new AdminBoundaryContextGenerator(this.linkerProperties); - gazateerSearcher = new GazetteerSearcher(this.linkerProperties); + gazetteerSearcher = new GazetteerSearcher(this.linkerProperties); String rowsRetStr = this.linkerProperties.getProperty("opennlp.geoentitylinker.gaz.rowsreturned", "2"); int rws; try { @@ -208,12 +204,11 @@ public class GeoEntityLinker implements EntityLinker<LinkedSpan> { } topN = rws; loadScorers(); - } @Override - public List<LinkedSpan> find(String doctext, Span[] sentences, Span[][] tokensBySentence, - Span[][] namesBySentence, int sentenceIndex) { + public List<LinkedSpan<BaseLink>> find(String doctext, Span[] sentences, Span[][] tokensBySentence, + Span[][] namesBySentence, int sentenceIndex) { throw new UnsupportedOperationException("The GeoEntityLinker requires the entire document " + "for proper scoring. This method is unsupported"); } diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GazetteerIndexer.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GazetteerIndexer.java index d41b6ff..d178a44 100644 --- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GazetteerIndexer.java +++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GazetteerIndexer.java @@ -39,42 +39,7 @@ import org.apache.lucene.store.MMapDirectory; */ public class GazetteerIndexer { - public static void main(String[] args) { - - if (args.length != 8) { - System.out.println("Usage: GazetteerIndexer geonamesData geoNamesCountryInfo geonamesAdmin1CodesASCII " - + "usgsDataFile usgsGovUnitsFile outputIndexDir outputCountryContextFile regionsFile"); - System.out.println(); - System.out.println("The GazetteerIndexer.index methods javadoc explains how to retrieve the data files."); - return; - } - - File geonamesData = new File(args[0]); - File geoNamesCountryInfo = new File(args[1]); - File geonamesAdmin1CodesASCII = new File(args[2]); - File usgsDataFile = new File(args[3]); - File usgsGovUnitsFile = new File(args[4]); - File outputIndexDir = new File(args[5]); - File outputCountryContextFile = new File(args[6]); - File regionsFile = new File(args[7]); - - try { - GazetteerIndexer i = new GazetteerIndexer(); - i.index(geonamesData, - geoNamesCountryInfo, - geonamesAdmin1CodesASCII, - usgsDataFile, - usgsGovUnitsFile, - outputIndexDir, - outputCountryContextFile, - regionsFile); - } catch (Exception ex) { - ex.printStackTrace(); - } - } - public GazetteerIndexer() { - } public interface Separable { @@ -83,7 +48,6 @@ public class GazetteerIndexer { } public enum GazType implements Separable { - GEONAMES { @Override public String toString() { @@ -144,11 +108,13 @@ public class GazetteerIndexer { * format: tab delimited text with index 0 as the name of the region, index 1 * as the longitude, and index 2 as the latitude * - * @throws Exception + * @throws IOException Thrown if IO errors occurred. + * @throws FileNotFoundException Thrown if required resources do not exist. + * @throws IllegalArgumentException Thrown if parameters are invalid. */ public void index(File geonamesData, File geoNamesCountryInfo, File geonamesAdmin1CodesASCII, File usgsDataFile, File usgsGovUnitsFile, File outputIndexDir, File outputCountryContextFile, File regionsFile) - throws Exception { + throws IOException { if (!outputIndexDir.isDirectory()) { throw new IllegalArgumentException("outputIndexDir must be a directory."); } @@ -161,7 +127,6 @@ public class GazetteerIndexer { if (!geonamesAdmin1CodesASCII.exists()) { throw new FileNotFoundException("geonamesAdmin1CodesASCII data file does not exist"); } - if (!usgsDataFile.exists()) { throw new FileNotFoundException("usgsDataFile data file does not exist"); } @@ -190,15 +155,13 @@ public class GazetteerIndexer { IndexWriterConfig config = new IndexWriterConfig(aWrapper); try (IndexWriter w = new IndexWriter(index, config)) { //write the column headers for the countryContextFile - try (FileWriter countryContextFileWriter = new FileWriter(outputCountryContextFile, false)) { + try (FileWriter writer = new FileWriter(outputCountryContextFile, false)) { String colNamesForCountryContextFile = "countrycode\tprovcode\tcountycode\tcountryname\tprovincename\tcountyname\tcountryregex\tprovregex\tcountyregex\n"; - countryContextFileWriter.write(colNamesForCountryContextFile); - countryContextFileWriter.flush(); + writer.write(colNamesForCountryContextFile); + writer.flush(); } - USGSProcessor.process(usgsGovUnitsFile, usgsDataFile, outputCountryContextFile, w); GeonamesProcessor.process(geoNamesCountryInfo, geonamesAdmin1CodesASCII, geonamesData, outputCountryContextFile, w); - RegionProcessor.process(regionsFile, outputCountryContextFile, w); w.commit(); } @@ -207,4 +170,38 @@ public class GazetteerIndexer { outputCountryContextFile.getPath() + "' to entitylinker.properties file"); } + + public static void main(String[] args) { + + if (args.length != 8) { + System.out.println("Usage: GazetteerIndexer geonamesData geoNamesCountryInfo geonamesAdmin1CodesASCII " + + "usgsDataFile usgsGovUnitsFile outputIndexDir outputCountryContextFile regionsFile"); + System.out.println(); + System.out.println("The GazetteerIndexer.index methods javadoc explains how to retrieve the data files."); + return; + } + + File geonamesData = new File(args[0]); + File geoNamesCountryInfo = new File(args[1]); + File geonamesAdmin1CodesASCII = new File(args[2]); + File usgsDataFile = new File(args[3]); + File usgsGovUnitsFile = new File(args[4]); + File outputIndexDir = new File(args[5]); + File outputCountryContextFile = new File(args[6]); + File regionsFile = new File(args[7]); + + try { + GazetteerIndexer i = new GazetteerIndexer(); + i.index(geonamesData, + geoNamesCountryInfo, + geonamesAdmin1CodesASCII, + usgsDataFile, + usgsGovUnitsFile, + outputIndexDir, + outputCountryContextFile, + regionsFile); + } catch (Exception ex) { + ex.printStackTrace(); + } + } } diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeonamesFileDownloader.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeonamesFileDownloader.java index 58d680c..df010ed 100644 --- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeonamesFileDownloader.java +++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeonamesFileDownloader.java @@ -38,7 +38,6 @@ public class GeonamesFileDownloader { public static void downloadGeonamesFiles(String outputFileName, String outputDir) { String fileDownload = fileDownload(ALL_COUNTRIES, outputDir); - unzipMyZip(fileDownload, outputDir); fileDownload(COUNTRY_INFO, outputDir); @@ -46,19 +45,6 @@ public class GeonamesFileDownloader { } - public static void writeFile(InputStream in, OutputStream out) - throws IOException { - byte[] buffer = new byte[1024]; - int len; - - while ((len = in.read(buffer)) != 0) { - out.write(buffer, 0, len); - } - - in.close(); - out.close(); - } - public static void unzipMyZip(String zipFileName, String directoryToExtractTo) { Enumeration<? extends ZipEntry> entriesEnum; ZipFile zip; diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeonamesProcessor.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeonamesProcessor.java index c58c92d..0553e3c 100644 --- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeonamesProcessor.java +++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeonamesProcessor.java @@ -21,52 +21,58 @@ import java.io.File; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; -import java.util.Arrays; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; -import opennlp.addons.geoentitylinker.AdminBoundary; + import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.StringField; import org.apache.lucene.document.TextField; import org.apache.lucene.index.IndexWriter; +import opennlp.addons.geoentitylinker.AdminBoundary; + public class GeonamesProcessor { - public static void process(File countryCodesLookupFile, File adm1CodesLookupFile, File geonamesGazetteerFile, File outputCountryContextFile, IndexWriter w) throws Exception { - Map<String, String> countryCodes = getCountryCodes(countryCodesLookupFile); + private static final String REGEX_NUMBERS = "[0-9].*"; + private static final String TAB = "\t"; + + private static final String[] BOOSTS = ( + "ADM1 ADM1H ADM2 ADM2H ADM3 ADM3H ADM4 ADM4H ADM5 ADMD ADMDH " + + "PCLD PCLH PCLI PCLIX TERR PCLIX PPL PPLA PPLA2 PPLA3 PPLA4 " + + "PPLC PPLCH PPLF PPLG PPLH PPLL PPLQ PPLR PPLS PPLX STLMT").split(" "); - Map<String, AdminBoundary> adm1s = getProvData(adm1CodesLookupFile, countryCodes); + public GeonamesProcessor() { + } + + public static void process(File countryCodesLookup, File adm1CodesLookup, + File geonamesGazetteer, File outputCountryContext, IndexWriter w) throws IOException { + Map<String, String> countryCodes = getCountryCodes(countryCodesLookup); + + Map<String, AdminBoundary> adm1s = getProvData(adm1CodesLookup, countryCodes); // List<AdminBoundary> adm2s = getCountryContextFromFile(new File("C:\\temp\\gazetteers\\geonamesdata\\admin2Codes.txt")); //admin2Codes.txt - readFile(geonamesGazetteerFile, GazetteerIndexer.GazType.GEONAMES, adm1s, countryCodes, w); - //now append to the coutnry context file - writeCountryContextFile(outputCountryContextFile, adm1s); + readFile(geonamesGazetteer, GazetteerIndexer.GazType.GEONAMES, adm1s, countryCodes, w); + //now append to the country context file + writeCountryContextFile(outputCountryContext, adm1s); } - public GeonamesProcessor() { - } - private static Map<String, AdminBoundary> getProvData(File adm1CodesLookupFile, Map<String, String> ccodes) { System.out.println("Attempting to read geonames province data from: " + adm1CodesLookupFile.getPath()); Map<String, AdminBoundary> outmap = new HashMap<>(); - BufferedReader reader; Set<String> nullcodes = new HashSet<>(); - try { - - reader = new BufferedReader(new FileReader(adm1CodesLookupFile)); - int i = 0; + try (BufferedReader reader = new BufferedReader(new FileReader(adm1CodesLookupFile))){ String line; while ((line = reader.readLine()) != null) { // String line = reader.readLine(); - String[] values = line.split("\t"); + String[] values = line.split(TAB); if (values.length != 4) { throw new IOException("improperly formatted province lookup file"); } @@ -82,7 +88,7 @@ public class GeonamesProcessor { String pname = values[2]; - if (ccode.matches("[0-9].*")) { + if (ccode.matches(REGEX_NUMBERS)) { String code = ccode; ccode = pcode; pcode = code; @@ -99,7 +105,6 @@ public class GeonamesProcessor { } System.out.println("INFO: there were " + nullcodes.size() + " null prov codes. This is due to inconsistencies in reference data."); - reader.close(); } catch (IOException ex) { ex.printStackTrace(); } @@ -111,30 +116,24 @@ public class GeonamesProcessor { private static Map<String, String> getCountryCodes(File countryContextFile) { Map<String, String> ccs = new HashMap<>(); - BufferedReader reader; - try { - - reader = new BufferedReader(new FileReader(countryContextFile)); - int i = 0; + try (BufferedReader reader = new BufferedReader(new FileReader(countryContextFile))){ String line; boolean start = false; while ((line = reader.readLine()) != null) { if (!line.toLowerCase().startsWith("#iso\t") && !start) { - continue; } else { start = true; } - String[] values = line.split("\t"); + String[] values = line.split(TAB); String ccode = values[0].toLowerCase();//this is the 2 digit ISO code String cname = values[4].toLowerCase(); - if (!ccode.equals("")) { + if (!ccode.isEmpty()) { ccs.put(ccode, cname); } } - reader.close(); } catch (IOException ex) { ex.printStackTrace(); } @@ -145,23 +144,18 @@ public class GeonamesProcessor { } public static void writeCountryContextFile(File outfile, Map<String, AdminBoundary> adms) { - // FileWriter writer = null; - try (FileWriter writer = new FileWriter(outfile, true)) { - BufferedWriter bw = new BufferedWriter(writer); - + try (BufferedWriter bw = new BufferedWriter(new FileWriter(outfile, true))) { for (String admKey : adms.keySet()) { AdminBoundary adm = adms.get(admKey); if (adm == null) { continue; } - String province = adm.getProvinceName(); - String country = adm.getCountryName(); + String province = adm.provinceName(); + String country = adm.countryName(); - String line = adm.getCountryCode() + "\t" + adm.getProvCode() + "\t" + "\t" + country + "\t" + province + "\t" + "\t" + "(" + country + ")" + "\t" - + adm.getProvinceName() + "\t" + adm.getCountyName() + "\n"; + String line = adm.countryCode() + TAB + adm.getProvCode() + TAB + TAB + country + TAB + province + + TAB + TAB + "(" + country + ")" + TAB + adm.provinceName() + TAB + adm.countyName() + "\n"; bw.write(line); - // System.out.println(line); - } } catch (IOException ex) { @@ -177,119 +171,104 @@ public class GeonamesProcessor { * @param adms the province info * @param countrycodes the country code info * @param w the lucene index writer - * @throws Exception + * + * @throws IOException Thrown if IO errors occurred. */ - public static void readFile(File gazateerInputData, GazetteerIndexer.GazType type, Map<String, AdminBoundary> adms, Map<String, String> countrycodes, IndexWriter w) throws Exception { + public static void readFile(File gazateerInputData, GazetteerIndexer.GazType type, + Map<String, AdminBoundary> adms, Map<String, String> countrycodes, + IndexWriter w) throws IOException { - BufferedReader reader = new BufferedReader(new FileReader(gazateerInputData)); - String[] boosts = "ADM1 ADM1H ADM2 ADM2H ADM3 ADM3H ADM4 ADM4H ADM5 ADMD ADMDH PCLD PCLH PCLI PCLIX TERR PCLIX PPL PPLA PPLA2 PPLA3 PPLA4 PPLC PPLCH PPLF PPLG PPLH PPLL PPLQ PPLR PPLS PPLX STLMT".split(" "); - Map<String, Float> boostMap = new HashMap<>(); - for (String boost : boosts) { + final Map<String, Float> boostMap = new HashMap<>(); + for (String boost : BOOSTS) { boostMap.put(boost.toLowerCase(), 10f); } - String[] fieldStrings = new String[]{ - "geonameid", - "name", - "asciiname", - "alternatenames", - "latitude", - "longitude", - "feature_class", - "feature_code", - "country code", - "cc2", - "admin1_code", - "admin2_code", - "admin3_code", - "admin4_code", - "population", - "elevation", - "dem ", - "timezone", - "modification_date"}; - - List<String> fields = Arrays.asList(fieldStrings); + final List<String> fields = List.of("geonameid", "name", "asciiname", "alternatenames", + "latitude", "longitude", "feature_class", "feature_code", "country code", "cc2", + "admin1_code", "admin2_code", "admin3_code", "admin4_code", "population", + "elevation", "dem ", "timezone", "modification_date"); + int counter = 0; System.out.println("reading gazetteer data from file..........."); String line; - while ((line = reader.readLine()) != null) { - String[] values = line.split(type.getSeparator()); - - Document doc = new Document(); - String admincode = values[10].toLowerCase(); - String ccode = values[8].toLowerCase(); - if (ccode.contains(",")) { - String[] codes = ccode.split(","); - if (codes.length > 0) { - ccode = codes[0]; + + try (BufferedReader reader = new BufferedReader(new FileReader(gazateerInputData))) { + while ((line = reader.readLine()) != null) { + String[] values = line.split(type.getSeparator()); + + Document doc = new Document(); + String admincode = values[10].toLowerCase(); + String ccode = values[8].toLowerCase(); + if (ccode.contains(",")) { + String[] codes = ccode.split(","); + if (codes.length > 0) { + ccode = codes[0]; + } } - } - AdminBoundary adm = adms.get(ccode + "." + admincode); - - String placeName = values[2]; - String lat = values[4]; - String lon = values[5]; - String dsg = values[7].toLowerCase(); - - String id = values[0]; - String concatIndexEntry; - String countryname; - if (adm != null) { - concatIndexEntry = adm.getCountryName() + ", " + adm.getProvinceName() + ", " + placeName; - countryname = adm.getCountryName(); - } else { - //there is no admin info, but we can still use the countrycode to concat the country name - String n = countrycodes.get(ccode); - countryname = n; - if (n != null) { - concatIndexEntry = n + ", " + placeName; + AdminBoundary adm = adms.get(ccode + "." + admincode); + + String placeName = values[2]; + String lat = values[4]; + String lon = values[5]; + String dsg = values[7].toLowerCase(); + + String id = values[0]; + String concatIndexEntry; + String countryname; + if (adm != null) { + concatIndexEntry = adm.countryName() + ", " + adm.provinceName() + ", " + placeName; + countryname = adm.countryName(); } else { - ///don't want a single token hierarchy entry. - concatIndexEntry = ""; + //there is no admin info, but we can still use the countrycode to concat the country name + String n = countrycodes.get(ccode); + countryname = n; + if (n != null) { + concatIndexEntry = n + ", " + placeName; + } else { + ///don't want a single token hierarchy entry. + concatIndexEntry = ""; + } } - } - if (ccode == null) { - System.out.println("naughty country code"); - } - for (int i = 0; i < fields.size() - 1; i++) { - doc.add(new TextField(fields.get(i), values[i].trim(), Field.Store.YES)); + if (ccode == null) { + System.out.println("naughty country code"); + } + for (int i = 0; i < fields.size() - 1; i++) { + doc.add(new TextField(fields.get(i), values[i].trim(), Field.Store.YES)); - } - if (dsg.equals("pcli")) { - System.out.println("placename: " + placeName + " RESET TO: " + countryname); - placeName = countryname; - } - /* - * add standard fields to the index - */ - doc.add(new TextField("hierarchy", concatIndexEntry, Field.Store.YES)); - doc.add(new TextField("placename", placeName, Field.Store.YES)); - // doc.add(new TextField("countryname", countryname, Field.Store.YES)); - //System.out.println(placeName); - - doc.add(new TextField("latitude", lat, Field.Store.YES)); - doc.add(new TextField("longitude", lon, Field.Store.YES)); - doc.add(new StringField("loctype", dsg, Field.Store.YES)); - doc.add(new StringField("admincode", (ccode + "." + admincode).toLowerCase(), Field.Store.YES)); - doc.add(new StringField("countrycode", ccode.toLowerCase(), Field.Store.YES)); - doc.add(new StringField("countycode", "", Field.Store.YES)); - doc.add(new StringField("locid", id, Field.Store.YES)); - placeName = placeName.replace("republic of", "").replace("federative", ""); - if (id.equals("3175395")) { - System.out.println(placeName); - } - doc.add(new StringField("gazsource", "geonames", Field.Store.YES)); + } + if (dsg.equals("pcli")) { + System.out.println("placename: " + placeName + " RESET TO: " + countryname); + placeName = countryname; + } + /* + * add standard fields to the index + */ + doc.add(new TextField("hierarchy", concatIndexEntry, Field.Store.YES)); + doc.add(new TextField("placename", placeName, Field.Store.YES)); + // doc.add(new TextField("countryname", countryname, Field.Store.YES)); + //System.out.println(placeName); + + doc.add(new TextField("latitude", lat, Field.Store.YES)); + doc.add(new TextField("longitude", lon, Field.Store.YES)); + doc.add(new StringField("loctype", dsg, Field.Store.YES)); + doc.add(new StringField("admincode", (ccode + "." + admincode).toLowerCase(), Field.Store.YES)); + doc.add(new StringField("countrycode", ccode.toLowerCase(), Field.Store.YES)); + doc.add(new StringField("countycode", "", Field.Store.YES)); + doc.add(new StringField("locid", id, Field.Store.YES)); + placeName = placeName.replace("republic of", "").replace("federative", ""); + if (id.equals("3175395")) { + System.out.println(placeName); + } + doc.add(new StringField("gazsource", "geonames", Field.Store.YES)); - w.addDocument(doc); + w.addDocument(doc); - counter++; - if (counter % 100000 == 0) { - w.commit(); - System.out.println(counter + " .........Geonames entries committed to index.............."); + counter++; + if (counter % 100000 == 0) { + w.commit(); + System.out.println(counter + " .........Geonames entries committed to index.............."); + } } - } - System.out.println("Completed indexing geonames gaz! index name is: " + type.toString()); } diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/RegionProcessor.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/RegionProcessor.java index c6b2d0c..5335c2b 100644 --- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/RegionProcessor.java +++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/RegionProcessor.java @@ -20,8 +20,10 @@ import java.io.BufferedWriter; import java.io.File; import java.io.FileReader; import java.io.FileWriter; +import java.io.IOException; import java.util.ArrayList; import java.util.List; + import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.StringField; @@ -31,13 +33,15 @@ import org.apache.lucene.index.IndexWriter; public class RegionProcessor { public static void main(String[] args) { - RegionProcessor.process(new File("C:\\temp\\gazetteers\\regions.txt"), new File("C:\\temp\\gazetteers\\testRegionContext.txt"), null); + RegionProcessor.process( + new File("C:\\temp\\gazetteers\\regions.txt"), + new File("C:\\temp\\gazetteers\\testRegionContext.txt"), null); } /** * * @param regionsFile the file that stores Region references. the format of - * this file is tab delimitted text with index 0 as the name of the region, + * this file is tab delimited text with index 0 as the name of the region, * index 1 as the longitude, and index 2 as the latitude * @param outputCountryContextfile this is the country context files shared by * all indexing processors @@ -51,64 +55,65 @@ public class RegionProcessor { } } - public static void readFile(File gazateerInputData, File outputCountryContextfile, IndexWriter w) throws Exception { + public static void readFile(File gazateerInputData, File outputCountryContextfile, + IndexWriter w) throws IOException { List<String> ccfileentries = new ArrayList<>(); - BufferedReader reader = new BufferedReader(new FileReader(gazateerInputData)); List<String> fields = new ArrayList<>(); int counter = 0; System.out.println("reading gazetteer data from Regions file..........."); String line; - while ((line = reader.readLine()) != null) { - - String[] values = line.split("\t"); - if (counter == 0) { + try (BufferedReader reader = new BufferedReader(new FileReader(gazateerInputData))) { + while ((line = reader.readLine()) != null) { + String[] values = line.split("\t"); + if (counter == 0) { - } else { - Document doc = new Document(); - for (int i = 0; i < fields.size() - 1; i++) { - doc.add(new TextField(fields.get(i), values[i].trim(), Field.Store.YES)); - } - String placeName = values[0]; - String lat = values[2]; - String lon = values[1]; - String dsg = "region"; - String id = "rg" + counter; + } else { + Document doc = new Document(); + for (int i = 0; i < fields.size() - 1; i++) { + doc.add(new TextField(fields.get(i), values[i].trim(), Field.Store.YES)); + } + String placeName = values[0]; + String lat = values[2]; + String lon = values[1]; + String dsg = "region"; + String id = "rg" + counter; - String hierarchy = placeName; + String hierarchy = placeName; - doc.add(new TextField("hierarchy", hierarchy, Field.Store.YES)); - doc.add(new TextField("placename", placeName, Field.Store.YES)); - doc.add(new StringField("latitude", lat, Field.Store.YES)); - doc.add(new StringField("longitude", lon, Field.Store.YES)); - doc.add(new StringField("loctype", dsg, Field.Store.YES)); - doc.add(new StringField("admincode", "", Field.Store.YES)); - doc.add(new StringField("countrycode", id, Field.Store.YES)); - doc.add(new StringField("countycode", "", Field.Store.YES)); + doc.add(new TextField("hierarchy", hierarchy, Field.Store.YES)); + doc.add(new TextField("placename", placeName, Field.Store.YES)); + doc.add(new StringField("latitude", lat, Field.Store.YES)); + doc.add(new StringField("longitude", lon, Field.Store.YES)); + doc.add(new StringField("loctype", dsg, Field.Store.YES)); + doc.add(new StringField("admincode", "", Field.Store.YES)); + doc.add(new StringField("countrycode", id, Field.Store.YES)); + doc.add(new StringField("countycode", "", Field.Store.YES)); - doc.add(new StringField("locid", id, Field.Store.YES)); - doc.add(new StringField("gazsource", "region", Field.Store.YES)); - //countrycontext file format - // US KY 131 United States Kentucky Leslie + doc.add(new StringField("locid", id, Field.Store.YES)); + doc.add(new StringField("gazsource", "region", Field.Store.YES)); + //countrycontext file format + // US KY 131 United States Kentucky Leslie - ccfileentries.add(id + "\t" + id + "\t" + id + "\t" + placeName + "\t" + "NO_DATA_FOUND" + "\t" + "NO_DATA_FOUND" + "\t" + "(" - + placeName + ")" + "\t" + "NO_DATA_FOUND" + "\t" + "NO_DATA_FOUND" + "\n"); - if (w != null) { - w.addDocument(doc); + ccfileentries.add(id + "\t" + id + "\t" + id + "\t" + placeName + "\t" + "NO_DATA_FOUND" + "\t" + "NO_DATA_FOUND" + "\t" + "(" + + placeName + ")" + "\t" + "NO_DATA_FOUND" + "\t" + "NO_DATA_FOUND" + "\n"); + if (w != null) { + w.addDocument(doc); + } } - } - counter++; + counter++; + } + if (w != null) { + w.commit(); + } } - if (w != null) { - w.commit(); - } - FileWriter writer = new FileWriter(outputCountryContextfile, true); - BufferedWriter bw = new BufferedWriter(writer); - for (String string : ccfileentries) { - bw.write(string); + + try (BufferedWriter bw = new BufferedWriter(new FileWriter(outputCountryContextfile, true))) { + for (String string : ccfileentries) { + bw.write(string); + } + System.out.println("successfully wrote Region entries to country oontext file"); } - System.out.println("successfully wrote Region entries to country oontext file"); - bw.close(); System.out.println("Completed indexing regions!"); } diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/USGSProcessor.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/USGSProcessor.java index 46e6200..ac1e91f 100644 --- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/USGSProcessor.java +++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/USGSProcessor.java @@ -27,16 +27,19 @@ import java.util.List; import java.util.Map; import java.util.logging.Level; import java.util.logging.Logger; -import opennlp.addons.geoentitylinker.AdminBoundary; + import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.StringField; import org.apache.lucene.document.TextField; - import org.apache.lucene.index.IndexWriter; +import opennlp.addons.geoentitylinker.AdminBoundary; + public class USGSProcessor { + private static final String TAB = "\t"; + public static void main(String[] args) { try { Map<String, AdminBoundary> provData = getProvData(new File("C:\\temp\\gazetteers\\usgsdata\\GOVT_UNITS_20140601.txt"), GazetteerIndexer.GazType.USGS); @@ -46,13 +49,15 @@ public class USGSProcessor { } } - public static void process(File lookupData, File usgsGazDataFile, File outputCountryContextfile, IndexWriter w) throws Exception { + public static void process(File lookupData, File usgsGazDataFile, + File outputCountryContextfile, IndexWriter w) throws IOException { Map<String, AdminBoundary> provData = getProvData(lookupData, GazetteerIndexer.GazType.USGS); readFile(usgsGazDataFile, w, GazetteerIndexer.GazType.USGS, provData); writeCountryContextFile(outputCountryContextfile, provData); } - public static void readFile(File gazateerInputData, IndexWriter w, GazetteerIndexer.GazType type, Map<String, AdminBoundary> lookupMap) throws Exception { + public static void readFile(File gazateerInputData, IndexWriter w, GazetteerIndexer.GazType type, + Map<String, AdminBoundary> lookupMap) throws IOException { Map<String, StateCentroid> states = new HashMap<>(); try (BufferedReader reader = new BufferedReader(new FileReader(gazateerInputData))) { @@ -89,18 +94,18 @@ public class USGSProcessor { continue; } - String countyCode = get.getCountyCode(); + String countyCode = get.countyCode(); - if (!get.getCountyName().equals("NO_DATA_FOUND_VALUE")) { - countyname = get.getCountyName(); + if (!get.countyName().equals("NO_DATA_FOUND_VALUE")) { + countyname = get.countyName(); } - if (!get.getCountyCode().equals("NO_DATA_FOUND_VALUE")) { - countyCode = get.getCountyCode(); + if (!get.countyCode().equals("NO_DATA_FOUND_VALUE")) { + countyCode = get.countyCode(); } - String hierarchy = get.getCountryName() + ", " + get.getProvinceName() + ", " + countyname + ", " + placeName; + String hierarchy = get.countryName() + ", " + get.provinceName() + ", " + countyname + ", " + placeName; - if (states.containsKey(get.getProvinceName())) { - StateCentroid entry = states.get(get.getProvinceName()); + if (states.containsKey(get.provinceName())) { + StateCentroid entry = states.get(get.provinceName()); entry.count++; entry.latSum += Double.parseDouble(lat); entry.longSum += Double.parseDouble(lon); @@ -110,7 +115,7 @@ public class USGSProcessor { centroid.count = 1; centroid.latSum = Double.parseDouble(lat); centroid.longSum = Double.parseDouble(lon); - states.put(get.getProvinceName(), centroid); + states.put(get.provinceName(), centroid); } doc.add(new TextField("hierarchy", hierarchy, Field.Store.YES)); @@ -118,9 +123,9 @@ public class USGSProcessor { doc.add(new TextField("latitude", lat, Field.Store.YES)); doc.add(new TextField("longitude", lon, Field.Store.YES)); doc.add(new StringField("loctype", dsg, Field.Store.YES)); - doc.add(new StringField("admincode", (get.getCountryCode() + "." + get.getProvCode()).toLowerCase(), Field.Store.YES)); - doc.add(new StringField("countrycode", get.getCountryCode().toLowerCase(), Field.Store.YES)); - doc.add(new StringField("countycode", (get.getCountryCode() + "." + get.getProvCode() + "." + countyCode).toLowerCase(), Field.Store.YES)); + doc.add(new StringField("admincode", (get.countryCode() + "." + get.getProvCode()).toLowerCase(), Field.Store.YES)); + doc.add(new StringField("countrycode", get.countryCode().toLowerCase(), Field.Store.YES)); + doc.add(new StringField("countycode", (get.countryCode() + "." + get.getProvCode() + "." + countyCode).toLowerCase(), Field.Store.YES)); doc.add(new StringField("locid", id, Field.Store.YES)); doc.add(new StringField("gazsource", "usgs", Field.Store.YES)); @@ -231,16 +236,18 @@ public class USGSProcessor { if (adm == null) { continue; } - String province = adm.getProvinceName(); - String country = adm.getCountryName(); + String province = adm.provinceName(); + String country = adm.countryName(); /* * this is the standard format of the country context file... Geonames * data will have an empty string for the county */ - String line = adm.getCountryCode() + "\t" + adm.getProvCode() + "\t" + adm.getCountyCode() + "\t" + country + "\t" + province + "\t" + adm.getCountyName() + "\t" - + "(U\\.S\\.[ $]|U\\.S\\.A\\.[ $]|United States|the US[ $]|a us[ $])" + "\t" + adm.getProvinceName() + "\t" + adm.getCountyName() + "\n"; + String line = adm.countryCode() + TAB + adm.getProvCode() + TAB + adm.countyCode() + + TAB + country + TAB + province + TAB + adm.countyName() + TAB + + "(U\\.S\\.[ $]|U\\.S\\.A\\.[ $]|United States|the US[ $]|a us[ $])" + + TAB + adm.provinceName() + TAB + adm.countyName() + "\n"; bw.write(line); - /// System.out.println(line); + // System.out.println(line); } } catch (IOException ex) { Logger.getLogger(GeonamesProcessor.class.getName()).log(Level.SEVERE, null, ex); diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/CountryProximityScorer.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/CountryProximityScorer.java index b706c5f..726c809 100644 --- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/CountryProximityScorer.java +++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/CountryProximityScorer.java @@ -23,6 +23,7 @@ import java.util.Map; import java.util.Set; import java.util.TreeSet; import java.util.regex.Pattern; + import opennlp.addons.geoentitylinker.AdminBoundaryContext; import opennlp.tools.entitylinker.EntityLinkerProperties; import opennlp.tools.entitylinker.BaseLink; @@ -36,18 +37,16 @@ import opennlp.tools.util.Span; * indicator of Germany, it is more likely to be Berlin Germany than Berlin * Connecticut (if Connecticut is mentioned further down in the article). */ -public class CountryProximityScorer implements LinkedEntityScorer<AdminBoundaryContext> { +public class CountryProximityScorer implements LinkedEntityScorer<BaseLink, AdminBoundaryContext> { private Map<String, Set<String>> nameCodesMap; String dominantCode = ""; private Map<String, String> regexMap = new HashMap<>(); @Override - public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, AdminBoundaryContext additionalContext) { - + public void score(List<LinkedSpan<BaseLink>> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, AdminBoundaryContext additionalContext) { regexMap = additionalContext.getCountryRegexMap(); score(linkedSpans, additionalContext.getCountryMentions(), additionalContext.getNameCodesMap(), docText, sentenceSpans, 1000); - } /** @@ -70,7 +69,7 @@ public class CountryProximityScorer implements LinkedEntityScorer<AdminBoundaryC * Named Entity. * @return */ - public List<LinkedSpan> score(List<LinkedSpan> linkedData, Map<String, Set<Integer>> countryHits, Map<String, Set<String>> nameCodesMap, String docText, Span[] sentences, Integer maxAllowedDist) { + public List<LinkedSpan<BaseLink>> score(List<LinkedSpan<BaseLink>> linkedData, Map<String, Set<Integer>> countryHits, Map<String, Set<String>> nameCodesMap, String docText, Span[] sentences, Integer maxAllowedDist) { this.nameCodesMap = nameCodesMap; setDominantCode(countryHits); for (LinkedSpan<BaseLink> linkedspan : linkedData) { @@ -254,10 +253,9 @@ public class CountryProximityScorer implements LinkedEntityScorer<AdminBoundaryC for (double d : windowOfAverages) { sum += d; } - double result = sum / windowOfAverages.size(); //TODO: ++ prob when large amounts of mentions for a code //System.out.println("avg of window:" + result); - return result; + return sum / windowOfAverages.size(); } /** @@ -274,4 +272,5 @@ public class CountryProximityScorer implements LinkedEntityScorer<AdminBoundaryC d = d == null ? 0d : d; return d; } + } diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/FuzzyStringMatchScorer.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/FuzzyStringMatchScorer.java index 5367122..a9a7e3e 100644 --- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/FuzzyStringMatchScorer.java +++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/FuzzyStringMatchScorer.java @@ -19,6 +19,7 @@ import java.util.ArrayList; import java.util.HashSet; import java.util.List; import java.util.Set; + import opennlp.addons.geoentitylinker.AdminBoundaryContext; import opennlp.addons.geoentitylinker.GazetteerEntry; import opennlp.tools.entitylinker.EntityLinkerProperties; @@ -30,15 +31,14 @@ import opennlp.tools.util.Span; * * Generates scores based on string comparisons levenstein and dice */ -public class FuzzyStringMatchScorer implements LinkedEntityScorer<AdminBoundaryContext> { +public class FuzzyStringMatchScorer implements LinkedEntityScorer<GazetteerEntry, AdminBoundaryContext> { @Override - public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, AdminBoundaryContext additionalContext) { + public void score(List<LinkedSpan<GazetteerEntry>> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, AdminBoundaryContext additionalContext) { - for (LinkedSpan<BaseLink> linkedSpan : linkedSpans) { + for (LinkedSpan<GazetteerEntry> linkedSpan : linkedSpans) { for (BaseLink link : linkedSpan.getLinkedEntries()) { - if (link instanceof GazetteerEntry) { - GazetteerEntry entry = (GazetteerEntry) link; + if (link instanceof GazetteerEntry entry) { String hierarchy = entry.getHierarchy(); if (hierarchy != null) { Double dice = getDiceCoefficient(linkedSpan.getSearchTerm().toLowerCase(), hierarchy.toLowerCase(), 2); @@ -50,12 +50,10 @@ public class FuzzyStringMatchScorer implements LinkedEntityScorer<AdminBoundaryC if (placename != null) { Double dice = getDiceCoefficient(linkedSpan.getSearchTerm().toLowerCase(), placename, 2); link.getScoreMap().put("placenamedicecoef", dice); - } } } } - } /** @@ -75,14 +73,14 @@ public class FuzzyStringMatchScorer implements LinkedEntityScorer<AdminBoundaryC List<String> s2Grams = new ArrayList<>(); String[] split1 = s1.split("[ ,]"); for (String token : split1) { - if (token.trim().equals("")) { + if (token.trim().isEmpty()) { continue; } s1Grams.add(token); } String[] split2 = s2.split("[ ,]"); for (String token : split2) { - if (token.trim().equals("")) { + if (token.trim().isEmpty()) { continue; } s2Grams.add(token); @@ -99,8 +97,7 @@ public class FuzzyStringMatchScorer implements LinkedEntityScorer<AdminBoundaryC return Math.min(Math.min(a, b), c); } - public int getLevenshteinDistance(CharSequence str1, - CharSequence str2) { + public int getLevenshteinDistance(CharSequence str1, CharSequence str2) { int[][] distance = new int[str1.length() + 1][str2.length() + 1]; for (int i = 0; i <= str1.length(); i++) { @@ -112,9 +109,7 @@ public class FuzzyStringMatchScorer implements LinkedEntityScorer<AdminBoundaryC for (int i = 1; i <= str1.length(); i++) { for (int j = 1; j <= str2.length(); j++) { - distance[i][j] = minimum( - distance[i - 1][j] + 1, - distance[i][j - 1] + 1, + distance[i][j] = minimum(distance[i - 1][j] + 1, distance[i][j - 1] + 1, distance[i - 1][j - 1] + ((str1.charAt(i - 1) == str2.charAt(j - 1)) ? 0 : 1)); } } diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/GeoHashBinningScorer.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/GeoHashBinningScorer.java index 6b245d6..30f98c4 100644 --- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/GeoHashBinningScorer.java +++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/GeoHashBinningScorer.java @@ -18,10 +18,10 @@ package opennlp.addons.geoentitylinker.scoring; import java.util.ArrayList; import java.util.List; import java.util.Map; + import opennlp.addons.geoentitylinker.AdminBoundaryContext; import opennlp.addons.geoentitylinker.GazetteerEntry; import opennlp.tools.entitylinker.EntityLinkerProperties; -import opennlp.tools.entitylinker.BaseLink; import opennlp.tools.entitylinker.LinkedSpan; import opennlp.tools.util.Span; @@ -29,33 +29,28 @@ import opennlp.tools.util.Span; * Scores toponymns based on geographic point binning. Based on the heuristic * that docs are generally about a small amount of locations, so one can detect * outliers by finding those points that are not near the majority - * */ -public class GeoHashBinningScorer implements LinkedEntityScorer<AdminBoundaryContext> { +public class GeoHashBinningScorer implements LinkedEntityScorer<GazetteerEntry, AdminBoundaryContext> { private final PointClustering CLUSTERER = new PointClustering(); - private int PRECISION = 3; + private final static int PRECISION = 3; @Override - public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, AdminBoundaryContext additionalContext) { + public void score(List<LinkedSpan<GazetteerEntry>> linkedSpans, String docText, Span[] sentenceSpans, + EntityLinkerProperties properties, AdminBoundaryContext additionalContext) { List<GazetteerEntry> allGazEntries = new ArrayList<>(); /* * collect all the gaz entry references */ - for (LinkedSpan<BaseLink> ls : linkedSpans) { - for (BaseLink bl : ls.getLinkedEntries()) { - if (bl instanceof GazetteerEntry) { - allGazEntries.add((GazetteerEntry) bl); - } - } + for (LinkedSpan<GazetteerEntry> ls : linkedSpans) { + allGazEntries.addAll(ls.getLinkedEntries()); } /* * use the point clustering to score each hit */ Map<String, List<GazetteerEntry>> cluster = CLUSTERER.cluster(allGazEntries, PRECISION); CLUSTERER.scoreClusters(cluster); - } } diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/LinkedEntityScorer.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/LinkedEntityScorer.java index 8054d6f..ad9f220 100644 --- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/LinkedEntityScorer.java +++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/LinkedEntityScorer.java @@ -16,6 +16,8 @@ package opennlp.addons.geoentitylinker.scoring; import java.util.List; + +import opennlp.tools.entitylinker.BaseLink; import opennlp.tools.entitylinker.EntityLinkerProperties; import opennlp.tools.entitylinker.LinkedSpan; import opennlp.tools.util.Span; @@ -23,18 +25,21 @@ import opennlp.tools.util.Span; /** * Structure for scoring linked entities. The Map logically represents a pair : * "Score type" to the "actual Score." - * @param <T> a generic for providing additional context + * + * @param <L> A template type for the {@link LinkedSpan} type. + * @param <T> A template type for providing additional context. */ -public interface LinkedEntityScorer<T> { +public interface LinkedEntityScorer<L extends BaseLink, T> { -/** - * Scores a collection of linked entities. Implementations should populate the scoreMap in the list of BaseLink for each linkedSpan - * this method internally affects the reference to linkedSpans that was passed in - * @param linkedSpans the spans that have been linked to some external source and have all the data they need to be scored - * @param docText the full text of the document. - * @param sentenceSpans the sentence spans the correspond to the document text - * @param properties the entitylinker properties config file - * @param additionalContext any additional data required to perform the scoring operation - */ - void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, T additionalContext); + /** + * Scores a collection of linked entities. Implementations should populate the scoreMap in the list of BaseLink for each linkedSpan + * this method internally affects the reference to linkedSpans that was passed in + * @param linkedSpans the spans that have been linked to some external source and have all the data they need to be scored + * @param docText the full text of the document. + * @param sentenceSpans the sentence spans that corresponds to the document text + * @param properties the entitylinker properties config file + * @param additionalContext any additional data required to perform the scoring operation + */ + void score(List<LinkedSpan<L>> linkedSpans, String docText, Span[] sentenceSpans, + EntityLinkerProperties properties, T additionalContext); } diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ModelBasedScorer.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ModelBasedScorer.java index 1c39855..d227b8d 100644 --- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ModelBasedScorer.java +++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ModelBasedScorer.java @@ -20,6 +20,10 @@ import java.lang.invoke.MethodHandles; import java.util.HashMap; import java.util.List; import java.util.Map; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + import opennlp.addons.geoentitylinker.AdminBoundaryContext; import opennlp.tools.doccat.DoccatModel; import opennlp.tools.doccat.DocumentCategorizerME; @@ -27,14 +31,11 @@ import opennlp.tools.entitylinker.EntityLinkerProperties; import opennlp.tools.entitylinker.BaseLink; import opennlp.tools.entitylinker.LinkedSpan; import opennlp.tools.util.Span; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; /** - * * Utilizes a doccat model to score toponyms based on surrounding context */ -public class ModelBasedScorer implements LinkedEntityScorer<AdminBoundaryContext> { +public class ModelBasedScorer implements LinkedEntityScorer<BaseLink, AdminBoundaryContext> { private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); @@ -44,11 +45,12 @@ public class ModelBasedScorer implements LinkedEntityScorer<AdminBoundaryContext boolean modelexists = false; @Override - public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, AdminBoundaryContext additionalContext) { + public void score(List<LinkedSpan<BaseLink>> linkedSpans, String docText, Span[] sentenceSpans, + EntityLinkerProperties properties, AdminBoundaryContext additionalContext) { try { if (doccatModel == null) { String path = properties.getProperty("opennlp.geoentitylinker.modelbasedscorer.modelpath", ""); - if (path.equals("")) { + if (path.isEmpty()) { return; } modelexists = true; @@ -58,7 +60,7 @@ public class ModelBasedScorer implements LinkedEntityScorer<AdminBoundaryContext Map<Integer, String> proximalFeatures = generateProximalFeatures(linkedSpans, sentenceSpans, docText, RADIUS); for (Map.Entry<Integer, String> entry : proximalFeatures.entrySet()) { Map<String, Double> scores = this.getScore(entry.getValue()); - for (BaseLink link : (List<BaseLink>) linkedSpans.get(entry.getKey()).getLinkedEntries()) { + for (BaseLink link : linkedSpans.get(entry.getKey()).getLinkedEntries()) { double score = 0d; if (scores.containsKey(link.getItemParentID())) { score = scores.get(link.getItemParentID()); @@ -83,7 +85,8 @@ public class ModelBasedScorer implements LinkedEntityScorer<AdminBoundaryContext * @return a map of the index of the linked span to the string of surrounding * text: Map<indexofspan,surrounding text> */ - public Map<Integer, String> generateProximalFeatures(List<LinkedSpan> linkedSpans, Span[] sentenceSpans, String docText, int radius) { + public Map<Integer, String> generateProximalFeatures(List<LinkedSpan<BaseLink>> linkedSpans, + Span[] sentenceSpans, String docText, int radius) { Map<Integer, String> featureBags = new HashMap<>(); Map<Integer, Integer> nameMentionMap = new HashMap<>(); /* @@ -144,7 +147,7 @@ public class ModelBasedScorer implements LinkedEntityScorer<AdminBoundaryContext return chunk; } - private Map<String, Double> getScore(String text) throws Exception { + private Map<String, Double> getScore(String text) { Map<String, Double> scoreMap = new HashMap<>(); double[] categorize = documentCategorizerME.categorize(List.of(text).toArray(new String[0])); int catSize = documentCategorizerME.getNumberOfCategories(); diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/PlacetypeScorer.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/PlacetypeScorer.java index 9a3e2d1..4a3b05a 100644 --- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/PlacetypeScorer.java +++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/PlacetypeScorer.java @@ -19,6 +19,7 @@ import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; + import opennlp.addons.geoentitylinker.AdminBoundaryContext; import opennlp.addons.geoentitylinker.GazetteerEntry; import opennlp.tools.entitylinker.EntityLinkerProperties; @@ -29,17 +30,21 @@ import opennlp.tools.util.Span; * * @author mgiaconia */ -public class PlacetypeScorer implements LinkedEntityScorer<AdminBoundaryContext> { +public class PlacetypeScorer implements LinkedEntityScorer<GazetteerEntry, AdminBoundaryContext> { - private static final String[] boosts = "ADM1 ADM1H ADM2 ADM2H ADM3 ADM3H ADM4 ADM4H ADM5 ADMD ADMDH PCLD PCLH PCLI PCLIX TERR PCLIX PPL PPLA PPLA2 PPLA3 PPLA4 PPLC PPLCH PPLF PPLG PPLH PPLL PPLQ PPLR PPLS PPLX STLMT civil Populated_Place".split(" "); - private Map<String, Double> boosetedTypes = new HashMap<>(); + private static final String[] boosts = ("ADM1 ADM1H ADM2 ADM2H ADM3 ADM3H ADM4 ADM4H ADM5 ADMD " + + "ADMDH PCLD PCLH PCLI PCLIX TERR PCLIX PPL PPLA PPLA2 PPLA3 PPLA4 PPLC PPLCH PPLF PPLG " + + "PPLH PPLL PPLQ PPLR PPLS PPLX STLMT civil Populated_Place").split(" "); + + private final Map<String, Double> boosetedTypes = new HashMap<>(); public PlacetypeScorer() { fillMap(); } @Override - public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, AdminBoundaryContext additionalContext) { + public void score(List<LinkedSpan<GazetteerEntry>> linkedSpans, String docText, Span[] sentenceSpans, + EntityLinkerProperties properties, AdminBoundaryContext additionalContext) { for (LinkedSpan<GazetteerEntry> geospan : linkedSpans) { ArrayList<GazetteerEntry> linkedEntries = geospan.getLinkedEntries(); for (GazetteerEntry gazetteerEntry : linkedEntries) { diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/PointClustering.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/PointClustering.java index a8677e4..a49c1aa 100644 --- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/PointClustering.java +++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/PointClustering.java @@ -58,8 +58,8 @@ public class PointClustering { } public void scoreClusters(Map<String, List<GazetteerEntry>> clusters) { - Double min = 0d; - Double max = -1d; + double min = 0d; + double max = -1d; for (String key : clusters.keySet()) { int size = clusters.get(key).size(); if (size > max) { @@ -84,8 +84,7 @@ public class PointClustering { * @return */ public String geoHash(Double lat, Double lon) { - String encodeLatLon = GeohashUtils.encodeLatLon(lat, lon); - return encodeLatLon; + return GeohashUtils.encodeLatLon(lat, lon); } /** @@ -97,8 +96,7 @@ public class PointClustering { */ public double[] geoHashToPoint(String geohash) { Point decode = GeohashUtils.decode(geohash, SpatialContext.GEO); - double[] coords = new double[]{decode.getX(), decode.getY()}; - return coords; + return new double[]{decode.getX(), decode.getY()}; } /** @@ -110,8 +108,7 @@ public class PointClustering { */ public String geoHashToPointStr(String geohash) { Point decode = GeohashUtils.decode(geohash, SpatialContext.GEO); - String point = decode.getX() + "," + decode.getY(); - return point; + return decode.getX() + "," + decode.getY(); } diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ProvinceProximityScorer.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ProvinceProximityScorer.java index 27e9354..6badb60 100644 --- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ProvinceProximityScorer.java +++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ProvinceProximityScorer.java @@ -23,6 +23,7 @@ import java.util.Map; import java.util.Set; import java.util.TreeSet; import java.util.regex.Pattern; + import opennlp.addons.geoentitylinker.AdminBoundaryContext; import opennlp.addons.geoentitylinker.GazetteerEntry; import opennlp.tools.entitylinker.BaseLink; @@ -37,17 +38,15 @@ import opennlp.tools.util.Span; * indicator of Connecticut, it is more likely to be Berlin Connecticut than * Berlin Germany (if Germany did not exist in, or is mentioned further away in * the article). - * - * */ -public class ProvinceProximityScorer implements LinkedEntityScorer<AdminBoundaryContext> { +public class ProvinceProximityScorer implements LinkedEntityScorer<BaseLink, AdminBoundaryContext> { private Map<String, Set<String>> nameCodesMap; String dominantCode = ""; private Map<String, String> regexMap = new HashMap<>(); @Override - public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, AdminBoundaryContext additionalContext) { + public void score(List<LinkedSpan<BaseLink>> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, AdminBoundaryContext additionalContext) { if (!additionalContext.getProvHits().isEmpty()) { regexMap = additionalContext.getProvinceRegexMap(); score(linkedSpans, additionalContext.getProvMentions(), additionalContext.getNameCodesMap(), docText, sentenceSpans, 1000); @@ -81,11 +80,10 @@ public class ProvinceProximityScorer implements LinkedEntityScorer<AdminBoundary * Named Entity. * @return */ - public List<LinkedSpan> score(List<LinkedSpan> linkedData, Map<String, Set<Integer>> countryHits, Map<String, Set<String>> nameCodesMap, String docText, Span[] sentences, Integer maxAllowedDist) { + public List<LinkedSpan<BaseLink>> score(List<LinkedSpan<BaseLink>> linkedData, Map<String, Set<Integer>> countryHits, Map<String, Set<String>> nameCodesMap, String docText, Span[] sentences, Integer maxAllowedDist) { this.nameCodesMap = nameCodesMap; setDominantCode(countryHits); for (LinkedSpan<BaseLink> linkedspan : linkedData) { - linkedspan = simpleProximityAnalysis(sentences, countryHits, linkedspan, maxAllowedDist); } return linkedData; @@ -116,7 +114,8 @@ public class ProvinceProximityScorer implements LinkedEntityScorer<AdminBoundary * @param span * @return */ - private LinkedSpan<BaseLink> simpleProximityAnalysis(Span[] sentences, Map<String, Set<Integer>> countryHits, LinkedSpan<BaseLink> span, Integer maxAllowedDistance) { + private LinkedSpan<BaseLink> simpleProximityAnalysis(Span[] sentences, Map<String, Set<Integer>> countryHits, + LinkedSpan<BaseLink> span, Integer maxAllowedDistance) { Double score = 0.0; /* * get the index of the actual span, begining of sentence //should generate @@ -125,14 +124,14 @@ public class ProvinceProximityScorer implements LinkedEntityScorer<AdminBoundary */ int sentenceIdx = span.getSentenceid(); int sentIndexInDoc = sentences[sentenceIdx].getStart(); - /** + /* * create a map of all the span's proximal country mentions in the document * Map< countrycode, set of <distances from this NamedEntity>> */ Map<String, Set<Integer>> distancesFromCodeMap = new HashMap<>(); //map = Map<countrycode, Set <of distances this span is from all the mentions of the code>> for (String cCode : countryHits.keySet()) { -//iterate over all the regex start values and calculate an offset + //iterate over all the regex start values and calculate an offset for (Integer cHit : countryHits.get(cCode)) { Integer absDist = Math.abs(sentIndexInDoc - cHit); //only include near mentions based on a heuristic @@ -151,7 +150,7 @@ public class ProvinceProximityScorer implements LinkedEntityScorer<AdminBoundary } //we now know how far this named entity is from every country mention in the document - /** + /* * the gaz matches that have a country code that have mentions in the doc * that are closest to the Named Entity should return the best score. * Analyzemap generates a likelihood score that the toponym from the gaz is @@ -209,7 +208,8 @@ public class ProvinceProximityScorer implements LinkedEntityScorer<AdminBoundary * @param span * @return */ - private Map<String, Double> analyzeMap(Map<String, Set<Integer>> distanceMap, Span[] sentences, LinkedSpan<BaseLink> span) { + private Map<String, Double> analyzeMap(Map<String, Set<Integer>> distanceMap, + Span[] sentences, LinkedSpan<BaseLink> span) { Map<String, Double> scoreMap = new HashMap<>(); if (distanceMap.isEmpty()) { @@ -232,7 +232,7 @@ public class ProvinceProximityScorer implements LinkedEntityScorer<AdminBoundary for (Integer i : distanceMap.get(key)) { Double norm = normalize(i, min, max); //reverse the normed distance so low numbers (closer) are better - //this could be improved with a "decaying " function using an imcreaseing negative exponent + //this could be improved with a "decaying " function using an increasing negative exponent Double reverse = Math.abs(norm - 1); normalizedDistances.add(reverse); } @@ -271,10 +271,9 @@ public class ProvinceProximityScorer implements LinkedEntityScorer<AdminBoundary for (double d : windowOfAverages) { sum += d; } - double result = sum / windowOfAverages.size(); //TODO: ++ prob when large amounts of mentions for a code //System.out.println("avg of window:" + result); - return result; + return sum / windowOfAverages.size(); } /**
