Fix checkstyle errors in geoentitylinker
Project: http://git-wip-us.apache.org/repos/asf/opennlp-addons/repo Commit: http://git-wip-us.apache.org/repos/asf/opennlp-addons/commit/9adc2525 Tree: http://git-wip-us.apache.org/repos/asf/opennlp-addons/tree/9adc2525 Diff: http://git-wip-us.apache.org/repos/asf/opennlp-addons/diff/9adc2525 Branch: refs/heads/master Commit: 9adc2525cd9de0617fe5ff72df73da879aed9047 Parents: 6c142db Author: Jörn Kottmann <[email protected]> Authored: Mon Apr 24 14:34:58 2017 +0200 Committer: Jörn Kottmann <[email protected]> Committed: Mon Apr 24 15:19:55 2017 +0200 ---------------------------------------------------------------------- .../addons/geoentitylinker/AdminBoundary.java | 278 +++--- .../geoentitylinker/AdminBoundaryContext.java | 308 +++---- .../AdminBoundaryContextGenerator.java | 847 ++++++++++--------- .../geoentitylinker/CountryContextEntry.java | 238 +++--- .../addons/geoentitylinker/GazetteerEntry.java | 366 ++++---- .../geoentitylinker/GazetteerSearchCache.java | 100 +-- .../geoentitylinker/GazetteerSearcher.java | 526 ++++++------ .../addons/geoentitylinker/GeoEntityLinker.java | 451 +++++----- .../indexing/GazetteerIndexer.java | 414 ++++----- .../indexing/GeonamesFileDownloader.java | 280 +++--- .../indexing/GeonamesProcessor.java | 592 ++++++------- .../indexing/RegionProcessor.java | 228 ++--- .../geoentitylinker/indexing/USGSProcessor.java | 505 +++++------ .../scoring/CountryProximityScorer.java | 564 ++++++------ .../scoring/FuzzyStringMatchScorer.java | 248 +++--- .../scoring/GeoHashBinningScorer.java | 126 +-- .../scoring/LinkedEntityScorer.java | 82 +- .../scoring/ModelBasedScorer.java | 323 +++---- .../scoring/PlacetypeScorer.java | 166 ++-- .../scoring/PointClustering.java | 250 +++--- .../scoring/ProvinceProximityScorer.java | 590 ++++++------- 21 files changed, 3767 insertions(+), 3715 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/opennlp-addons/blob/9adc2525/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundary.java ---------------------------------------------------------------------- diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundary.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundary.java index a01b0bb..53f80c6 100644 --- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundary.java +++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundary.java @@ -1,138 +1,140 @@ -/* - * Copyright 2014 The Apache Software Foundation. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package opennlp.addons.geoentitylinker; - -import java.util.Objects; - -/** - * Stores an admin boundary down to the US county level. Only US places from the - * USGS Gazetteer will have county level info - */ -public class AdminBoundary { - - private static final String NO_DATA_FOUND_VALUE = "NO_DATA_FOUND"; - private final String countryCode; - private final String provinceCode; - private final String provinceName; - private final String countryName; - private final String countyName; - private final String countyCode; - private final String countryRegex; - private final String provinceRegex; - private final String countyRegex; - - - public AdminBoundary(String countryCode, String countryName, String provinceCode, String provinceName, String countyCode, String countyName, - String countryRegex, String provinceRegex, String countyRegex) { - this.countryCode = countryCode; - this.provinceCode = provinceCode; - this.provinceName = provinceName; - this.countryName = countryName; - this.countyName = countyName.equals("") ? NO_DATA_FOUND_VALUE : countyName; - this.countyCode = countyCode.equals("") ? NO_DATA_FOUND_VALUE : countyCode; - this.countryRegex = countryRegex; - this.provinceRegex = provinceRegex; - this.countyRegex = countyRegex; - } - - public String getCountryCode() { - return countryCode; - } - - public String getProvCode() { - return provinceCode; - } - - public String getProvinceName() { - return provinceName; - } - - public String getCountryName() { - return countryName; - } - - public String getCountyName() { - return countyName; - } - - public String getCountyCode() { - return countyCode; - } - - @Override - public String toString() { - return "AdminBoundary{" + "countryCode=" + countryCode + ", provinceCode=" + provinceCode + ", provinceName=" + provinceName + ", countryName=" + countryName + ", countyName=" + countyName + ", countyCode=" + countyCode + '}'; - } - - @Override - public int hashCode() { - int hash = 7; - hash = 11 * hash + Objects.hashCode(this.countryCode); - hash = 11 * hash + Objects.hashCode(this.provinceCode); - hash = 11 * hash + Objects.hashCode(this.provinceName); - hash = 11 * hash + Objects.hashCode(this.countryName); - hash = 11 * hash + Objects.hashCode(this.countyName); - hash = 11 * hash + Objects.hashCode(this.countyCode); - return hash; - } - - @Override - public boolean equals(Object obj) { - if (obj == null) { - return false; - } - if (getClass() != obj.getClass()) { - return false; - } - final AdminBoundary other = (AdminBoundary) obj; - if (!Objects.equals(this.countryCode, other.countryCode)) { - return false; - } - if (!Objects.equals(this.provinceCode, other.provinceCode)) { - return false; - } - if (!Objects.equals(this.provinceName, other.provinceName)) { - return false; - } - if (!Objects.equals(this.countryName, other.countryName)) { - return false; - } - if (!Objects.equals(this.countyName, other.countyName)) { - return false; - } - if (!Objects.equals(this.countyCode, other.countyCode)) { - return false; - } - return true; - } - - public String getProvinceCode() { - return provinceCode; - } - - public String getCountryRegex() { - return countryRegex; - } - - public String getProvinceRegex() { - return provinceRegex; - } - - public String getCountyRegex() { - return countyRegex; - } - -} +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.addons.geoentitylinker; + +import java.util.Objects; + +/** + * Stores an admin boundary down to the US county level. Only US places from the + * USGS Gazetteer will have county level info + */ +public class AdminBoundary { + + private static final String NO_DATA_FOUND_VALUE = "NO_DATA_FOUND"; + private final String countryCode; + private final String provinceCode; + private final String provinceName; + private final String countryName; + private final String countyName; + private final String countyCode; + private final String countryRegex; + private final String provinceRegex; + private final String countyRegex; + + + public AdminBoundary(String countryCode, String countryName, String provinceCode, String provinceName, String countyCode, String countyName, + String countryRegex, String provinceRegex, String countyRegex) { + this.countryCode = countryCode; + this.provinceCode = provinceCode; + this.provinceName = provinceName; + this.countryName = countryName; + this.countyName = countyName.equals("") ? NO_DATA_FOUND_VALUE : countyName; + this.countyCode = countyCode.equals("") ? NO_DATA_FOUND_VALUE : countyCode; + this.countryRegex = countryRegex; + this.provinceRegex = provinceRegex; + this.countyRegex = countyRegex; + } + + public String getCountryCode() { + return countryCode; + } + + public String getProvCode() { + return provinceCode; + } + + public String getProvinceName() { + return provinceName; + } + + public String getCountryName() { + return countryName; + } + + public String getCountyName() { + return countyName; + } + + public String getCountyCode() { + return countyCode; + } + + @Override + public String toString() { + return "AdminBoundary{" + "countryCode=" + countryCode + ", provinceCode=" + provinceCode + ", provinceName=" + provinceName + ", countryName=" + countryName + ", countyName=" + countyName + ", countyCode=" + countyCode + '}'; + } + + @Override + public int hashCode() { + int hash = 7; + hash = 11 * hash + Objects.hashCode(this.countryCode); + hash = 11 * hash + Objects.hashCode(this.provinceCode); + hash = 11 * hash + Objects.hashCode(this.provinceName); + hash = 11 * hash + Objects.hashCode(this.countryName); + hash = 11 * hash + Objects.hashCode(this.countyName); + hash = 11 * hash + Objects.hashCode(this.countyCode); + return hash; + } + + @Override + public boolean equals(Object obj) { + if (obj == null) { + return false; + } + if (getClass() != obj.getClass()) { + return false; + } + final AdminBoundary other = (AdminBoundary) obj; + if (!Objects.equals(this.countryCode, other.countryCode)) { + return false; + } + if (!Objects.equals(this.provinceCode, other.provinceCode)) { + return false; + } + if (!Objects.equals(this.provinceName, other.provinceName)) { + return false; + } + if (!Objects.equals(this.countryName, other.countryName)) { + return false; + } + if (!Objects.equals(this.countyName, other.countyName)) { + return false; + } + if (!Objects.equals(this.countyCode, other.countyCode)) { + return false; + } + return true; + } + + public String getProvinceCode() { + return provinceCode; + } + + public String getCountryRegex() { + return countryRegex; + } + + public String getProvinceRegex() { + return provinceRegex; + } + + public String getCountyRegex() { + return countyRegex; + } + +} http://git-wip-us.apache.org/repos/asf/opennlp-addons/blob/9adc2525/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContext.java ---------------------------------------------------------------------- diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContext.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContext.java index 4fccffb..f092ba1 100644 --- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContext.java +++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContext.java @@ -1,153 +1,155 @@ -/* - * Copyright 2014 The Apache Software Foundation. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package opennlp.addons.geoentitylinker; - -import java.util.HashMap; -import java.util.HashSet; -import java.util.Map; -import java.util.Set; - -public class AdminBoundaryContext { - - private final Map<String, Set<Integer>> countryMentions; - private final Map<String, Set<Integer>> provMentions; - private final Map<String, Set<Integer>> countyMentions; - private final Set<String> countryHits; - private final Set<String> provHits; - private final Set<String> countyHits; - private final Map<String, String> countryRefMap; - private final Map<String, String> countryRegexMap; - private final Map<String, String> countyRegexMap; - private final Map<String, String> provinceRegexMap; - private final Map<String, Map<String, String>> provRefMap; - private final Map<String, Map<String, String>> countyRefMap; - private final Set<String> whereClauses; - private final Map<String, Set<String>> nameCodesMap; - - public AdminBoundaryContext(Map<String, Set<Integer>> countryMentions, - Map<String, Set<Integer>> provMentions, - Map<String, Set<Integer>> countyMentions, - Set<String> countryHits, - Set<String> provHits, - Set<String> countyHits, - Map<String, String> countryRefMap, - Map<String, Map<String, String>> provRefMap, - Map<String, Map<String, String>> countyRefMap, Map<String, Set<String>> nameCodesMap, Map<String, String> countryRegexMap, Map<String, String> provinceRegexMap, - Map<String, String> countyRegexMap) { - this.countryMentions = countryMentions; - this.provMentions = provMentions; - this.countyMentions = countyMentions; - this.countryHits = countryHits; - this.provHits = provHits; - this.countyHits = countyHits; - this.countryRefMap = countryRefMap; - this.provRefMap = provRefMap; - this.countyRefMap = countyRefMap; - this.whereClauses = setWhereClauses(); - this.nameCodesMap = nameCodesMap; - this.countryRegexMap = countryRegexMap; - this.provinceRegexMap = provinceRegexMap; - this.countyRegexMap = countyRegexMap; - } - - public Map<String, Set<String>> getNameCodesMap() { - return nameCodesMap; - } - - public Map<String, Set<Integer>> getCountryMentions() { - return countryMentions; - } - - public Map<String, Set<Integer>> getProvMentions() { - return provMentions; - } - - public Map<String, Set<Integer>> getCountyMentions() { - return countyMentions; - } - - public Set<String> getCountryHits() { - return countryHits; - } - - public Set<String> getProvHits() { - return provHits; - } - - public Set<String> getCountyHits() { - return countyHits; - } - - public Map<String, String> getCountryRefMap() { - return countryRefMap; - } - - public Map<String, Map<String, String>> getProvRefMap() { - return provRefMap; - } - - public Map<String, Map<String, String>> getCountyRefMap() { - return countyRefMap; - } - - public Set<String> getWhereClauses() { - return whereClauses; - } - - private Set<String> setWhereClauses() { - Set<String> clauses = new HashSet<>(); - for (String countryCode : this.getCountryHits()) { - String gazType = countryCode.toLowerCase().equals("us") ? " AND gazsource:usgs" : " AND gazsource:geonames"; - if (countryCode.toLowerCase().matches(".*rg[0-9].*")) { - gazType = " AND gazsource:region"; - } - Map<String, String> provsForCountry = this.getProvRefMap().get(countryCode); - if (provsForCountry == null) { - provsForCountry = new HashMap<>(); - } - Map<String, String> provs = new HashMap<>(); - -// if (!provsForCountry.isEmpty()) { -// for (String pcode : provsForCountry.keySet()) { -// if (this.getProvHits().contains(pcode)) { -// provs.put(pcode, provsForCountry.get(pcode)); -// -// clauses.add(" countrycode:" + countryCode + " AND admincode:" + pcode + gazType); -// -// } -// } -// } - if (provs.isEmpty()) { - //got a country with no mentioned provs - clauses.add(" countrycode:" + countryCode + gazType); - } - } - return clauses; - } - - public Map<String, String> getCountryRegexMap() { - return countryRegexMap; - } - - public Map<String, String> getCountyRegexMap() { - return countyRegexMap; - } - - public Map<String, String> getProvinceRegexMap() { - return provinceRegexMap; - } - -} +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.addons.geoentitylinker; + +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; + +public class AdminBoundaryContext { + + private final Map<String, Set<Integer>> countryMentions; + private final Map<String, Set<Integer>> provMentions; + private final Map<String, Set<Integer>> countyMentions; + private final Set<String> countryHits; + private final Set<String> provHits; + private final Set<String> countyHits; + private final Map<String, String> countryRefMap; + private final Map<String, String> countryRegexMap; + private final Map<String, String> countyRegexMap; + private final Map<String, String> provinceRegexMap; + private final Map<String, Map<String, String>> provRefMap; + private final Map<String, Map<String, String>> countyRefMap; + private final Set<String> whereClauses; + private final Map<String, Set<String>> nameCodesMap; + + public AdminBoundaryContext(Map<String, Set<Integer>> countryMentions, + Map<String, Set<Integer>> provMentions, + Map<String, Set<Integer>> countyMentions, + Set<String> countryHits, + Set<String> provHits, + Set<String> countyHits, + Map<String, String> countryRefMap, + Map<String, Map<String, String>> provRefMap, + Map<String, Map<String, String>> countyRefMap, Map<String, Set<String>> nameCodesMap, Map<String, String> countryRegexMap, Map<String, String> provinceRegexMap, + Map<String, String> countyRegexMap) { + this.countryMentions = countryMentions; + this.provMentions = provMentions; + this.countyMentions = countyMentions; + this.countryHits = countryHits; + this.provHits = provHits; + this.countyHits = countyHits; + this.countryRefMap = countryRefMap; + this.provRefMap = provRefMap; + this.countyRefMap = countyRefMap; + this.whereClauses = setWhereClauses(); + this.nameCodesMap = nameCodesMap; + this.countryRegexMap = countryRegexMap; + this.provinceRegexMap = provinceRegexMap; + this.countyRegexMap = countyRegexMap; + } + + public Map<String, Set<String>> getNameCodesMap() { + return nameCodesMap; + } + + public Map<String, Set<Integer>> getCountryMentions() { + return countryMentions; + } + + public Map<String, Set<Integer>> getProvMentions() { + return provMentions; + } + + public Map<String, Set<Integer>> getCountyMentions() { + return countyMentions; + } + + public Set<String> getCountryHits() { + return countryHits; + } + + public Set<String> getProvHits() { + return provHits; + } + + public Set<String> getCountyHits() { + return countyHits; + } + + public Map<String, String> getCountryRefMap() { + return countryRefMap; + } + + public Map<String, Map<String, String>> getProvRefMap() { + return provRefMap; + } + + public Map<String, Map<String, String>> getCountyRefMap() { + return countyRefMap; + } + + public Set<String> getWhereClauses() { + return whereClauses; + } + + private Set<String> setWhereClauses() { + Set<String> clauses = new HashSet<>(); + for (String countryCode : this.getCountryHits()) { + String gazType = countryCode.toLowerCase().equals("us") ? " AND gazsource:usgs" : " AND gazsource:geonames"; + if (countryCode.toLowerCase().matches(".*rg[0-9].*")) { + gazType = " AND gazsource:region"; + } + Map<String, String> provsForCountry = this.getProvRefMap().get(countryCode); + if (provsForCountry == null) { + provsForCountry = new HashMap<>(); + } + Map<String, String> provs = new HashMap<>(); + +// if (!provsForCountry.isEmpty()) { +// for (String pcode : provsForCountry.keySet()) { +// if (this.getProvHits().contains(pcode)) { +// provs.put(pcode, provsForCountry.get(pcode)); +// +// clauses.add(" countrycode:" + countryCode + " AND admincode:" + pcode + gazType); +// +// } +// } +// } + if (provs.isEmpty()) { + //got a country with no mentioned provs + clauses.add(" countrycode:" + countryCode + gazType); + } + } + return clauses; + } + + public Map<String, String> getCountryRegexMap() { + return countryRegexMap; + } + + public Map<String, String> getCountyRegexMap() { + return countyRegexMap; + } + + public Map<String, String> getProvinceRegexMap() { + return provinceRegexMap; + } + +} http://git-wip-us.apache.org/repos/asf/opennlp-addons/blob/9adc2525/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContextGenerator.java ---------------------------------------------------------------------- diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContextGenerator.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContextGenerator.java index b645156..185e171 100644 --- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContextGenerator.java +++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContextGenerator.java @@ -1,422 +1,425 @@ -/* - * Copyright 2013 The Apache Software Foundation. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package opennlp.addons.geoentitylinker; - -import java.io.BufferedReader; -import java.io.File; -import java.io.FileReader; -import java.io.IOException; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.logging.Level; - -import java.util.regex.Matcher; -import java.util.regex.Pattern; -import opennlp.tools.entitylinker.EntityLinkerProperties; -import org.apache.log4j.Logger; - -/** - * Finds instances of country mentions in a String, typically a document text. - * Used to boost or degrade scoring of linked geo entities - * - */ -public class AdminBoundaryContextGenerator { - - private static final Logger LOGGER = Logger.getLogger(AdminBoundaryContextGenerator.class); - private List<CountryContextEntry> countrydata; - private Map<String, Set<String>> nameCodesMap = new HashMap<>(); - private Map<String, Set<Integer>> countryMentions = new HashMap<>(); - - Map<String, String> countryRegexMap = new HashMap<>(); - Map<String, String> provinceRegexMap = new HashMap<>(); - Map<String, String> countyRegexMap = new HashMap<>(); - - private Set<CountryContextEntry> countryHits = new HashSet<>(); - private EntityLinkerProperties properties; - private List<AdminBoundary> adminBoundaryData= new ArrayList<>(); - private Set<AdminBoundary> adminBoundaryHits = new HashSet<>(); - private AdminBoundaryContext context; - - public AdminBoundaryContext getContext(String text) { - context = null; - nameCodesMap.clear(); - context = process(text); - - return context; - } - - private Set<String> countryHitSet = new HashSet<>(); - private Map<String, String> countryMap = new HashMap<>(); - private Map<String, Map<String, String>> provMap = new HashMap<>(); - private Map<String, Map<String, String>> countyMap = new HashMap<>(); - - private Map<String, Set<Integer>> provMentions = new HashMap<>(); - private Map<String, Set<Integer>> countyMentions = new HashMap<>(); - - private Set<String> provHits = new HashSet<String>(); - private Set<String> countyHits = new HashSet<String>(); - - public static void main(String[] args) { - try { - AdminBoundaryContextGenerator countryContext - = new AdminBoundaryContextGenerator(new EntityLinkerProperties(new File("C:\\Temp\\gaz_data\\newCountryContextfile.txt"))); - - AdminBoundaryContext c = countryContext.process("This artcle is about fairfax county virginia in the north of florida in the united states. It is also about Moscow and atlanta. Hillsborough county florida is a nice place. Eastern Africa people are cool."); - System.out.println(c); - } catch (Exception ex) { - java.util.logging.Logger.getLogger(AdminBoundaryContextGenerator.class.getName()).log(Level.SEVERE, null, ex); - } - } - - public AdminBoundaryContextGenerator(EntityLinkerProperties properties) throws IOException { - this.properties = properties; - if (countrydata == null) { - String path = this.properties.getProperty("opennlp.geoentitylinker.countrycontext.filepath", ""); - if (path == null || path.trim().isEmpty()) { - throw new IOException("missing country context data configuration. Property opennlp.geoentitylinker.countrycontext.filepath must have a valid path value in entitylinker properties file"); - } - File countryContextFile = new File(path); - if (countryContextFile == null || !countryContextFile.exists()) { - throw new IOException("missing country context file"); - } - //countrydata = getCountryContextFromFile(countryContextFile); - getContextFromFile(countryContextFile); - if (adminBoundaryData.isEmpty()) { - throw new IOException("missing country context data"); - } - } - } - - public Map<String, Set<Integer>> getCountryMentions() { - return countryMentions; - } - - /** - * returns the last set of hits after calling regexFind - * - * @return - */ - public Set<CountryContextEntry> getCountryHits() { - return countryHits; - } - - /** - * returns the last name to codes map after calling regexFind - * - * @return - */ - public Map<String, Set<String>> getNameCodesMap() { - return nameCodesMap; - } - - public void setNameCodesMap(Map<String, Set<String>> nameCodesMap) { - this.nameCodesMap = nameCodesMap; - } - - private void reset() { - this.nameCodesMap.clear(); - this.countryHitSet.clear(); - this.countryHits.clear(); - this.countryMentions.clear(); - this.provHits.clear(); - this.provMentions.clear(); - this.countyHits.clear(); - this.countyMentions.clear(); - this.adminBoundaryHits.clear(); - } - - /** - * Finds indicators of countries, provinces, and cities, as per the USGS and - * Geonames gazetteers. The results of this are used to score toponymns - * downstream. The full text of a document should be passed in here. - * - * @param text the full text of the document (block of text). - * @return - */ - private AdminBoundaryContext process(String text) { - try { - - reset(); - Map<String, Set<Integer>> countryhitMap = regexfind(text, countryMap, countryHitSet, "country"); - if (!countryhitMap.isEmpty()) { - for (String cc : countryhitMap.keySet()) { - Map<String, String> provsForCc = provMap.get(cc); - if (provsForCc != null) { - provMentions.putAll(regexfind(text, provsForCc, provHits, "province")); - if (provMentions != null) { - for (String prov : provMentions.keySet()) { - Map<String, String> get = countyMap.get(prov); - if (get != null) { - countyMentions.putAll(regexfind(text, get, countyHits, "province")); - } - } - } - } - } - } else { - for (Map<String, String> provsForCc : provMap.values()) { - if (provsForCc != null) { - provMentions = regexfind(text, provsForCc, provHits, "province"); - if (provMentions != null) { - for (String prov : provMentions.keySet()) { - //fake a country hit based on a province hit... this gets fuzzy - String cc = prov.split("\\.")[0]; - if (!countryhitMap.containsKey(cc)) { - countryhitMap.put(cc, provMentions.get(prov)); - countryHitSet.add(cc); - } else { - countryhitMap.get(cc).addAll(provMentions.get(prov)); - } - Map<String, String> get = countyMap.get(prov); - if (get != null) { - countyMentions = regexfind(text, get, countyHits, "oounty"); - } - } - } - } - } - } - - Map<String, String> countryRefMap = new HashMap<>(); - - for (String c : countryHitSet) { - String countryName = countryMap.get(c); - if (countryName != null) { - countryRefMap.put(c, countryName); - } - } - - AdminBoundaryContext context - = new AdminBoundaryContext(countryhitMap, provMentions, countyMentions, countryHitSet, provHits, countyHits, - countryRefMap, provMap, countyMap, nameCodesMap, countryRegexMap, provinceRegexMap, countyRegexMap); - - return context; - } catch (Exception e) { - e.printStackTrace(); - } - return null; - } - - /** - * discovers indicators of admin boundary data using regex. - * - * @param docText the full text - * @param lookupMap a map to use to find names. the key=a location code, the - * value is an actual name. - * @param hitsRef a reference to a set that stores the hits by id - * @return - */ - private Map<String, Set<Integer>> regexfind(String docText, Map<String, String> lookupMap, Set<String> hitsRef, String locationType) { - Map<String, Set<Integer>> mentions = new HashMap<>(); - if (lookupMap == null) { - return mentions; - } - try { - - for (String entry : lookupMap.keySet()) { - - String name = lookupMap.get(entry).toLowerCase(); - if (name == null) { - continue; - } - switch (locationType) { - case "country": - if (this.countryRegexMap.containsKey(entry)) { - name = countryRegexMap.get(entry); - } - break; - - case "province": - if (this.provinceRegexMap.containsKey(entry)) { - name = provinceRegexMap.get(entry); - } - break; - case "county": - if (this.countyRegexMap.containsKey(entry)) { - name = countyRegexMap.get(entry); - } - break; - } - name = "(^|[^\\p{L}\\p{Nd}])" + name.replace(", the", "") + "([^\\p{L}\\p{Nd}]|$)"; - Pattern regex = Pattern.compile(name, Pattern.CASE_INSENSITIVE | Pattern.DOTALL); - Matcher rs = regex.matcher(docText); - String code = entry.toLowerCase(); - code = code.trim().replace("", ""); - boolean found = false; - while (rs.find()) { - found = true; - Integer start = rs.start(); - String hit = rs.group().toLowerCase().trim(); - hit = hit.replaceAll("\\.|,|;|\\?|!|\\\\|/|\"|'|=|-|&", ""); - if (mentions.containsKey(code)) { - mentions.get(code).add(start); - } else { - Set<Integer> newset = new HashSet<Integer>(); - newset.add(start); - mentions.put(code, newset); - } - if (!hit.equals("")) { - if (this.nameCodesMap.containsKey(hit)) { - nameCodesMap.get(hit).add(code); - } else { - HashSet<String> newset = new HashSet<String>(); - newset.add(code); - nameCodesMap.put(hit, newset); - } - } - - } - if (found) { - hitsRef.add(code); - - } - } - - } catch (Exception ex) { - LOGGER.error(ex); - ex.printStackTrace(); - - } - - return mentions; - } - - private void getContextFromFile(File countryContextFile) { - if (this.adminBoundaryData != null && !this.adminBoundaryData.isEmpty()) { - return; - } - - BufferedReader reader; - try { - reader = new BufferedReader(new FileReader(countryContextFile)); - String line = ""; - int lineNum = 0; - while ((line = reader.readLine()) != null) { - String[] values = line.split("\t"); - if (lineNum == 0) { - lineNum++; - continue; - //skip column name headers - } - if (values.length == 9) { - AdminBoundary entry = new AdminBoundary( - values[0].toLowerCase().trim().replace("", ""), - values[3].toLowerCase().trim(), - values[1].toLowerCase().trim(), - values[4].toLowerCase().trim(), - values[2].toLowerCase().trim(), - values[5].toLowerCase().trim(), - values[6].toLowerCase().trim(), - values[7].toLowerCase().trim(), - values[8].toLowerCase().trim()); - this.adminBoundaryData.add(entry); - } else { - throw new IllegalArgumentException("Improperly formatted file"); - } - - } - reader.close(); - } catch (IOException ex) { - LOGGER.error(ex); - } - - loadMaps(this.adminBoundaryData); - - } - - private void loadMaps(List<AdminBoundary> boundaries) { - for (AdminBoundary adm : boundaries) { - if (!adm.getCountryCode().equals("null")) { - countryMap.put(adm.getCountryCode(), adm.getCountryName()); - if (countryRegexMap.containsKey(adm.getCountryCode())) { - String currentRegex = countryRegexMap.get(adm.getCountryCode()); - if (currentRegex.length() > adm.getCountryRegex().length()) { - // the longest one wins if they are not all the same for each entry in the file - countryRegexMap.put(adm.getCountryCode(), currentRegex); - }//else do nothing - } else { - countryRegexMap.put(adm.getCountryCode(), adm.getCountryRegex()); - } - - if (!adm.getProvCode().equals("null")) { - Map<String, String> provs = provMap.get(adm.getCountryCode()); - if (provs == null) { - provs = new HashMap<>(); - } - //if (!provs.containsKey(adm.getProvCode())) { - provs.put(adm.getCountryCode() + "." + adm.getProvCode(), adm.getProvinceName()); - provMap.put(adm.getCountryCode(), provs); - // } - - if (!adm.getCountyCode().toLowerCase().equals("no_data_found") && !adm.getCountyName().toLowerCase().equals("no_data_found")) { - Map<String, String> counties = countyMap.get(adm.getCountryCode() + "." + adm.getProvCode()); - if (counties == null) { - counties = new HashMap<>(); - } // if (!counties.containsKey(adm.getCountyCode())) { - String countyid = adm.getCountryCode() + "." + adm.getProvCode() + "." + adm.getCountyCode(); - counties.put(countyid, adm.getCountyName()); - countyMap.put(adm.getCountryCode() + "." + adm.getProvCode(), counties); - // } - - } - - } - } - } - fillProvRegexMap(); - fillCountyRegexMap(); - } - - private void fillProvRegexMap() { - this.provinceRegexMap = new HashMap<>(); - // this.adminBoundaryData - for (AdminBoundary adm : adminBoundaryData) { - - if (provinceRegexMap.containsKey(adm.getProvCode())) { - String currentRegex = provinceRegexMap.get(adm.getProvCode()); - if (currentRegex.length() > adm.getProvinceRegex().length()) { - // the longest one wins if they are not all the same for each entry in the file - provinceRegexMap.put(adm.getProvCode(), currentRegex); - }//else do nothing - } else { - provinceRegexMap.put(adm.getProvCode(), adm.getProvinceRegex()); - } - } - } - - private void fillCountyRegexMap() { - this.countyRegexMap = new HashMap<>(); - // this.adminBoundaryData - for (AdminBoundary adm : adminBoundaryData) { - - if (countyRegexMap.containsKey(adm.getCountyCode())) { - String currentRegex = countyRegexMap.get(adm.getCountyCode()); - if (currentRegex.length() > adm.getCountyRegex().length()) { - // the longest one wins if they are not all the same for each entry in the file - countyRegexMap.put(adm.getCountyCode(), currentRegex); - }//else do nothing - } else { - countyRegexMap.put(adm.getCountyCode(), adm.getCountyRegex()); - } - } - - } - -} +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.addons.geoentitylinker; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.logging.Level; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.log4j.Logger; + +import opennlp.tools.entitylinker.EntityLinkerProperties; + +/** + * Finds instances of country mentions in a String, typically a document text. + * Used to boost or degrade scoring of linked geo entities + * + */ +public class AdminBoundaryContextGenerator { + + private static final Logger LOGGER = Logger.getLogger(AdminBoundaryContextGenerator.class); + private List<CountryContextEntry> countrydata; + private Map<String, Set<String>> nameCodesMap = new HashMap<>(); + private Map<String, Set<Integer>> countryMentions = new HashMap<>(); + + Map<String, String> countryRegexMap = new HashMap<>(); + Map<String, String> provinceRegexMap = new HashMap<>(); + Map<String, String> countyRegexMap = new HashMap<>(); + + private Set<CountryContextEntry> countryHits = new HashSet<>(); + private EntityLinkerProperties properties; + private List<AdminBoundary> adminBoundaryData= new ArrayList<>(); + private Set<AdminBoundary> adminBoundaryHits = new HashSet<>(); + private AdminBoundaryContext context; + + public AdminBoundaryContext getContext(String text) { + context = null; + nameCodesMap.clear(); + context = process(text); + + return context; + } + + private Set<String> countryHitSet = new HashSet<>(); + private Map<String, String> countryMap = new HashMap<>(); + private Map<String, Map<String, String>> provMap = new HashMap<>(); + private Map<String, Map<String, String>> countyMap = new HashMap<>(); + + private Map<String, Set<Integer>> provMentions = new HashMap<>(); + private Map<String, Set<Integer>> countyMentions = new HashMap<>(); + + private Set<String> provHits = new HashSet<String>(); + private Set<String> countyHits = new HashSet<String>(); + + public static void main(String[] args) { + try { + AdminBoundaryContextGenerator countryContext + = new AdminBoundaryContextGenerator(new EntityLinkerProperties(new File("C:\\Temp\\gaz_data\\newCountryContextfile.txt"))); + + AdminBoundaryContext c = countryContext.process("This artcle is about fairfax county virginia in the north of florida in the united states. It is also about Moscow and atlanta. Hillsborough county florida is a nice place. Eastern Africa people are cool."); + System.out.println(c); + } catch (Exception ex) { + java.util.logging.Logger.getLogger(AdminBoundaryContextGenerator.class.getName()).log(Level.SEVERE, null, ex); + } + } + + public AdminBoundaryContextGenerator(EntityLinkerProperties properties) throws IOException { + this.properties = properties; + if (countrydata == null) { + String path = this.properties.getProperty("opennlp.geoentitylinker.countrycontext.filepath", ""); + if (path == null || path.trim().isEmpty()) { + throw new IOException("missing country context data configuration. Property opennlp.geoentitylinker.countrycontext.filepath must have a valid path value in entitylinker properties file"); + } + File countryContextFile = new File(path); + if (countryContextFile == null || !countryContextFile.exists()) { + throw new IOException("missing country context file"); + } + //countrydata = getCountryContextFromFile(countryContextFile); + getContextFromFile(countryContextFile); + if (adminBoundaryData.isEmpty()) { + throw new IOException("missing country context data"); + } + } + } + + public Map<String, Set<Integer>> getCountryMentions() { + return countryMentions; + } + + /** + * returns the last set of hits after calling regexFind + * + * @return + */ + public Set<CountryContextEntry> getCountryHits() { + return countryHits; + } + + /** + * returns the last name to codes map after calling regexFind + * + * @return + */ + public Map<String, Set<String>> getNameCodesMap() { + return nameCodesMap; + } + + public void setNameCodesMap(Map<String, Set<String>> nameCodesMap) { + this.nameCodesMap = nameCodesMap; + } + + private void reset() { + this.nameCodesMap.clear(); + this.countryHitSet.clear(); + this.countryHits.clear(); + this.countryMentions.clear(); + this.provHits.clear(); + this.provMentions.clear(); + this.countyHits.clear(); + this.countyMentions.clear(); + this.adminBoundaryHits.clear(); + } + + /** + * Finds indicators of countries, provinces, and cities, as per the USGS and + * Geonames gazetteers. The results of this are used to score toponymns + * downstream. The full text of a document should be passed in here. + * + * @param text the full text of the document (block of text). + * @return + */ + private AdminBoundaryContext process(String text) { + try { + + reset(); + Map<String, Set<Integer>> countryhitMap = regexfind(text, countryMap, countryHitSet, "country"); + if (!countryhitMap.isEmpty()) { + for (String cc : countryhitMap.keySet()) { + Map<String, String> provsForCc = provMap.get(cc); + if (provsForCc != null) { + provMentions.putAll(regexfind(text, provsForCc, provHits, "province")); + if (provMentions != null) { + for (String prov : provMentions.keySet()) { + Map<String, String> get = countyMap.get(prov); + if (get != null) { + countyMentions.putAll(regexfind(text, get, countyHits, "province")); + } + } + } + } + } + } else { + for (Map<String, String> provsForCc : provMap.values()) { + if (provsForCc != null) { + provMentions = regexfind(text, provsForCc, provHits, "province"); + if (provMentions != null) { + for (String prov : provMentions.keySet()) { + //fake a country hit based on a province hit... this gets fuzzy + String cc = prov.split("\\.")[0]; + if (!countryhitMap.containsKey(cc)) { + countryhitMap.put(cc, provMentions.get(prov)); + countryHitSet.add(cc); + } else { + countryhitMap.get(cc).addAll(provMentions.get(prov)); + } + Map<String, String> get = countyMap.get(prov); + if (get != null) { + countyMentions = regexfind(text, get, countyHits, "oounty"); + } + } + } + } + } + } + + Map<String, String> countryRefMap = new HashMap<>(); + + for (String c : countryHitSet) { + String countryName = countryMap.get(c); + if (countryName != null) { + countryRefMap.put(c, countryName); + } + } + + AdminBoundaryContext context + = new AdminBoundaryContext(countryhitMap, provMentions, countyMentions, countryHitSet, provHits, countyHits, + countryRefMap, provMap, countyMap, nameCodesMap, countryRegexMap, provinceRegexMap, countyRegexMap); + + return context; + } catch (Exception e) { + e.printStackTrace(); + } + return null; + } + + /** + * discovers indicators of admin boundary data using regex. + * + * @param docText the full text + * @param lookupMap a map to use to find names. the key=a location code, the + * value is an actual name. + * @param hitsRef a reference to a set that stores the hits by id + * @return + */ + private Map<String, Set<Integer>> regexfind(String docText, Map<String, String> lookupMap, Set<String> hitsRef, String locationType) { + Map<String, Set<Integer>> mentions = new HashMap<>(); + if (lookupMap == null) { + return mentions; + } + try { + + for (String entry : lookupMap.keySet()) { + + String name = lookupMap.get(entry).toLowerCase(); + if (name == null) { + continue; + } + switch (locationType) { + case "country": + if (this.countryRegexMap.containsKey(entry)) { + name = countryRegexMap.get(entry); + } + break; + + case "province": + if (this.provinceRegexMap.containsKey(entry)) { + name = provinceRegexMap.get(entry); + } + break; + case "county": + if (this.countyRegexMap.containsKey(entry)) { + name = countyRegexMap.get(entry); + } + break; + } + name = "(^|[^\\p{L}\\p{Nd}])" + name.replace(", the", "") + "([^\\p{L}\\p{Nd}]|$)"; + Pattern regex = Pattern.compile(name, Pattern.CASE_INSENSITIVE | Pattern.DOTALL); + Matcher rs = regex.matcher(docText); + String code = entry.toLowerCase(); + code = code.trim().replace("", ""); + boolean found = false; + while (rs.find()) { + found = true; + Integer start = rs.start(); + String hit = rs.group().toLowerCase().trim(); + hit = hit.replaceAll("\\.|,|;|\\?|!|\\\\|/|\"|'|=|-|&", ""); + if (mentions.containsKey(code)) { + mentions.get(code).add(start); + } else { + Set<Integer> newset = new HashSet<Integer>(); + newset.add(start); + mentions.put(code, newset); + } + if (!hit.equals("")) { + if (this.nameCodesMap.containsKey(hit)) { + nameCodesMap.get(hit).add(code); + } else { + HashSet<String> newset = new HashSet<String>(); + newset.add(code); + nameCodesMap.put(hit, newset); + } + } + + } + if (found) { + hitsRef.add(code); + + } + } + + } catch (Exception ex) { + LOGGER.error(ex); + ex.printStackTrace(); + + } + + return mentions; + } + + private void getContextFromFile(File countryContextFile) { + if (this.adminBoundaryData != null && !this.adminBoundaryData.isEmpty()) { + return; + } + + BufferedReader reader; + try { + reader = new BufferedReader(new FileReader(countryContextFile)); + String line = ""; + int lineNum = 0; + while ((line = reader.readLine()) != null) { + String[] values = line.split("\t"); + if (lineNum == 0) { + lineNum++; + continue; + //skip column name headers + } + if (values.length == 9) { + AdminBoundary entry = new AdminBoundary( + values[0].toLowerCase().trim().replace("", ""), + values[3].toLowerCase().trim(), + values[1].toLowerCase().trim(), + values[4].toLowerCase().trim(), + values[2].toLowerCase().trim(), + values[5].toLowerCase().trim(), + values[6].toLowerCase().trim(), + values[7].toLowerCase().trim(), + values[8].toLowerCase().trim()); + this.adminBoundaryData.add(entry); + } else { + throw new IllegalArgumentException("Improperly formatted file"); + } + + } + reader.close(); + } catch (IOException ex) { + LOGGER.error(ex); + } + + loadMaps(this.adminBoundaryData); + + } + + private void loadMaps(List<AdminBoundary> boundaries) { + for (AdminBoundary adm : boundaries) { + if (!adm.getCountryCode().equals("null")) { + countryMap.put(adm.getCountryCode(), adm.getCountryName()); + if (countryRegexMap.containsKey(adm.getCountryCode())) { + String currentRegex = countryRegexMap.get(adm.getCountryCode()); + if (currentRegex.length() > adm.getCountryRegex().length()) { + // the longest one wins if they are not all the same for each entry in the file + countryRegexMap.put(adm.getCountryCode(), currentRegex); + }//else do nothing + } else { + countryRegexMap.put(adm.getCountryCode(), adm.getCountryRegex()); + } + + if (!adm.getProvCode().equals("null")) { + Map<String, String> provs = provMap.get(adm.getCountryCode()); + if (provs == null) { + provs = new HashMap<>(); + } + //if (!provs.containsKey(adm.getProvCode())) { + provs.put(adm.getCountryCode() + "." + adm.getProvCode(), adm.getProvinceName()); + provMap.put(adm.getCountryCode(), provs); + // } + + if (!adm.getCountyCode().toLowerCase().equals("no_data_found") && !adm.getCountyName().toLowerCase().equals("no_data_found")) { + Map<String, String> counties = countyMap.get(adm.getCountryCode() + "." + adm.getProvCode()); + if (counties == null) { + counties = new HashMap<>(); + } // if (!counties.containsKey(adm.getCountyCode())) { + String countyid = adm.getCountryCode() + "." + adm.getProvCode() + "." + adm.getCountyCode(); + counties.put(countyid, adm.getCountyName()); + countyMap.put(adm.getCountryCode() + "." + adm.getProvCode(), counties); + // } + + } + + } + } + } + fillProvRegexMap(); + fillCountyRegexMap(); + } + + private void fillProvRegexMap() { + this.provinceRegexMap = new HashMap<>(); + // this.adminBoundaryData + for (AdminBoundary adm : adminBoundaryData) { + + if (provinceRegexMap.containsKey(adm.getProvCode())) { + String currentRegex = provinceRegexMap.get(adm.getProvCode()); + if (currentRegex.length() > adm.getProvinceRegex().length()) { + // the longest one wins if they are not all the same for each entry in the file + provinceRegexMap.put(adm.getProvCode(), currentRegex); + }//else do nothing + } else { + provinceRegexMap.put(adm.getProvCode(), adm.getProvinceRegex()); + } + } + } + + private void fillCountyRegexMap() { + this.countyRegexMap = new HashMap<>(); + // this.adminBoundaryData + for (AdminBoundary adm : adminBoundaryData) { + + if (countyRegexMap.containsKey(adm.getCountyCode())) { + String currentRegex = countyRegexMap.get(adm.getCountyCode()); + if (currentRegex.length() > adm.getCountyRegex().length()) { + // the longest one wins if they are not all the same for each entry in the file + countyRegexMap.put(adm.getCountyCode(), currentRegex); + }//else do nothing + } else { + countyRegexMap.put(adm.getCountyCode(), adm.getCountyRegex()); + } + } + + } + +} http://git-wip-us.apache.org/repos/asf/opennlp-addons/blob/9adc2525/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryContextEntry.java ---------------------------------------------------------------------- diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryContextEntry.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryContextEntry.java index 324bd0f..3af4a58 100644 --- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryContextEntry.java +++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryContextEntry.java @@ -1,118 +1,120 @@ -/* - * Copyright 2013 The Apache Software Foundation. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package opennlp.addons.geoentitylinker; - -import java.util.Objects; - -/** - *Stores a tuple from the opennlp.geoentitylinker.countrycontext.txt file, which is used to find country mentions in document text. - * - */ -public class CountryContextEntry { - /* - * rc,cc1, full_name_nd_ro,dsg - */ - - private String rc; - private String cc1; - private String full_name_nd_ro; - private String dsg; - private String provCode; - public CountryContextEntry() { - } - - public CountryContextEntry(String rc, String cc1, String full_name_nd_ro, String dsg) { - this.rc = rc; - this.cc1 = cc1; - this.full_name_nd_ro = full_name_nd_ro; - this.dsg = dsg; - } - - public String getProvCode() { - return provCode; - } - - public void setProvCode(String provCode) { - this.provCode = provCode; - } - - public String getRc() { - return rc; - } - - public void setRc(String rc) { - this.rc = rc; - } - - public String getCc1() { - return cc1; - } - - public void setCc1(String cc1) { - this.cc1 = cc1; - } - - public String getFull_name_nd_ro() { - return full_name_nd_ro; - } - - public void setFull_name_nd_ro(String full_name_nd_ro) { - this.full_name_nd_ro = full_name_nd_ro; - } - - public String getDsg() { - return dsg; - } - - public void setDsg(String dsg) { - this.dsg = dsg; - } - - @Override - public int hashCode() { - int hash = 7; - hash = 17 * hash + Objects.hashCode(this.rc); - hash = 17 * hash + Objects.hashCode(this.cc1); - hash = 17 * hash + Objects.hashCode(this.full_name_nd_ro); - hash = 17 * hash + Objects.hashCode(this.dsg); - return hash; - } - - @Override - public boolean equals(Object obj) { - if (obj == null) { - return false; - } - if (getClass() != obj.getClass()) { - return false; - } - final CountryContextEntry other = (CountryContextEntry) obj; - if (!Objects.equals(this.rc, other.rc)) { - return false; - } - if (!Objects.equals(this.cc1, other.cc1)) { - return false; - } - if (!Objects.equals(this.full_name_nd_ro, other.full_name_nd_ro)) { - return false; - } - if (!Objects.equals(this.dsg, other.dsg)) { - return false; - } - return true; - } - -} +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.addons.geoentitylinker; + +import java.util.Objects; + +/** + *Stores a tuple from the opennlp.geoentitylinker.countrycontext.txt file, which is used to find country mentions in document text. + * + */ +public class CountryContextEntry { + /* + * rc,cc1, full_name_nd_ro,dsg + */ + + private String rc; + private String cc1; + private String full_name_nd_ro; + private String dsg; + private String provCode; + public CountryContextEntry() { + } + + public CountryContextEntry(String rc, String cc1, String full_name_nd_ro, String dsg) { + this.rc = rc; + this.cc1 = cc1; + this.full_name_nd_ro = full_name_nd_ro; + this.dsg = dsg; + } + + public String getProvCode() { + return provCode; + } + + public void setProvCode(String provCode) { + this.provCode = provCode; + } + + public String getRc() { + return rc; + } + + public void setRc(String rc) { + this.rc = rc; + } + + public String getCc1() { + return cc1; + } + + public void setCc1(String cc1) { + this.cc1 = cc1; + } + + public String getFull_name_nd_ro() { + return full_name_nd_ro; + } + + public void setFull_name_nd_ro(String full_name_nd_ro) { + this.full_name_nd_ro = full_name_nd_ro; + } + + public String getDsg() { + return dsg; + } + + public void setDsg(String dsg) { + this.dsg = dsg; + } + + @Override + public int hashCode() { + int hash = 7; + hash = 17 * hash + Objects.hashCode(this.rc); + hash = 17 * hash + Objects.hashCode(this.cc1); + hash = 17 * hash + Objects.hashCode(this.full_name_nd_ro); + hash = 17 * hash + Objects.hashCode(this.dsg); + return hash; + } + + @Override + public boolean equals(Object obj) { + if (obj == null) { + return false; + } + if (getClass() != obj.getClass()) { + return false; + } + final CountryContextEntry other = (CountryContextEntry) obj; + if (!Objects.equals(this.rc, other.rc)) { + return false; + } + if (!Objects.equals(this.cc1, other.cc1)) { + return false; + } + if (!Objects.equals(this.full_name_nd_ro, other.full_name_nd_ro)) { + return false; + } + if (!Objects.equals(this.dsg, other.dsg)) { + return false; + } + return true; + } + +} http://git-wip-us.apache.org/repos/asf/opennlp-addons/blob/9adc2525/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerEntry.java ---------------------------------------------------------------------- diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerEntry.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerEntry.java index 6f3ac87..28949a8 100644 --- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerEntry.java +++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerEntry.java @@ -1,182 +1,184 @@ -/* - * Copyright 2013 The Apache Software Foundation. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package opennlp.addons.geoentitylinker; - -import java.util.HashMap; -import java.util.Map; -import java.util.Objects; -import opennlp.tools.entitylinker.BaseLink; - -/** - * - * Stores a minimal amount of information from a geographic placenames gazateer - */ -public class GazetteerEntry extends BaseLink { - - private Double latitude; - private Double longitude; - private String source; - private String indexID; - private Map<String, String> indexData = new HashMap<>(); - private String countryCode; - private String provinceCode; - private String hierarchy; - - /** - * returns the id from the lucene document - * - * @return - */ - public String getIndexID() { - return indexID; - } - /* - * sets the id from the lucene document - */ - - public void setIndexID(String indexID) { - this.indexID = indexID; - } - - /** - * returns the latitude from the gazateer - * - * @return - */ - public Double getLatitude() { - return latitude; - } - - /** - * sets the latitude from the gazateer - * - */ - public void setLatitude(Double latitude) { - this.latitude = latitude; - } - - /** - * returns the longitude from the gaz - * - * @return - */ - public Double getLongitude() { - return longitude; - } - - /** - * sets the longitude from the gaz - * - * @param longitude - */ - public void setLongitude(Double longitude) { - this.longitude = longitude; - } - - /** - * returns the source of the gazateer data - * - * @return - */ - public String getSource() { - return source; - } - - /** - * sets the source (the source of the gazateer data) - * - * @param source - */ - public void setSource(String source) { - this.source = source; - } - - /** - * Returns all the other fields in the gazateer in the form of a map - * - * @return - */ - public Map<String, String> getIndexData() { - return indexData; - } - - /** - * sets the other fields in the gazeteer in the form of a map - * - * @param indexData stores all fields in the index as fieldname:value - */ - public void setIndexData(Map<String, String> indexData) { - this.indexData = indexData; - } - - @Override - public String toString() { - - return super.toString() + "\n\t\tGazateerEntry\n" + "\t\tlatitude=" + latitude + ", \n\t\tlongitude=" + longitude + ", \n\t\tsource=" + source + ", \n\t\tindexID=" + indexID + ",\n\t\tindexData=" + indexData + "\n"; - } - - @Override - public int hashCode() { - int hash = 5; - hash = 71 * hash + Objects.hashCode(this.source); - hash = 71 * hash + Objects.hashCode(this.indexID); - return hash; - } - - @Override - public boolean equals(Object obj) { - if (obj == null) { - return false; - } - if (getClass() != obj.getClass()) { - return false; - } - final GazetteerEntry other = (GazetteerEntry) obj; - if (!Objects.equals(this.source, other.source)) { - return false; - } - if (!Objects.equals(this.indexID, other.indexID)) { - return false; - } - return true; - } - - - public String getCountryCode() { - return countryCode; - } - - public void setCountryCode(String countryCode) { - this.countryCode = countryCode; - } - - public String getProvinceCode() { - return provinceCode; - } - - public void setProvinceCode(String provinceCode) { - this.provinceCode = provinceCode; - } - - public String getHierarchy() { - return hierarchy; - } - - public void setHierarchy(String hierarchy) { - this.hierarchy = hierarchy; - } - -} +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.addons.geoentitylinker; + +import java.util.HashMap; +import java.util.Map; +import java.util.Objects; +import opennlp.tools.entitylinker.BaseLink; + +/** + * + * Stores a minimal amount of information from a geographic placenames gazateer + */ +public class GazetteerEntry extends BaseLink { + + private Double latitude; + private Double longitude; + private String source; + private String indexID; + private Map<String, String> indexData = new HashMap<>(); + private String countryCode; + private String provinceCode; + private String hierarchy; + + /** + * returns the id from the lucene document + * + * @return + */ + public String getIndexID() { + return indexID; + } + /* + * sets the id from the lucene document + */ + + public void setIndexID(String indexID) { + this.indexID = indexID; + } + + /** + * returns the latitude from the gazateer + * + * @return + */ + public Double getLatitude() { + return latitude; + } + + /** + * sets the latitude from the gazateer + * + */ + public void setLatitude(Double latitude) { + this.latitude = latitude; + } + + /** + * returns the longitude from the gaz + * + * @return + */ + public Double getLongitude() { + return longitude; + } + + /** + * sets the longitude from the gaz + * + * @param longitude + */ + public void setLongitude(Double longitude) { + this.longitude = longitude; + } + + /** + * returns the source of the gazateer data + * + * @return + */ + public String getSource() { + return source; + } + + /** + * sets the source (the source of the gazateer data) + * + * @param source + */ + public void setSource(String source) { + this.source = source; + } + + /** + * Returns all the other fields in the gazateer in the form of a map + * + * @return + */ + public Map<String, String> getIndexData() { + return indexData; + } + + /** + * sets the other fields in the gazeteer in the form of a map + * + * @param indexData stores all fields in the index as fieldname:value + */ + public void setIndexData(Map<String, String> indexData) { + this.indexData = indexData; + } + + @Override + public String toString() { + + return super.toString() + "\n\t\tGazateerEntry\n" + "\t\tlatitude=" + latitude + ", \n\t\tlongitude=" + longitude + ", \n\t\tsource=" + source + ", \n\t\tindexID=" + indexID + ",\n\t\tindexData=" + indexData + "\n"; + } + + @Override + public int hashCode() { + int hash = 5; + hash = 71 * hash + Objects.hashCode(this.source); + hash = 71 * hash + Objects.hashCode(this.indexID); + return hash; + } + + @Override + public boolean equals(Object obj) { + if (obj == null) { + return false; + } + if (getClass() != obj.getClass()) { + return false; + } + final GazetteerEntry other = (GazetteerEntry) obj; + if (!Objects.equals(this.source, other.source)) { + return false; + } + if (!Objects.equals(this.indexID, other.indexID)) { + return false; + } + return true; + } + + + public String getCountryCode() { + return countryCode; + } + + public void setCountryCode(String countryCode) { + this.countryCode = countryCode; + } + + public String getProvinceCode() { + return provinceCode; + } + + public void setProvinceCode(String provinceCode) { + this.provinceCode = provinceCode; + } + + public String getHierarchy() { + return hierarchy; + } + + public void setHierarchy(String hierarchy) { + this.hierarchy = hierarchy; + } + +} http://git-wip-us.apache.org/repos/asf/opennlp-addons/blob/9adc2525/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearchCache.java ---------------------------------------------------------------------- diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearchCache.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearchCache.java index ac5b01e..840af77 100644 --- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearchCache.java +++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearchCache.java @@ -1,49 +1,51 @@ -/* - * Copyright 2013 The Apache Software Foundation. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package opennlp.addons.geoentitylinker; - -import java.util.ArrayList; -import java.util.HashMap; -import java.util.Map; - -/** - * - * Caches gazateer query results statically. Clears itself if more than 10000 results are cached. - */ -public class GazetteerSearchCache { - - private static Map<String, ArrayList<GazetteerEntry>> gazCache = new HashMap<>(); - -/** - * returns the cached entries. Returns null if the query does not exist in the cache - * @param searchString - * @return - */ - public static synchronized ArrayList<GazetteerEntry> get(String searchString) { - return gazCache.get(searchString); - } - - public static synchronized void put(String searchString, ArrayList<GazetteerEntry> hits) { - if (gazCache.size() > 10000) { - gazCache.clear(); - } - if (!gazCache.containsKey(searchString)) { - gazCache.put(searchString, hits); - } - } - - -} +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.addons.geoentitylinker; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Map; + +/** + * + * Caches gazateer query results statically. Clears itself if more than 10000 results are cached. + */ +public class GazetteerSearchCache { + + private static Map<String, ArrayList<GazetteerEntry>> gazCache = new HashMap<>(); + +/** + * returns the cached entries. Returns null if the query does not exist in the cache + * @param searchString + * @return + */ + public static synchronized ArrayList<GazetteerEntry> get(String searchString) { + return gazCache.get(searchString); + } + + public static synchronized void put(String searchString, ArrayList<GazetteerEntry> hits) { + if (gazCache.size() > 10000) { + gazCache.clear(); + } + if (!gazCache.containsKey(searchString)) { + gazCache.put(searchString, hits); + } + } + + +}
