Author: markg
Date: Thu Jun 9 20:09:01 2016
New Revision: 1747587
URL: http://svn.apache.org/viewvc?rev=1747587&view=rev
Log:
OPENNLP-756
OPENNLP-750
Improved Regex handling in scorers and country context generator.
Upgraded Lucene dependency to 6.0.0
Fixed ProvinceProximityScorer and CountryProximityScorer
Fixed the bug in the number of rows returned
Added regex support for Country and Province in the country context file, and added
column headers for easier editing in tools such as Excel
Cleaned up some other code; will post the new CountryContext file to OPENNLP-756
All indexes should be rebuilt because of the new country context file format
produced by the GazetteerIndexer class
Modified:
opennlp/addons/geoentitylinker-addon/pom.xml
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundary.java
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContext.java
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContextGenerator.java
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GazetteerIndexer.java
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeonamesProcessor.java
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/RegionProcessor.java
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/USGSProcessor.java
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/CountryProximityScorer.java
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ProvinceProximityScorer.java
Modified: opennlp/addons/geoentitylinker-addon/pom.xml
URL:
http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/pom.xml?rev=1747587&r1=1747586&r2=1747587&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/pom.xml (original)
+++ opennlp/addons/geoentitylinker-addon/pom.xml Thu Jun 9 20:09:01 2016
@@ -23,7 +23,7 @@
<parent>
<groupId>org.apache.opennlp</groupId>
<artifactId>opennlp</artifactId>
- <version>1.6.0-SNAPSHOT</version>
+ <version>1.6.0</version>
<relativePath>../opennlp/pom.xml</relativePath>
</parent>
@@ -62,31 +62,33 @@
<artifactId>log4j</artifactId>
<version>1.2.16</version>
</dependency>
- <dependency>
- <groupId>org.apache.lucene</groupId>
- <artifactId>lucene-spatial</artifactId>
- <version>4.8.0</version>
- </dependency>
+
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-core</artifactId>
- <version>4.8.0</version>
+ <version>6.0.0</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-analyzers-common</artifactId>
- <version>4.8.0</version>
+ <version>6.0.0</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-queryparser</artifactId>
- <version>4.8.0</version>
+ <version>6.0.0</version>
</dependency>
<dependency>
<groupId>org.apache.opennlp</groupId>
<artifactId>opennlp-tools</artifactId>
<version>1.6.0</version>
</dependency>
+ <dependency>
+ <groupId>com.spatial4j</groupId>
+ <artifactId>spatial4j</artifactId>
+ <version>0.4.1</version>
+ <type>jar</type>
+ </dependency>
</dependencies>
</project>
Modified:
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundary.java
URL:
http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundary.java?rev=1747587&r1=1747586&r2=1747587&view=diff
==============================================================================
---
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundary.java
(original)
+++
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundary.java
Thu Jun 9 20:09:01 2016
@@ -30,23 +30,22 @@ public class AdminBoundary {
private final String countryName;
private final String countyName;
private final String countyCode;
+ private final String countryRegex;
+ private final String provinceRegex;
+ private final String countyRegex;
- public AdminBoundary(String cc, String ac, String pname, String countryName)
{
- this.countryCode = cc;
- this.provinceCode = ac;
- this.provinceName = pname;
- this.countryName = countryName;
- this.countyCode = NO_DATA_FOUND_VALUE;
- this.countyName = NO_DATA_FOUND_VALUE;
- }
- public AdminBoundary(String countryCode, String countryName, String
provinceCode, String provinceName, String countyCode, String countyName) {
+ public AdminBoundary(String countryCode, String countryName, String
provinceCode, String provinceName, String countyCode, String countyName,
+ String countryRegex, String provinceRegex, String countyRegex) {
this.countryCode = countryCode;
this.provinceCode = provinceCode;
this.provinceName = provinceName;
this.countryName = countryName;
this.countyName = countyName.equals("") ? NO_DATA_FOUND_VALUE : countyName;
this.countyCode = countyCode.equals("") ? NO_DATA_FOUND_VALUE : countyCode;
+ this.countryRegex = countryRegex;
+ this.provinceRegex = provinceRegex;
+ this.countyRegex = countyRegex;
}
public String getCountryCode() {
@@ -120,4 +119,20 @@ public class AdminBoundary {
return true;
}
+ public String getProvinceCode() {
+ return provinceCode;
+ }
+
+ public String getCountryRegex() {
+ return countryRegex;
+ }
+
+ public String getProvinceRegex() {
+ return provinceRegex;
+ }
+
+ public String getCountyRegex() {
+ return countyRegex;
+ }
+
}
Modified:
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContext.java
URL:
http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContext.java?rev=1747587&r1=1747586&r2=1747587&view=diff
==============================================================================
---
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContext.java
(original)
+++
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContext.java
Thu Jun 9 20:09:01 2016
@@ -29,20 +29,24 @@ public class AdminBoundaryContext {
private final Set<String> provHits;
private final Set<String> countyHits;
private final Map<String, String> countryRefMap;
+ private final Map<String, String> countryRegexMap;
+ private final Map<String, String> countyRegexMap;
+ private final Map<String, String> provinceRegexMap;
private final Map<String, Map<String, String>> provRefMap;
private final Map<String, Map<String, String>> countyRefMap;
private final Set<String> whereClauses;
private final Map<String, Set<String>> nameCodesMap;
public AdminBoundaryContext(Map<String, Set<Integer>> countryMentions,
- Map<String, Set<Integer>> provMentions,
- Map<String, Set<Integer>> countyMentions,
- Set<String> countryHits,
- Set<String> provHits,
- Set<String> countyHits,
- Map<String, String> countryRefMap,
- Map<String, Map<String, String>> provRefMap,
- Map<String, Map<String, String>> countyRefMap, Map<String,
Set<String>> nameCodesMap) {
+ Map<String, Set<Integer>> provMentions,
+ Map<String, Set<Integer>> countyMentions,
+ Set<String> countryHits,
+ Set<String> provHits,
+ Set<String> countyHits,
+ Map<String, String> countryRefMap,
+ Map<String, Map<String, String>> provRefMap,
+ Map<String, Map<String, String>> countyRefMap, Map<String, Set<String>>
nameCodesMap, Map<String, String> countryRegexMap, Map<String, String>
provinceRegexMap,
+ Map<String, String> countyRegexMap) {
this.countryMentions = countryMentions;
this.provMentions = provMentions;
this.countyMentions = countyMentions;
@@ -54,6 +58,9 @@ public class AdminBoundaryContext {
this.countyRefMap = countyRefMap;
this.whereClauses = setWhereClauses();
this.nameCodesMap = nameCodesMap;
+ this.countryRegexMap = countryRegexMap;
+ this.provinceRegexMap = provinceRegexMap;
+ this.countyRegexMap = countyRegexMap;
}
public Map<String, Set<String>> getNameCodesMap() {
@@ -131,4 +138,16 @@ public class AdminBoundaryContext {
return clauses;
}
+ public Map<String, String> getCountryRegexMap() {
+ return countryRegexMap;
+ }
+
+ public Map<String, String> getCountyRegexMap() {
+ return countyRegexMap;
+ }
+
+ public Map<String, String> getProvinceRegexMap() {
+ return provinceRegexMap;
+ }
+
}
Modified:
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContextGenerator.java
URL:
http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContextGenerator.java?rev=1747587&r1=1747586&r2=1747587&view=diff
==============================================================================
---
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContextGenerator.java
(original)
+++
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContextGenerator.java
Thu Jun 9 20:09:01 2016
@@ -43,9 +43,14 @@ public class AdminBoundaryContextGenerat
private List<CountryContextEntry> countrydata;
private Map<String, Set<String>> nameCodesMap = new HashMap<>();
private Map<String, Set<Integer>> countryMentions = new HashMap<>();
+
+ Map<String, String> countryRegexMap = new HashMap<>();
+ Map<String, String> provinceRegexMap = new HashMap<>();
+ Map<String, String> countyRegexMap = new HashMap<>();
+
private Set<CountryContextEntry> countryHits = new HashSet<>();
private EntityLinkerProperties properties;
- private List<AdminBoundary> adminBoundaryData;
+ private List<AdminBoundary> adminBoundaryData= new ArrayList<>();
private Set<AdminBoundary> adminBoundaryHits = new HashSet<>();
private AdminBoundaryContext context;
@@ -70,9 +75,8 @@ public class AdminBoundaryContextGenerat
public static void main(String[] args) {
try {
- AdminBoundaryContextGenerator countryContext = new
AdminBoundaryContextGenerator(new EntityLinkerProperties(new
File("c:\\temp\\entitylinker.properties")));
- GeoEntityLinker linker = new GeoEntityLinker();
- linker.init(new EntityLinkerProperties(new
File("c:\\temp\\entitylinker.properties")));
+ AdminBoundaryContextGenerator countryContext
+ = new AdminBoundaryContextGenerator(new EntityLinkerProperties(new
File("C:\\Temp\\gaz_data\\newCountryContextfile.txt")));
AdminBoundaryContext c = countryContext.process("This artcle is about
fairfax county virginia in the north of florida in the united states. It is
also about Moscow and atlanta. Hillsborough county florida is a nice place.
Eastern Africa people are cool.");
System.out.println(c);
@@ -93,7 +97,7 @@ public class AdminBoundaryContextGenerat
throw new IOException("missing country context file");
}
//countrydata = getCountryContextFromFile(countryContextFile);
- adminBoundaryData = getContextFromFile(countryContextFile);
+ getContextFromFile(countryContextFile);
if (adminBoundaryData.isEmpty()) {
throw new IOException("missing country context data");
}
@@ -150,17 +154,17 @@ public class AdminBoundaryContextGenerat
try {
reset();
- Map<String, Set<Integer>> countryhitMap = regexfind(text, countryMap,
countryHitSet);
+ Map<String, Set<Integer>> countryhitMap = regexfind(text, countryMap,
countryHitSet, "country");
if (!countryhitMap.isEmpty()) {
for (String cc : countryhitMap.keySet()) {
Map<String, String> provsForCc = provMap.get(cc);
if (provsForCc != null) {
- provMentions.putAll(regexfind(text, provsForCc, provHits));
+ provMentions.putAll(regexfind(text, provsForCc, provHits,
"province"));
if (provMentions != null) {
for (String prov : provMentions.keySet()) {
Map<String, String> get = countyMap.get(prov);
if (get != null) {
- countyMentions.putAll(regexfind(text, get, countyHits));
+ countyMentions.putAll(regexfind(text, get, countyHits,
"province"));
}
}
}
@@ -169,7 +173,7 @@ public class AdminBoundaryContextGenerat
} else {
for (Map<String, String> provsForCc : provMap.values()) {
if (provsForCc != null) {
- provMentions = regexfind(text, provsForCc, provHits);
+ provMentions = regexfind(text, provsForCc, provHits, "province");
if (provMentions != null) {
for (String prov : provMentions.keySet()) {
//fake a country hit based on a province hit... this gets fuzzy
@@ -182,7 +186,7 @@ public class AdminBoundaryContextGenerat
}
Map<String, String> get = countyMap.get(prov);
if (get != null) {
- countyMentions = regexfind(text, get, countyHits);
+ countyMentions = regexfind(text, get, countyHits, "oounty");
}
}
}
@@ -199,7 +203,9 @@ public class AdminBoundaryContextGenerat
}
}
- AdminBoundaryContext context = new AdminBoundaryContext(countryhitMap,
provMentions, countyMentions, countryHitSet, provHits, countyHits,
countryRefMap, provMap, countyMap, nameCodesMap);
+ AdminBoundaryContext context
+ = new AdminBoundaryContext(countryhitMap, provMentions,
countyMentions, countryHitSet, provHits, countyHits,
+ countryRefMap, provMap, countyMap, nameCodesMap,
countryRegexMap, provinceRegexMap, countyRegexMap);
return context;
} catch (Exception e) {
@@ -208,7 +214,6 @@ public class AdminBoundaryContextGenerat
return null;
}
-
/**
* discovers indicators of admin boundary data using regex.
*
@@ -218,7 +223,7 @@ public class AdminBoundaryContextGenerat
* @param hitsRef a reference to a set that stores the hits by id
* @return
*/
- private Map<String, Set<Integer>> regexfind(String docText, Map<String,
String> lookupMap, Set<String> hitsRef) {
+ private Map<String, Set<Integer>> regexfind(String docText, Map<String,
String> lookupMap, Set<String> hitsRef, String locationType) {
Map<String, Set<Integer>> mentions = new HashMap<>();
if (lookupMap == null) {
return mentions;
@@ -226,10 +231,29 @@ public class AdminBoundaryContextGenerat
try {
for (String entry : lookupMap.keySet()) {
+
String name = lookupMap.get(entry).toLowerCase();
if (name == null) {
continue;
}
+ switch (locationType) {
+ case "country":
+ if (this.countryRegexMap.containsKey(entry)) {
+ name = countryRegexMap.get(entry);
+ }
+ break;
+
+ case "province":
+ if (this.provinceRegexMap.containsKey(entry)) {
+ name = provinceRegexMap.get(entry);
+ }
+ break;
+ case "county":
+ if (this.countyRegexMap.containsKey(entry)) {
+ name = countyRegexMap.get(entry);
+ }
+ break;
+ }
name = "(^|[^\\p{L}\\p{Nd}])" + name.replace(", the", "") +
"([^\\p{L}\\p{Nd}]|$)";
Pattern regex = Pattern.compile(name, Pattern.CASE_INSENSITIVE |
Pattern.DOTALL);
Matcher rs = regex.matcher(docText);
@@ -274,39 +298,37 @@ public class AdminBoundaryContextGenerat
return mentions;
}
- private List<AdminBoundary> getContextFromFile(File countryContextFile) {
+ private void getContextFromFile(File countryContextFile) {
if (this.adminBoundaryData != null && !this.adminBoundaryData.isEmpty()) {
- return adminBoundaryData;
+ return;
}
- List<AdminBoundary> entries = new ArrayList<>();
+
BufferedReader reader;
try {
reader = new BufferedReader(new FileReader(countryContextFile));
String line = "";
+ int lineNum = 0;
while ((line = reader.readLine()) != null) {
String[] values = line.split("\t");
- int len = values.length;
- if (len < 5 || len > 6) {
- throw new IllegalArgumentException("Improperly formatted file");
+ if (lineNum == 0) {
+ lineNum++;
+ continue;
+ //skip column name headers
}
- if (values.length == 6) {
+ if (values.length == 9) {
AdminBoundary entry = new AdminBoundary(
- values[0].toLowerCase().trim().replace("", ""),
- values[3].toLowerCase().trim(),
- values[1].toLowerCase().trim(),
- values[4].toLowerCase().trim(),
- values[2].toLowerCase().trim(),
- values[5].toLowerCase().trim());
- entries.add(entry);
+ values[0].toLowerCase().trim().replace("", ""),
+ values[3].toLowerCase().trim(),
+ values[1].toLowerCase().trim(),
+ values[4].toLowerCase().trim(),
+ values[2].toLowerCase().trim(),
+ values[5].toLowerCase().trim(),
+ values[6].toLowerCase().trim(),
+ values[7].toLowerCase().trim(),
+ values[8].toLowerCase().trim());
+ this.adminBoundaryData.add(entry);
} else {
- AdminBoundary entry = new AdminBoundary(
- values[0].toLowerCase().trim().replace("", ""),
- values[3].toLowerCase().trim(),
- values[1].toLowerCase().trim(),
- values[4].toLowerCase().trim(),
- values[2].toLowerCase().trim(),
- "");
- entries.add(entry);
+ throw new IllegalArgumentException("Improperly formatted file");
}
}
@@ -314,8 +336,8 @@ public class AdminBoundaryContextGenerat
} catch (IOException ex) {
LOGGER.error(ex);
}
- loadMaps(entries);
- return entries;
+
+ loadMaps(this.adminBoundaryData);
}
@@ -323,6 +345,15 @@ public class AdminBoundaryContextGenerat
for (AdminBoundary adm : boundaries) {
if (!adm.getCountryCode().equals("null")) {
countryMap.put(adm.getCountryCode(), adm.getCountryName());
+ if (countryRegexMap.containsKey(adm.getCountryCode())) {
+ String currentRegex = countryRegexMap.get(adm.getCountryCode());
+ if (currentRegex.length() > adm.getCountryRegex().length()) {
+ // the longest one wins if they are not all the same for each
entry in the file
+ countryRegexMap.put(adm.getCountryCode(), currentRegex);
+ }//else do nothing
+ } else {
+ countryRegexMap.put(adm.getCountryCode(), adm.getCountryRegex());
+ }
if (!adm.getProvCode().equals("null")) {
Map<String, String> provs = provMap.get(adm.getCountryCode());
@@ -349,6 +380,43 @@ public class AdminBoundaryContextGenerat
}
}
}
+ fillProvRegexMap();
+ fillCountyRegexMap();
+ }
+
+ private void fillProvRegexMap() {
+ this.provinceRegexMap = new HashMap<>();
+ // this.adminBoundaryData
+ for (AdminBoundary adm : adminBoundaryData) {
+
+ if (provinceRegexMap.containsKey(adm.getProvCode())) {
+ String currentRegex = provinceRegexMap.get(adm.getProvCode());
+ if (currentRegex.length() > adm.getProvinceRegex().length()) {
+ // the longest one wins if they are not all the same for each entry
in the file
+ provinceRegexMap.put(adm.getProvCode(), currentRegex);
+ }//else do nothing
+ } else {
+ provinceRegexMap.put(adm.getProvCode(), adm.getProvinceRegex());
+ }
+ }
+ }
+
+ private void fillCountyRegexMap() {
+ this.countyRegexMap = new HashMap<>();
+ // this.adminBoundaryData
+ for (AdminBoundary adm : adminBoundaryData) {
+
+ if (countyRegexMap.containsKey(adm.getCountyCode())) {
+ String currentRegex = countyRegexMap.get(adm.getCountyCode());
+ if (currentRegex.length() > adm.getCountyRegex().length()) {
+ // the longest one wins if they are not all the same for each entry
in the file
+ countyRegexMap.put(adm.getCountyCode(), currentRegex);
+ }//else do nothing
+ } else {
+ countyRegexMap.put(adm.getCountyCode(), adm.getCountyRegex());
+ }
+ }
+
}
}
Modified:
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java
URL:
http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java?rev=1747587&r1=1747586&r2=1747587&view=diff
==============================================================================
---
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java
(original)
+++
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java
Thu Jun 9 20:09:01 2016
@@ -17,6 +17,7 @@ package opennlp.addons.geoentitylinker;
import java.io.File;
import java.io.IOException;
+import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
@@ -101,10 +102,10 @@ public class GazetteerSearcher {
* build the search string Sometimes no country context is found. In this
* case the code variables will be empty strings
*/
- String placeNameQueryString = "placename:(" + searchString.toLowerCase()
+ ")" + "AND " + whereClause;
+ String placeNameQueryString = "placename:(" + searchString.toLowerCase()
+ ") " + "AND " + whereClause;
if (searchString.trim().contains(" ") && useHierarchyField) {
placeNameQueryString = "(placename:(" + searchString.toLowerCase() +
") AND hierarchy:(" + formatForHierarchy(searchString) + "))"
- + " AND " + whereClause;
+ + " AND " + whereClause;
}
/**
@@ -118,7 +119,7 @@ public class GazetteerSearcher {
/**
* search the placename
*/
- QueryParser parser = new QueryParser(Version.LUCENE_48,
placeNameQueryString, opennlpAnalyzer);
+ QueryParser parser = new QueryParser(placeNameQueryString,
opennlpAnalyzer);
Query q = parser.parse(placeNameQueryString);
//Filter filter = new QueryWrapperFilter(new
QueryParser(Version.LUCENE_48, whereClause,
opennlpAnalyzer).parse(whereClause));
@@ -160,7 +161,7 @@ public class GazetteerSearcher {
for (int idx = 0; idx < fields.size(); idx++) {
entry.getIndexData().put(fields.get(idx).name(),
d.get(fields.get(idx).name()));
}
-
+
/**
* only want hits above the levenstein thresh. This should be a low
* thresh due to the use of the hierarchy field in the index
@@ -178,7 +179,7 @@ public class GazetteerSearcher {
}
//}
}
-
+
} catch (IOException | ParseException ex) {
LOGGER.error(ex);
}
@@ -186,8 +187,6 @@ public class GazetteerSearcher {
return linkedData;
}
-
-
/**
* Replaces any noise chars with a space, and depending on configuration adds
* double quotes to the string
@@ -215,12 +214,12 @@ public class GazetteerSearcher {
}
- opennlpIndex = new MMapDirectory(new File(indexloc));
+ opennlpIndex = new MMapDirectory(Paths.get(indexloc));
opennlpReader = DirectoryReader.open(opennlpIndex);
opennlpSearcher = new IndexSearcher(opennlpReader);
opennlpAnalyzer
- = //new StandardAnalyzer(Version.LUCENE_48, new
CharArraySet(Version.LUCENE_48, new ArrayList(), true));
- new StandardAnalyzer(Version.LUCENE_48, new
CharArraySet(Version.LUCENE_48, new ArrayList(), true));
+ = //new StandardAnalyzer(Version.LUCENE_48, new
CharArraySet(Version.LUCENE_48, new ArrayList(), true));
+ new StandardAnalyzer(new CharArraySet(new ArrayList(), true));
Map<String, Analyzer> analyMap = new HashMap<>();
analyMap.put("countrycode", new KeywordAnalyzer());
@@ -230,7 +229,7 @@ public class GazetteerSearcher {
analyMap.put("gazsource", new KeywordAnalyzer());
opennlpAnalyzer
- = new PerFieldAnalyzerWrapper(opennlpAnalyzer, analyMap);
+ = new PerFieldAnalyzerWrapper(opennlpAnalyzer, analyMap);
String cutoff =
properties.getProperty("opennlp.geoentitylinker.gaz.lucenescore.min",
String.valueOf(scoreCutoff));
String usehierarchy =
properties.getProperty("opennlp.geoentitylinker.gaz.hierarchyfield",
String.valueOf("0"));
Modified:
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
URL:
http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java?rev=1747587&r1=1747586&r2=1747587&view=diff
==============================================================================
---
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
(original)
+++
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
Thu Jun 9 20:09:01 2016
@@ -25,6 +25,7 @@ import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
+import java.util.Iterator;
import java.util.List;
import opennlp.addons.geoentitylinker.scoring.PlacetypeScorer;
import opennlp.addons.geoentitylinker.scoring.ProvinceProximityScorer;
@@ -59,10 +60,10 @@ public class GeoEntityLinker implements
AdminBoundaryContext context = countryContext.getContext(doctext);
for (int s = 0; s < sentences.length; s++) {
Span[] names = namesBySentence[s];
-
+
Span[] tokenSpans = tokensBySentence[s];
String[] tokens = Span.spansToStrings(tokenSpans,
sentences[s].getCoveredText(doctext));
-
+
String[] matches = Span.spansToStrings(names, tokens);
for (int i = 0; i < matches.length; i++) {
@@ -140,19 +141,30 @@ public class GeoEntityLinker implements
double sumo2 = 0d;
for (String object : o1scoreMap.keySet()) {
if (object.equals("typescore")
- || object.equals("countrycontext")
- || object.equals("placenamedicecoef")
- || object.equals("geohashbin")
- || object.equals("normlucene")) {
+ || object.equals("countrycontext")
+ || object.equals("placenamedicecoef")
+ || object.equals("provincecontext")
+ || object.equals("geohashbin")
+ || object.equals("normlucene")) {
sumo1 += o1scoreMap.get(object);
sumo2 += o2scoreMap.get(object);
}
}
return Double.compare(sumo1,
- sumo2);
+ sumo2);
}
}));
+ //prune the list to topN
+ Iterator iterator = linkedData.iterator();
+ int n = 0;
+ while (iterator.hasNext()) {
+ if (n >= topN) {
+ iterator.remove();
+ }
+ iterator.next();
+ n++;
+ }
}
return spans;
@@ -186,26 +198,26 @@ public class GeoEntityLinker implements
@Override
public void init(EntityLinkerProperties properties) throws IOException {
-
- this.linkerProperties = properties;
- countryContext = new
AdminBoundaryContextGenerator(this.linkerProperties);
- gazateerSearcher = new GazetteerSearcher(this.linkerProperties);
- String rowsRetStr =
this.linkerProperties.getProperty("opennlp.geoentitylinker.gaz.rowsreturned",
"2");
- Integer rws = 2;
- try {
- rws = Integer.valueOf(rowsRetStr);
- } catch (NumberFormatException e) {
- rws = 2;
- }
- topN = rws;
- loadScorers();
-
+
+ this.linkerProperties = properties;
+ countryContext = new AdminBoundaryContextGenerator(this.linkerProperties);
+ gazateerSearcher = new GazetteerSearcher(this.linkerProperties);
+ String rowsRetStr =
this.linkerProperties.getProperty("opennlp.geoentitylinker.gaz.rowsreturned",
"2");
+ Integer rws = 2;
+ try {
+ rws = Integer.valueOf(rowsRetStr);
+ } catch (NumberFormatException e) {
+ rws = 2;
+ }
+ topN = rws;
+ loadScorers();
+
}
@Override
- public List<LinkedSpan> find(String doctext, Span[] sentences, Span[][]
tokensBySentence,
- Span[][] namesBySentence, int sentenceIndex) {
+ public List<LinkedSpan> find(String doctext, Span[] sentences, Span[][]
tokensBySentence,
+ Span[][] namesBySentence, int sentenceIndex) {
throw new UnsupportedOperationException("The GeoEntityLinker requires the
entire document "
- + "for proper scoring. This method is unsupported");
+ + "for proper scoring. This method is unsupported");
}
}
Modified:
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GazetteerIndexer.java
URL:
http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GazetteerIndexer.java?rev=1747587&r1=1747586&r2=1747587&view=diff
==============================================================================
---
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GazetteerIndexer.java
(original)
+++
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GazetteerIndexer.java
Thu Jun 9 20:09:01 2016
@@ -17,6 +17,8 @@ package opennlp.addons.geoentitylinker.i
import java.io.File;
import java.io.FileNotFoundException;
+import java.io.FileWriter;
+import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
@@ -30,7 +32,6 @@ import org.apache.lucene.index.IndexWrit
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.MMapDirectory;
-import org.apache.lucene.util.Version;
/**
* Creates two lucene indexes, geonames and usgs for use in GeoEntityLinker.
@@ -47,15 +48,15 @@ public class GazetteerIndexer {
return;
}
- File geonamesData = new File(args[0]);
- File geoNamesCountryInfo = new File(args[1]);
+ File geonamesData = new File(args[0]);
+ File geoNamesCountryInfo = new File(args[1]);
File geonamesAdmin1CodesASCII = new File(args[2]);
- File usgsDataFile = new File(args[3]);
- File usgsGovUnitsFile = new File(args[4]);
- File outputIndexDir = new File(args[5]);
- File outputCountryContextFile = new File(args[6]);
+ File usgsDataFile = new File(args[3]);
+ File usgsGovUnitsFile = new File(args[4]);
+ File outputIndexDir = new File(args[5]);
+ File outputCountryContextFile = new File(args[6]);
File regionsFile = new File(args[7]);
-
+
try {
GazetteerIndexer i = new GazetteerIndexer();
i.index(geonamesData,
@@ -83,27 +84,27 @@ public class GazetteerIndexer {
public enum GazType implements Separable {
GEONAMES {
- @Override
- public String toString() {
- return "/opennlp_geoentitylinker_geonames_idx";
- }
-
- @Override
- public String getSeparator() {
- return "\t";
- }
- },
+ @Override
+ public String toString() {
+ return "/opennlp_geoentitylinker_geonames_idx";
+ }
+
+ @Override
+ public String getSeparator() {
+ return "\t";
+ }
+ },
USGS {
- @Override
- public String toString() {
- return "/opennlp_geoentitylinker_usgsgaz_idx";
- }
-
- @Override
- public String getSeparator() {
- return "\\|";
- }
- }
+ @Override
+ public String toString() {
+ return "/opennlp_geoentitylinker_usgsgaz_idx";
+ }
+
+ @Override
+ public String getSeparator() {
+ return "\\|";
+ }
+ }
}
/**
@@ -113,7 +114,8 @@ public class GazetteerIndexer {
* 'allCountries.zip'
* @param geoNamesCountryInfo the countryinfo lookup table that can be
* downloaded from here
- * http://download.geonames.org/export/dump/countryinfo.txt
+ * http://download.geonames.org/export/dump/countryInfo.txt You'll need to
+ * copy the page into a file or scrape it
* @param geonamesAdmin1CodesASCII The lookup data for the province names for
* each place found here:
* http://download.geonames.org/export/dump/admin1CodesASCII.txt highlight
the
@@ -138,7 +140,7 @@ public class GazetteerIndexer {
* @throws Exception
*/
public void index(File geonamesData, File geoNamesCountryInfo, File
geonamesAdmin1CodesASCII,
- File usgsDataFile, File usgsGovUnitsFile, File outputIndexDir, File
outputCountryContextFile, File regionsFile) throws Exception {
+ File usgsDataFile, File usgsGovUnitsFile, File outputIndexDir, File
outputCountryContextFile, File regionsFile) throws Exception {
if (!outputIndexDir.isDirectory()) {
throw new IllegalArgumentException("outputIndexDir must be a
directory.");
}
@@ -166,8 +168,8 @@ public class GazetteerIndexer {
}
String indexloc = outputIndexDir.getPath() +
"/opennlp_geoentitylinker_gazetteer";
- Directory index = new MMapDirectory(new File(indexloc));
- Analyzer a = new StandardAnalyzer(Version.LUCENE_48, new
CharArraySet(Version.LUCENE_48, new ArrayList(), true));
+ Directory index = new MMapDirectory(Paths.get(indexloc));
+ Analyzer a = new StandardAnalyzer(new CharArraySet(new ArrayList(), true));
Map<String, Analyzer> analyMap = new HashMap<>();
analyMap.put("countrycode", new KeywordAnalyzer());
@@ -175,13 +177,22 @@ public class GazetteerIndexer {
analyMap.put("loctype", new KeywordAnalyzer());
analyMap.put("countycode", new KeywordAnalyzer());
analyMap.put("gazsource", new KeywordAnalyzer());
-
+
PerFieldAnalyzerWrapper aWrapper
- = new PerFieldAnalyzerWrapper(a, analyMap);
+ = new PerFieldAnalyzerWrapper(a, analyMap);
- IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_48,
aWrapper);
+ IndexWriterConfig config = new IndexWriterConfig(aWrapper);
IndexWriter w = new IndexWriter(index, config);
+
+ //write the column headers for the countryContextFile
+ FileWriter countryContextFileWriter = new
FileWriter(outputCountryContextFile, false);
+ String colNamesForCountryContextFile =
"countrycode\tprovcode\tcountycode\tcountryname\tprovincename\tcountyname\tcountryregex\tprovregex\tcountyregex\n";
+ countryContextFileWriter.write(colNamesForCountryContextFile);
+ countryContextFileWriter.flush();
+ countryContextFileWriter.close();
+
+
USGSProcessor.process(usgsGovUnitsFile, usgsDataFile,
outputCountryContextFile, w);
GeonamesProcessor.process(geoNamesCountryInfo, geonamesAdmin1CodesASCII,
geonamesData, outputCountryContextFile, w);
Modified:
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeonamesProcessor.java
URL:
http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeonamesProcessor.java?rev=1747587&r1=1747586&r2=1747587&view=diff
==============================================================================
---
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeonamesProcessor.java
(original)
+++
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeonamesProcessor.java
Thu Jun 9 20:09:01 2016
@@ -92,7 +92,7 @@ public class GeonamesProcessor {
if (cname == null) {
nullcodes.add(ccode);
}
- AdminBoundary data = new AdminBoundary(ccode, pcode, pname, cname);
+ AdminBoundary data = new AdminBoundary(ccode, cname, pcode, pname,
"NO_DATA_FOUND", "NO_DATA_FOUND", cname, pname, "NO_DATA_FOUND");
// System.out.println(data);
outmap.put(ccode + "." + pcode, data);
@@ -155,7 +155,8 @@ public class GeonamesProcessor {
String province = adm.getProvinceName();
String country = adm.getCountryName();
- String line = adm.getCountryCode() + "\t" + adm.getProvCode() + "\t" +
"" + "\t" + country + "\t" + province + "\t" + "" + "\n";
+ String line = adm.getCountryCode() + "\t" + adm.getProvCode() + "\t" +
"" + "\t" + country + "\t" + province + "\t" + "" + "\t" + "(" + country + ")"
+ "\t"
+ + adm.getProvinceName() + "\t" + adm.getCountyName() + "\n";
writer.write(line);
// System.out.println(line);
Modified:
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/RegionProcessor.java
URL:
http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/RegionProcessor.java?rev=1747587&r1=1747586&r2=1747587&view=diff
==============================================================================
---
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/RegionProcessor.java
(original)
+++
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/RegionProcessor.java
Thu Jun 9 20:09:01 2016
@@ -89,7 +89,8 @@ public class RegionProcessor {
//countrycontext file format
// US KY 131 United States Kentucky Leslie
- ccfileentries.add(id + "\t" + id + "\t" + id + "\t" + placeName + "\t"
+ "NO_DATA_FOUND_VALUE" + "\t" + "NO_DATA_FOUND_VALUE\n");
+ ccfileentries.add(id + "\t" + id + "\t" + id + "\t" + placeName + "\t"
+ "NO_DATA_FOUND" + "\t" + "NO_DATA_FOUND" + "\t" + "("
+ + placeName + ")" + "\t" + "NO_DATA_FOUND" + "\t" +
"NO_DATA_FOUND" + "\n");
if (w != null) {
w.addDocument(doc);
}
Modified:
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/USGSProcessor.java
URL:
http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/USGSProcessor.java?rev=1747587&r1=1747586&r2=1747587&view=diff
==============================================================================
---
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/USGSProcessor.java
(original)
+++
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/USGSProcessor.java
Thu Jun 9 20:09:01 2016
@@ -82,7 +82,13 @@ public class USGSProcessor {
String admincode = values[3];
AdminBoundary get = lookupMap.get(admincode + "." + ccode);
String countyname = "";
+ if (get == null) {
+ System.out.println("null...continuing to index" + " ccode: " + ccode
+ " , admincode: " + admincode + " , placename: " + placeName);
+ continue;
+
+ }
String countyCode = get.getCountyCode();
+
if (!get.getCountyName().equals("NO_DATA_FOUND_VALUE")) {
countyname = get.getCountyName();
}
@@ -125,8 +131,7 @@ public class USGSProcessor {
}
}
-
-
+
for (String state : states.keySet()) {
StateCentroid get = states.get(state);
Document doc = new Document();
@@ -143,8 +148,8 @@ public class USGSProcessor {
doc.add(new StringField("locid", "us_state:" + state, Field.Store.YES));
doc.add(new StringField("gazsource", "usgs", Field.Store.YES));
w.addDocument(doc);
-
- // System.out.println(get.statecode + "," + (get.latSum / get.count) +
"," + (get.longSum / get.count));
+
+ // System.out.println(get.statecode + "," + (get.latSum / get.count) +
"," + (get.longSum / get.count));
}
Document doc = new Document();
doc.add(new TextField("hierarchy", "united states", Field.Store.YES));
@@ -202,7 +207,7 @@ public class USGSProcessor {
String stateName = values[6];
String countryCode = values[7];
String countryName = values[8];
- AdminBoundary adminBoundary = new AdminBoundary(countryCode,
countryName, stateCode, stateName, countyCode, countyName);
+ AdminBoundary adminBoundary = new AdminBoundary(countryCode,
countryName, stateCode, stateName, countyCode, countyName, null, null, null);
outmap.put(stateCode + "." + countyCode, adminBoundary);
// System.out.println(adminBoundary);
@@ -232,7 +237,8 @@ public class USGSProcessor {
* this is the standard format of the country context file... Geonames
* data will have an empty string for the county
*/
- String line = adm.getCountryCode() + "\t" + adm.getProvCode() + "\t" +
adm.getCountyCode() + "\t" + country + "\t" + province + "\t" +
adm.getCountyName() + "\n";
+ String line = adm.getCountryCode() + "\t" + adm.getProvCode() + "\t" +
adm.getCountyCode() + "\t" + country + "\t" + province + "\t" +
adm.getCountyName() + "\t"
+ + "(U\\.S\\.[ $]|U\\.S\\.A\\.[ $]|United States|the US[ $]|a us[
$])" + "\t" + adm.getProvinceName() + "\t" + adm.getCountyName() + "\n";
writer.write(line);
/// System.out.println(line);
Modified:
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/CountryProximityScorer.java
URL:
http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/CountryProximityScorer.java?rev=1747587&r1=1747586&r2=1747587&view=diff
==============================================================================
---
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/CountryProximityScorer.java
(original)
+++
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/CountryProximityScorer.java
Thu Jun 9 20:09:01 2016
@@ -22,6 +22,7 @@ import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
+import java.util.regex.Pattern;
import opennlp.addons.geoentitylinker.AdminBoundaryContext;
import opennlp.tools.entitylinker.EntityLinkerProperties;
import opennlp.tools.entitylinker.BaseLink;
@@ -41,10 +42,12 @@ public class CountryProximityScorer impl
private Map<String, Set<String>> nameCodesMap;
String dominantCode = "";
+ private Map<String, String> regexMap = new HashMap<>();
@Override
public void score(List<LinkedSpan> linkedSpans, String docText, Span[]
sentenceSpans, EntityLinkerProperties properties, AdminBoundaryContext
additionalContext) {
+ regexMap = additionalContext.getCountryRegexMap();
score(linkedSpans, additionalContext.getCountryMentions(),
additionalContext.getNameCodesMap(), docText, sentenceSpans, 1000);
}
@@ -54,20 +57,19 @@ public class CountryProximityScorer impl
* matches. Currently the scoring indicates the probability that the toponym
* is correct based on the country context in the document
*
- * @param linkedData the linked spans, holds the Namefinder results, and
- * the list of BaseLink for each
- * @param countryHits all the country mentions in the document
- * @param nameCodesMap maps a country indicator name to a country code.
Used
- * to determine if the namefinder found the same exact
- * toponym the country context did. If so the score is
- * boosted due to the high probability that the
- * NameFinder actually "rediscovered" a country
- * @param docText the full text of the document...not used in this
- * default implementation
- * @param sentences the sentences that correspond to the doc text.
+ * @param linkedData the linked spans, holds the Namefinder results, and the
+ * list of BaseLink for each
+ * @param countryHits all the country mentions in the document
+ * @param nameCodesMap maps a country indicator name to a country code. Used
+ * to determine if the namefinder found the same exact toponym the country
+ * context did. If so the score is boosted due to the high probability that
+ * the NameFinder actually "rediscovered" a country
+ * @param docText the full text of the document...not used in this default
+ * implementation
+ * @param sentences the sentences that correspond to the doc text.
* @param maxAllowedDist a constant that is used to determine which country
- * mentions, based on proximity within the text, should
- * be used to score the Named Entity.
+ * mentions, based on proximity within the text, should be used to score the
+ * Named Entity.
* @return
*/
public List<LinkedSpan> score(List<LinkedSpan> linkedData, Map<String,
Set<Integer>> countryHits, Map<String, Set<String>> nameCodesMap, String
docText, Span[] sentences, Integer maxAllowedDist) {
@@ -155,11 +157,10 @@ public class CountryProximityScorer impl
score = scoreMap.get(spanCountryCode);
///does the name extracted match a country name?
- if (nameCodesMap.containsKey(link.getItemName().toLowerCase())) {
+ if (nameCodesMap.containsKey(link.getItemName().toLowerCase()) ||
regexMatch(link.getItemName(), link.getItemParentID())) {
//if so, is it the correct country code for that name?
if
(nameCodesMap.get(link.getItemName().toLowerCase()).contains(link.getItemParentID()))
{
//boost the score becuase it is likely that this is the location
in the text, so add 50% to the score or set to 1
- //TODO: make this smarter, and utilize province/state info in the
future to be even more specific
score = (score + .75) > 1.0 ? 1d : (score + .75);
if (link.getItemParentID().equals(dominantCode)) {
@@ -168,17 +169,17 @@ public class CountryProximityScorer impl
}
}
}
-
+
link.getScoreMap().put("countrycontext", score);
}
return span;
}
/**
- * takes a map of distances from the toponym to each country mention and
generates
- * a map of scores for each country code. The map is then correlated to the
- * code of the BaseLink parentid for retrieval. Then the
- * score is added to the overall list.
+ * takes a map of distances from the toponym to each country mention and
+ * generates a map of scores for each country code. The map is then
correlated
+ * to the code of the BaseLink parentid for retrieval. Then the score is
added
+ * to the overall list.
*
* @param distanceMap
* @param sentences
@@ -213,13 +214,22 @@ public class CountryProximityScorer impl
normalizedDistances.add(reverse);
}
-
List<Double> doubles = new ArrayList<Double>(normalizedDistances);
scoreMap.put(key, slidingDistanceAverage(doubles));
}
return scoreMap;
}
+ private boolean regexMatch(String placeName, String countryCode) {
+ if (regexMap.containsKey(countryCode)) {
+ String regexForCountry = regexMap.get(countryCode);
+
+ Pattern p =
Pattern.compile(regexForCountry,Pattern.DOTALL|Pattern.CASE_INSENSITIVE);
+ return p.matcher(placeName.trim()).matches();
+ }
+ return false;
+ }
+
/**
* this method is an attempt to make closer clusters of mentions group
* together to smooth out the average, so one distant outlier does not kill
@@ -259,8 +269,8 @@ public class CountryProximityScorer impl
* range. Used to normalize distances in this class.
*
* @param valueToNormalize the value to place within the new range
- * @param minimum the min of the set to be transposed
- * @param maximum the max of the set to be transposed
+ * @param minimum the min of the set to be transposed
+ * @param maximum the max of the set to be transposed
* @return
*/
private Double normalize(int valueToNormalize, int minimum, int maximum) {
Modified:
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ProvinceProximityScorer.java
URL:
http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ProvinceProximityScorer.java?rev=1747587&r1=1747586&r2=1747587&view=diff
==============================================================================
---
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ProvinceProximityScorer.java
(original)
+++
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ProvinceProximityScorer.java
Thu Jun 9 20:09:01 2016
@@ -22,6 +22,7 @@ import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
+import java.util.regex.Pattern;
import opennlp.addons.geoentitylinker.AdminBoundaryContext;
import opennlp.addons.geoentitylinker.GazetteerEntry;
import opennlp.tools.entitylinker.BaseLink;
@@ -43,10 +44,12 @@ public class ProvinceProximityScorer imp
private Map<String, Set<String>> nameCodesMap;
String dominantCode = "";
+ private Map<String, String> regexMap = new HashMap<>();
@Override
public void score(List<LinkedSpan> linkedSpans, String docText, Span[]
sentenceSpans, EntityLinkerProperties properties, AdminBoundaryContext
additionalContext) {
if (!additionalContext.getProvHits().isEmpty()) {
+ regexMap = additionalContext.getProvinceRegexMap();
score(linkedSpans, additionalContext.getProvMentions(),
additionalContext.getNameCodesMap(), docText, sentenceSpans, 1000);
} else {
for (LinkedSpan<BaseLink> span : linkedSpans) {
@@ -167,11 +170,11 @@ public class ProvinceProximityScorer imp
score = scoreMap.get(spanCountryCode);
///does the name extracted match a province name?
- if (nameCodesMap.containsKey(entry.getItemName().toLowerCase())) {
+ if (nameCodesMap.containsKey(link.getItemName().toLowerCase()) ||
regexMatch(link.getItemName(), link.getItemParentID())) {
//if so, is it the correct country code for that name?
if
(nameCodesMap.get(entry.getItemName().toLowerCase()).contains(entry.getProvinceCode()))
{
//boost the score becuase it is likely that this is the location
in the text, so add 50% to the score or set to 1
- //TODO: make this smarter, and utilize province/state info in the
future to be even more specific
+ //TODO: make this smarter
score = (score + .75) > 1.0 ? 1d : (score + .75);
if (entry.getProvinceCode().equals(dominantCode)) {
@@ -185,6 +188,16 @@ public class ProvinceProximityScorer imp
return span;
}
+ private boolean regexMatch(String placeName, String countryCode) {
+ if (regexMap.containsKey(countryCode)) {
+ String regexForCountry = regexMap.get(countryCode);
+
+ Pattern p = Pattern.compile(regexForCountry, Pattern.DOTALL |
Pattern.CASE_INSENSITIVE);
+ return p.matcher(placeName.trim()).matches();
+ }
+ return false;
+ }
+
/**
* takes a map of distances from the toponym to each province mention and
* generates a map of scores for each province code. The map is then