Added: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/USGSProcessor.java URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/USGSProcessor.java?rev=1609600&view=auto ============================================================================== --- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/USGSProcessor.java (added) +++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/USGSProcessor.java Fri Jul 11 01:04:58 2014 @@ -0,0 +1,188 @@ +/* + * Copyright 2014 The Apache Software Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package opennlp.addons.geoentitylinker.indexing; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.io.FileWriter; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.logging.Level; +import java.util.logging.Logger; +import opennlp.addons.geoentitylinker.AdminBoundary; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.TextField; + +import org.apache.lucene.index.IndexWriter; + +/** + * + * @author mgiaconia + */ +public class USGSProcessor { + + public static void main(String[] args) { + try { + Map<String, AdminBoundary> provData = getProvData(new File("C:\\temp\\gazetteers\\usgsdata\\GOVT_UNITS_20140601.txt"), GazetteerIndexer.GazType.USGS); + process(new File("C:\\temp\\gazetteers\\usgsdata\\GOVT_UNITS_20140601.txt"), new File("C:\\temp\\gazetteers\\usgsdata\\NationalFile_20140601.txt"), null, null); + } catch (Exception ex) { + Logger.getLogger(USGSProcessor.class.getName()).log(Level.SEVERE, null, ex); + } + } + + public static void process(File lookupData, File usgsGazDataFile, File outputCountryContextfile, IndexWriter w) throws Exception { + Map<String, AdminBoundary> provData = getProvData(lookupData, GazetteerIndexer.GazType.USGS); + readFile(usgsGazDataFile, w, GazetteerIndexer.GazType.USGS, provData); + writeCountryContextFile(outputCountryContextfile, provData); + } + + public static void readFile(File gazateerInputData, IndexWriter w, GazetteerIndexer.GazType type, Map<String, AdminBoundary> lookupMap) throws Exception { + + BufferedReader reader = new BufferedReader(new FileReader(gazateerInputData)); + List<String> fields = new ArrayList<>(); + int counter = 0; + System.out.println("reading gazetteer data from USGS file..........."); + String line = ""; + while ((line = reader.readLine()) != null) { + + String[] values = line.split(type.getSeparator()); + if (counter == 0) { + for (String columnName : values) { + fields.add(columnName.replace("»¿", "").trim()); + } + + } else { + Document doc = new Document(); + for (int i = 0; i < fields.size() - 1; i++) { + doc.add(new TextField(fields.get(i), values[i].trim(), Field.Store.YES)); + } + String placeName = values[1]; + String lat = values[9]; + String lon = values[10]; + String dsg = values[2]; + String id = values[0]; + + String ccode = values[6]; + String admincode = values[3]; + AdminBoundary get = lookupMap.get(admincode + "." + ccode); + String countyname = ""; + String countyCode = get.getCountyCode(); + if (!get.getCountyName().equals("NO_DATA_FOUND_VALUE")) { + countyname = get.getCountyName(); + } + if (!get.getCountyCode().equals("NO_DATA_FOUND_VALUE")) { + countyCode = get.getCountyCode(); + } + String hierarchy = get.getCountryName() + ", " + get.getProvinceName() +", "+ countyname + ", " + placeName; + + doc.add(new TextField("hierarchy", hierarchy, Field.Store.YES)); + doc.add(new TextField("placename", placeName, Field.Store.YES)); + doc.add(new TextField("latitude", lat, Field.Store.YES)); + doc.add(new TextField("longitude", lon, Field.Store.YES)); + doc.add(new TextField("loctype", dsg, Field.Store.YES)); + doc.add(new TextField("admincode", (get.getCountryCode() + "." + get.getProvCode()).toLowerCase(), Field.Store.YES)); + doc.add(new TextField("countrycode", get.getCountryCode().toLowerCase(), Field.Store.YES)); + doc.add(new TextField("countycode", (get.getCountryCode() + "." + get.getProvCode() + "." + countyCode).toLowerCase(), Field.Store.YES)); + + doc.add(new TextField("locid", id, Field.Store.YES)); + doc.add(new TextField("gazsource", "usgs", Field.Store.YES)); + w.addDocument(doc); + } + counter++; + if (counter % 100000 == 0) { + w.commit(); + System.out.println(counter + " .........USGS entries committed to index.............."); + } + + } + w.commit(); + System.out.println("Completed indexing USGS gaz!"); + } + + private static Map<String, AdminBoundary> getProvData(File govUnitsFile, GazetteerIndexer.GazType type) { + System.out.println("Attempting to read USGS province (State) data from: " + govUnitsFile.getPath()); + Map<String, AdminBoundary> outmap = new HashMap<>(); + BufferedReader reader; + + try { + + reader = new BufferedReader(new FileReader(govUnitsFile)); + int i = 0; + String line = ""; + String[] fields = null; + while ((line = reader.readLine()) != null) { + + String[] values = line.split(type.getSeparator()); + if (i == 0) { + fields = values; + i++; + continue; + } + i++; + // System.out.println(i); + String countyCode = values[2]; + String countyName = values[3]; + String stateCode = values[5]; + String stateName = values[6]; + String countryCode = values[7]; + String countryName = values[8]; + AdminBoundary adminBoundary = new AdminBoundary(countryCode, countryName, stateCode, stateName, countyCode, countyName); + outmap.put(stateCode + "." + countyCode, adminBoundary); + // System.out.println(adminBoundary); + + } + reader.close(); + } catch (IOException ex) { + ex.printStackTrace(); + } + System.out.println("Successfully read USGS province (State) data from: " + govUnitsFile.getPath()); + + return outmap; + + } + + public static void writeCountryContextFile(File outfile, Map<String, AdminBoundary> adms) { + // FileWriter writer = null; + try (FileWriter writer = new FileWriter(outfile, true)) { + + for (String admkey : adms.keySet()) { + AdminBoundary adm = adms.get(admkey); + if (adm == null) { + continue; + } + String province = adm.getProvinceName(); + String country = adm.getCountryName(); + /** + * this is the standard format of the country context file... Geonames + * data will have an empty string for the county + */ + String line = adm.getCountryCode() + "\t" + adm.getProvCode() + "\t" + adm.getCountyCode() + "\t" + country + "\t" + province + "\t" + adm.getCountyName() + "\n"; + writer.write(line); + /// System.out.println(line); + + } + writer.close(); + } catch (IOException ex) { + Logger.getLogger(GeonamesProcessor.class.getName()).log(Level.SEVERE, null, ex); + } + System.out.println("successfully wrote USGS entries to country oontext file"); + } +}
Copied: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/CountryProximityScorer.java (from r1585862, opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryProximityScorer.java) URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/CountryProximityScorer.java?p2=opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/CountryProximityScorer.java&p1=opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryProximityScorer.java&r1=1585862&r2=1609600&rev=1609600&view=diff ============================================================================== --- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryProximityScorer.java (original) +++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/CountryProximityScorer.java Fri Jul 11 01:04:58 2014 @@ -13,7 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package opennlp.addons.geoentitylinker; +package opennlp.addons.geoentitylinker.scoring; import java.util.ArrayList; import java.util.HashMap; @@ -22,6 +22,7 @@ import java.util.List; import java.util.Map; import java.util.Set; import java.util.TreeSet; +import opennlp.addons.geoentitylinker.AdminBoundaryContext; import opennlp.tools.entitylinker.EntityLinkerProperties; import opennlp.tools.entitylinker.BaseLink; import opennlp.tools.entitylinker.LinkedSpan; @@ -29,20 +30,20 @@ import opennlp.tools.util.Span; /** * Scores toponyms based on their proximity to a country mention. Based on the - * heuristic that typonymn mentions are more likely close to their parent + * heuristic that toponymn mentions are more likely close to their parent * country mentions. For instance, if the toponym Berlin is mentioned near an * indicator of Germany, it is more likely to be Berlin Germany than Berlin - * Connecticut. + * Connecticut (if Connecticut is mentioned further down in the article). * * */ -public class CountryProximityScorer implements LinkedEntityScorer<CountryContext> { +public class CountryProximityScorer implements LinkedEntityScorer<AdminBoundaryContext> { private Map<String, Set<String>> nameCodesMap; String dominantCode = ""; @Override - public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, CountryContext additionalContext) { + public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, AdminBoundaryContext additionalContext) { score(linkedSpans, additionalContext.getCountryMentions(), additionalContext.getNameCodesMap(), docText, sentenceSpans, 1000); Copied: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/FuzzyStringMatchScorer.java (from r1585862, opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/FuzzyStringMatchScorer.java) URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/FuzzyStringMatchScorer.java?p2=opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/FuzzyStringMatchScorer.java&p1=opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/FuzzyStringMatchScorer.java&r1=1585862&r2=1609600&rev=1609600&view=diff ============================================================================== --- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/FuzzyStringMatchScorer.java (original) +++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/FuzzyStringMatchScorer.java Fri Jul 11 01:04:58 2014 @@ -13,43 +13,50 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package opennlp.addons.geoentitylinker; +package opennlp.addons.geoentitylinker.scoring; +import java.util.ArrayList; import java.util.HashSet; import java.util.List; import java.util.Set; +import opennlp.addons.geoentitylinker.AdminBoundaryContext; +import opennlp.addons.geoentitylinker.GazetteerEntry; import opennlp.tools.entitylinker.EntityLinkerProperties; import opennlp.tools.entitylinker.BaseLink; import opennlp.tools.entitylinker.LinkedSpan; -import opennlp.tools.ngram.NGramGenerator; import opennlp.tools.util.Span; /** * * Generates scores based on string comparisons levenstein and dice */ -public class FuzzyStringMatchScorer implements LinkedEntityScorer<CountryContext> { +public class FuzzyStringMatchScorer implements LinkedEntityScorer<AdminBoundaryContext> { @Override - public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, CountryContext additionalContext) { + public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, AdminBoundaryContext additionalContext) { for (LinkedSpan<BaseLink> linkedSpan : linkedSpans) { for (BaseLink link : linkedSpan.getLinkedEntries()) { - Double dice = getDiceCoefficient(linkedSpan.getSearchTerm().toLowerCase().replace(" ", ""), link.getItemName().toLowerCase().replace(" ", ""), 2); - link.getScoreMap().put("dice", dice); - Double ld = (double) getLevenshteinDistance(linkedSpan.getSearchTerm().toLowerCase().replace(" ", ""), link.getItemName().toLowerCase().replace(" ", "")); - link.getScoreMap().put("levenshtein", ld); + if (link instanceof GazetteerEntry) { + GazetteerEntry entry = (GazetteerEntry) link; + String hierarchy = entry.getHierarchy(); + if (hierarchy != null) { + Double dice = getDiceCoefficient(linkedSpan.getSearchTerm().toLowerCase(), hierarchy.toLowerCase(), 2); + link.getScoreMap().put("hierarchydicecoef", dice); + Double ld = (double) getLevenshteinDistance(linkedSpan.getSearchTerm().toLowerCase(), hierarchy.toLowerCase().toLowerCase()); + link.getScoreMap().put("hierarchylevenshtein", ld); + } + } } } - } /** * Generates a score based on an overlap of nGrams between two strings using * the DiceCoefficient technique. * - * @param s1 first string - * @param s2 second string + * @param s1 first string + * @param s2 second string * @param nGrams number of chars in each gram * @return */ @@ -57,8 +64,22 @@ public class FuzzyStringMatchScorer impl if (s1.equals("") || s1.equals("")) { return 0d; } - List<String> s1Grams = NGramGenerator.generate(s1.toCharArray(), nGrams, ""); - List<String> s2Grams = NGramGenerator.generate(s2.toCharArray(), nGrams, ""); + List<String> s1Grams = new ArrayList<>(); + List<String> s2Grams = new ArrayList<>(); + String[] split1 = s1.split("[ ,]"); + for (String token : split1) { + if (token.trim().equals("")) { + continue; + } + s1Grams.add(token); + } + String[] split2 = s2.split("[ ,]"); + for (String token : split2) { + if (token.trim().equals("")) { + continue; + } + s2Grams.add(token); + } Set<String> overlap = new HashSet<String>(s1Grams); overlap.retainAll(s2Grams); Copied: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/GeoHashBinningScorer.java (from r1585862, opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoHashBinningScorer.java) URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/GeoHashBinningScorer.java?p2=opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/GeoHashBinningScorer.java&p1=opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoHashBinningScorer.java&r1=1585862&r2=1609600&rev=1609600&view=diff ============================================================================== --- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoHashBinningScorer.java (original) +++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/GeoHashBinningScorer.java Fri Jul 11 01:04:58 2014 @@ -13,11 +13,13 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package opennlp.addons.geoentitylinker; +package opennlp.addons.geoentitylinker.scoring; import java.util.ArrayList; import java.util.List; import java.util.Map; +import opennlp.addons.geoentitylinker.AdminBoundaryContext; +import opennlp.addons.geoentitylinker.GazetteerEntry; import opennlp.tools.entitylinker.EntityLinkerProperties; import opennlp.tools.entitylinker.BaseLink; import opennlp.tools.entitylinker.LinkedSpan; @@ -29,13 +31,13 @@ import opennlp.tools.util.Span; * outliers by finding those points that are not near the majority * */ -public class GeoHashBinningScorer implements LinkedEntityScorer<CountryContext> { +public class GeoHashBinningScorer implements LinkedEntityScorer<AdminBoundaryContext> { private final PointClustering CLUSTERER = new PointClustering(); private int PRECISION = 3; @Override - public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, CountryContext additionalContext) { + public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, AdminBoundaryContext additionalContext) { //Map<Double, Double> latLongs = new HashMap<Double, Double>(); List<GazetteerEntry> allGazEntries = new ArrayList<>(); Copied: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/LinkedEntityScorer.java (from r1585862, opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/LinkedEntityScorer.java) URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/LinkedEntityScorer.java?p2=opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/LinkedEntityScorer.java&p1=opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/LinkedEntityScorer.java&r1=1585862&r2=1609600&rev=1609600&view=diff ============================================================================== --- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/LinkedEntityScorer.java (original) +++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/LinkedEntityScorer.java Fri Jul 11 01:04:58 2014 @@ -13,7 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package opennlp.addons.geoentitylinker; +package opennlp.addons.geoentitylinker.scoring; import java.util.List; import opennlp.tools.entitylinker.EntityLinkerProperties; @@ -23,6 +23,7 @@ import opennlp.tools.util.Span; /** * Structure for scoring linked entities. The Map logically represents a pair : * "Score type" to the "actual Score." + * @param <T> a generic for providing additional context */ public interface LinkedEntityScorer<T> { @@ -32,6 +33,7 @@ public interface LinkedEntityScorer<T> { * @param linkedSpans the spans that have been linked to some external source and have all the data they need to be scored * @param docText the full text of the document. * @param sentenceSpans the sentence spans the correspond to the document text + * @param properties the entitylinker properties config file * @param additionalContext any additional data required to perform the scoring operation * @return void */ Copied: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ModelBasedScorer.java (from r1594067, opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/ModelBasedScorer.java) URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ModelBasedScorer.java?p2=opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ModelBasedScorer.java&p1=opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/ModelBasedScorer.java&r1=1594067&r2=1609600&rev=1609600&view=diff ============================================================================== --- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/ModelBasedScorer.java (original) +++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ModelBasedScorer.java Fri Jul 11 01:04:58 2014 @@ -13,7 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package opennlp.addons.geoentitylinker; +package opennlp.addons.geoentitylinker.scoring; import java.io.File; import java.io.FileNotFoundException; @@ -21,6 +21,7 @@ import java.io.IOException; import java.util.HashMap; import java.util.List; import java.util.Map; +import opennlp.addons.geoentitylinker.AdminBoundaryContext; import opennlp.tools.doccat.DoccatModel; import opennlp.tools.doccat.DocumentCategorizerME; import opennlp.tools.entitylinker.EntityLinkerProperties; @@ -33,7 +34,7 @@ import org.apache.log4j.Logger; * * Utilizes a doccat model to score toponyms based on surrounding context */ -public class ModelBasedScorer implements LinkedEntityScorer<CountryContext> { +public class ModelBasedScorer implements LinkedEntityScorer<AdminBoundaryContext> { private static final Logger LOGGER = Logger.getLogger(ModelBasedScorer.class); DocumentCategorizerME documentCategorizerME; @@ -42,7 +43,7 @@ public class ModelBasedScorer implements boolean modelexists = false; @Override - public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, CountryContext additionalContext) { + public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, AdminBoundaryContext additionalContext) { try { if (doccatModel == null) { String path = properties.getProperty("opennlp.geoentitylinker.modelbasedscorer.modelpath", ""); Copied: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/PointClustering.java (from r1594067, opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/PointClustering.java) URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/PointClustering.java?p2=opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/PointClustering.java&p1=opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/PointClustering.java&r1=1594067&r2=1609600&rev=1609600&view=diff ============================================================================== --- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/PointClustering.java (original) +++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/PointClustering.java Fri Jul 11 01:04:58 2014 @@ -13,7 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package opennlp.addons.geoentitylinker; +package opennlp.addons.geoentitylinker.scoring; import com.spatial4j.core.context.SpatialContext; import com.spatial4j.core.io.GeohashUtils; @@ -22,6 +22,7 @@ import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; +import opennlp.addons.geoentitylinker.GazetteerEntry; /** * @@ -114,36 +115,6 @@ public class PointClustering { return point; } - /** - * Hashes a lat long based on adding 90 or 180 and then interlarding lat lon - * chars. reduces a set of points to a sortable set - * - * @param lat - * @param lon - * @return - */ - public String simpleGeohash(Double lat, Double lon) { - String geoHash = ""; - lat = lat + 90; - lon = lon + 180; - String latString = String.valueOf(lat).replace(".", ""); - String lonString = String.valueOf(lon).replace(".", ""); - int length = latString.length() > lonString.length() ? lonString.length() : latString.length(); - while (length < 12) { - latString += "0"; - lonString += "0"; - length++; - } - latString = latString.substring(0, 10); - lonString = lonString.substring(0, 10); - char[] latChars = latString.toCharArray(); - char[] lonChars = lonString.toCharArray(); - - for (int i = 0; i < latChars.length; i++) { - geoHash += String.valueOf(latChars[i]) + String.valueOf(lonChars[i]); - } - return geoHash; - } private Double normalize(Double valueToNormalize, Double minimum, Double maximum) { Double d = (double) ((1 - 0) * (valueToNormalize - minimum)) / (maximum - minimum) + 0; Copied: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ProvinceProximityScorer.java (from r1585862, opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryProximityScorer.java) URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ProvinceProximityScorer.java?p2=opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ProvinceProximityScorer.java&p1=opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryProximityScorer.java&r1=1585862&r2=1609600&rev=1609600&view=diff ============================================================================== --- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryProximityScorer.java (original) +++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ProvinceProximityScorer.java Fri Jul 11 01:04:58 2014 @@ -13,7 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package opennlp.addons.geoentitylinker; +package opennlp.addons.geoentitylinker.scoring; import java.util.ArrayList; import java.util.HashMap; @@ -22,29 +22,39 @@ import java.util.List; import java.util.Map; import java.util.Set; import java.util.TreeSet; -import opennlp.tools.entitylinker.EntityLinkerProperties; +import opennlp.addons.geoentitylinker.AdminBoundaryContext; +import opennlp.addons.geoentitylinker.GazetteerEntry; import opennlp.tools.entitylinker.BaseLink; +import opennlp.tools.entitylinker.EntityLinkerProperties; import opennlp.tools.entitylinker.LinkedSpan; import opennlp.tools.util.Span; /** - * Scores toponyms based on their proximity to a country mention. Based on the - * heuristic that typonymn mentions are more likely close to their parent - * country mentions. For instance, if the toponym Berlin is mentioned near an - * indicator of Germany, it is more likely to be Berlin Germany than Berlin - * Connecticut. + * Scores toponyms based on their proximity to a province mention. Based on the + * heuristic that toponymn mentions are more likely close to their parent + * province mentions. For instance, if the toponym Berlin is mentioned near an + * indicator of Connecticut, it is more likely to be Berlin Connecticut than + * Berlin Germany (if Germany did not exist in, or is mentioned further down in, + * the article). * * */ -public class CountryProximityScorer implements LinkedEntityScorer<CountryContext> { +public class ProvinceProximityScorer implements LinkedEntityScorer<AdminBoundaryContext> { private Map<String, Set<String>> nameCodesMap; String dominantCode = ""; @Override - public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, CountryContext additionalContext) { - - score(linkedSpans, additionalContext.getCountryMentions(), additionalContext.getNameCodesMap(), docText, sentenceSpans, 1000); + public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, AdminBoundaryContext additionalContext) { + if (!additionalContext.getProvHits().isEmpty()) { + score(linkedSpans, additionalContext.getProvMentions(), additionalContext.getNameCodesMap(), docText, sentenceSpans, 1000); + } else { + for (LinkedSpan<BaseLink> span : linkedSpans) { + for (BaseLink link : span.getLinkedEntries()) { + link.getScoreMap().put("provincecontext", Double.NaN); + } + } + } } @@ -53,20 +63,19 @@ public class CountryProximityScorer impl * matches. Currently the scoring indicates the probability that the toponym * is correct based on the country context in the document * - * @param linkedData the linked spans, holds the Namefinder results, and - * the list of BaseLink for each - * @param countryHits all the country mentions in the document - * @param nameCodesMap maps a country indicator name to a country code. Used - * to determine if the namefinder found the same exact - * toponym the country context did. If so the score is - * boosted due to the high probability that the - * NameFinder actually "rediscovered" a country - * @param docText the full text of the document...not used in this - * default implementation - * @param sentences the sentences that correspond to the doc text. + * @param linkedData the linked spans, holds the Namefinder results, and the + * list of BaseLink for each + * @param countryHits all the country mentions in the document + * @param nameCodesMap maps a province indicator name to a province code. Used + * to determine if the namefinder found the same exact toponym the country + * context did. If so the score is boosted due to the high probability that + * the NameFinder actually "rediscovered" a country + * @param docText the full text of the document...not used in this default + * implementation + * @param sentences the sentences that correspond to the doc text. * @param maxAllowedDist a constant that is used to determine which country - * mentions, based on proximity within the text, should - * be used to score the Named Entity. + * mentions, based on proximity within the text, should be used to score the + * Named Entity. * @return */ public List<LinkedSpan> score(List<LinkedSpan> linkedData, Map<String, Set<Integer>> countryHits, Map<String, Set<String>> nameCodesMap, String docText, Span[] sentences, Integer maxAllowedDist) { @@ -149,34 +158,35 @@ public class CountryProximityScorer impl Map<String, Double> scoreMap = analyzeMap(distancesFromCodeMap, sentences, span); for (BaseLink link : span.getLinkedEntries()) { //getItemParentId is the country code - String spanCountryCode = link.getItemParentID(); + GazetteerEntry entry = (GazetteerEntry)link; + String spanCountryCode = entry.getProvinceCode(); if (scoreMap.containsKey(spanCountryCode)) { score = scoreMap.get(spanCountryCode); ///does the name extracted match a country name? - if (nameCodesMap.containsKey(link.getItemName().toLowerCase())) { + if (nameCodesMap.containsKey(entry.getItemName().toLowerCase())) { //if so, is it the correct country code for that name? - if (nameCodesMap.get(link.getItemName().toLowerCase()).contains(link.getItemParentID())) { + if (nameCodesMap.get(entry.getItemName().toLowerCase()).contains(entry.getProvinceCode())) { //boost the score becuase it is likely that this is the location in the text, so add 50% to the score or set to 1 //TODO: make this smarter, and utilize province/state info in the future to be even more specific score = (score + .75) > 1.0 ? 1d : (score + .75); - if (link.getItemParentID().equals(dominantCode)) { + if (entry.getProvinceCode().equals(dominantCode)) { score = (score + .25) > 1.0 ? 1d : (score + .25); } } } } - link.getScoreMap().put("countrycontext", score); + link.getScoreMap().put("provincecontext", score); } return span; } /** - * takes a map of distances from the toponym to each country mention and generates - * a map of scores for each country code. The map is then correlated to the - * code of the BaseLink parentid for retrieval. Then the - * score is added to the overall list. + * takes a map of distances from the toponym to each country mention and + * generates a map of scores for each country code. The map is then correlated + * to the code of the BaseLink parentid for retrieval. Then the score is added + * to the overall list. * * @param distanceMap * @param sentences @@ -211,7 +221,6 @@ public class CountryProximityScorer impl normalizedDistances.add(reverse); } - List<Double> doubles = new ArrayList<Double>(normalizedDistances); scoreMap.put(key, slidingDistanceAverage(doubles)); } @@ -257,8 +266,8 @@ public class CountryProximityScorer impl * range. Used to normalize distances in this class. * * @param valueToNormalize the value to place within the new range - * @param minimum the min of the set to be transposed - * @param maximum the max of the set to be transposed + * @param minimum the min of the set to be transposed + * @param maximum the max of the set to be transposed * @return */ private Double normalize(int valueToNormalize, int minimum, int maximum) {
