Repository: opennlp-addons Updated Branches: refs/heads/master 6c142dbf6 -> 9adc2525c
http://git-wip-us.apache.org/repos/asf/opennlp-addons/blob/9adc2525/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/PlacetypeScorer.java ---------------------------------------------------------------------- diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/PlacetypeScorer.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/PlacetypeScorer.java index dede558..971ffa0 100644 --- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/PlacetypeScorer.java +++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/PlacetypeScorer.java @@ -1,82 +1,84 @@ -/* - * Copyright 2014 The Apache Software Foundation. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package opennlp.addons.geoentitylinker.scoring; - -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import opennlp.addons.geoentitylinker.AdminBoundaryContext; -import opennlp.addons.geoentitylinker.GazetteerEntry; -import opennlp.tools.entitylinker.EntityLinkerProperties; -import opennlp.tools.entitylinker.LinkedSpan; -import opennlp.tools.util.Span; - -/** - * - * @author mgiaconia - */ -public class PlacetypeScorer implements LinkedEntityScorer<AdminBoundaryContext> { - - private static final String[] boosts = "ADM1 ADM1H ADM2 ADM2H ADM3 ADM3H ADM4 ADM4H ADM5 ADMD ADMDH PCLD PCLH PCLI PCLIX TERR PCLIX PPL PPLA PPLA2 PPLA3 PPLA4 PPLC PPLCH PPLF PPLG PPLH PPLL PPLQ PPLR PPLS PPLX STLMT civil Populated_Place".split(" "); - private Map<String, Double> boosetedTypes = new HashMap<>(); - - public PlacetypeScorer() { - fillMap(); - } - - @Override - public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, AdminBoundaryContext additionalContext) { - for (LinkedSpan<GazetteerEntry> geospan : linkedSpans) { - ArrayList<GazetteerEntry> linkedEntries = geospan.getLinkedEntries(); - for (GazetteerEntry gazetteerEntry : linkedEntries) { - String type = gazetteerEntry.getItemType().toLowerCase(); - Double score = getScore(type); - if (score == null) { - score = 0d; - } - gazetteerEntry.getScoreMap().put("typescore", score); - } - } - } - - private Double getScore(String type) { - Double ret = boosetedTypes.get(type.toLowerCase()); - return ret == null ? 0d : ret; - } - - private void fillMap() { - if (boosetedTypes.isEmpty()) { - for (String type : boosts) { - if (type.equals("PCLI")) { - boosetedTypes.put(type.toLowerCase(), 1d); - } else if ((type.startsWith("PC")|| type.startsWith("PP")) && !type.equals("PCLI")) { - boosetedTypes.put(type.toLowerCase(), .5d); - } else if (type.startsWith("ADM")) { - boosetedTypes.put(type.toLowerCase(), .75d); - }else if (type.toLowerCase().equals("civil")){ - boosetedTypes.put(type.toLowerCase(), .9d); - } - if(type.toLowerCase().equals("populated_place")){ - boosetedTypes.put("Populated Place", .75d); - } - - } - boosetedTypes.put("pplc", .9); - } - } - -} +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.addons.geoentitylinker.scoring; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import opennlp.addons.geoentitylinker.AdminBoundaryContext; +import opennlp.addons.geoentitylinker.GazetteerEntry; +import opennlp.tools.entitylinker.EntityLinkerProperties; +import opennlp.tools.entitylinker.LinkedSpan; +import opennlp.tools.util.Span; + +/** + * + * @author mgiaconia + */ +public class PlacetypeScorer implements LinkedEntityScorer<AdminBoundaryContext> { + + private static final String[] boosts = "ADM1 ADM1H ADM2 ADM2H ADM3 ADM3H ADM4 ADM4H ADM5 ADMD ADMDH PCLD PCLH PCLI PCLIX TERR PCLIX PPL PPLA PPLA2 PPLA3 PPLA4 PPLC PPLCH PPLF PPLG PPLH PPLL PPLQ PPLR PPLS PPLX STLMT civil Populated_Place".split(" "); + private Map<String, Double> boosetedTypes = new HashMap<>(); + + public PlacetypeScorer() { + fillMap(); + } + + @Override + public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, AdminBoundaryContext additionalContext) { + for (LinkedSpan<GazetteerEntry> geospan : linkedSpans) { + ArrayList<GazetteerEntry> linkedEntries = geospan.getLinkedEntries(); + for (GazetteerEntry gazetteerEntry : linkedEntries) { + String type = gazetteerEntry.getItemType().toLowerCase(); + Double score = getScore(type); + if (score == null) { + score = 0d; + } + gazetteerEntry.getScoreMap().put("typescore", score); + } + } + } + + private Double getScore(String type) { + Double ret = boosetedTypes.get(type.toLowerCase()); + return ret == null ? 0d : ret; + } + + private void fillMap() { + if (boosetedTypes.isEmpty()) { + for (String type : boosts) { + if (type.equals("PCLI")) { + boosetedTypes.put(type.toLowerCase(), 1d); + } else if ((type.startsWith("PC")|| type.startsWith("PP")) && !type.equals("PCLI")) { + boosetedTypes.put(type.toLowerCase(), .5d); + } else if (type.startsWith("ADM")) { + boosetedTypes.put(type.toLowerCase(), .75d); + }else if (type.toLowerCase().equals("civil")){ + boosetedTypes.put(type.toLowerCase(), .9d); + } + if(type.toLowerCase().equals("populated_place")){ + boosetedTypes.put("Populated Place", .75d); + } + + } + boosetedTypes.put("pplc", .9); + } + } + +} http://git-wip-us.apache.org/repos/asf/opennlp-addons/blob/9adc2525/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/PointClustering.java ---------------------------------------------------------------------- diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/PointClustering.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/PointClustering.java index 908df1e..2cd6299 100644 --- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/PointClustering.java +++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/PointClustering.java @@ -1,123 +1,127 @@ -/* - * Copyright 2014 The Apache Software Foundation. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package opennlp.addons.geoentitylinker.scoring; - -import com.spatial4j.core.context.SpatialContext; -import com.spatial4j.core.io.GeohashUtils; -import com.spatial4j.core.shape.Point; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import opennlp.addons.geoentitylinker.GazetteerEntry; - -/** - * - * Clusters a list of lat long points using a simple geohashing/boxing approach - */ -public class PointClustering { - - /** - * Clusters a set of points from the gazateers. The idea is that locations - * that matched a name that are closer to each other, the more likely the - * toponym is to be accurate - * - * @param entries - * @param precision - * @return - */ - public Map<String, List<GazetteerEntry>> cluster(List<GazetteerEntry> entries, int precision) { - Map<String, List<GazetteerEntry>> map = new HashMap<>(); - for (int i = 0; i < entries.size(); i++) { - GazetteerEntry entry = entries.get(i); - Double latw = entry.getLatitude(); - Double lonw = entry.getLongitude(); - - String key = geoHash(latw, lonw).substring(0, precision); - if (map.containsKey(key)) { - map.get(key).add(entry); - } else { - List<GazetteerEntry> newlist = new ArrayList<>(); - newlist.add(entry); - map.put(key, newlist); - } - } - return map; - } - - public void scoreClusters(Map<String, List<GazetteerEntry>> clusters) { - Double min = 0d; - Double max = -1d; - for (String key : clusters.keySet()) { - int size = clusters.get(key).size(); - if (size > max) { - max = Double.valueOf(size); - } - } - for (String key : clusters.keySet()) { - int size = clusters.get(key).size(); - Double score = normalize(Double.valueOf(size), min, max); - for (GazetteerEntry entry : clusters.get(key)) { - entry.getScoreMap().put("geohashbin", score); - } - } - - } - - /** - * Returns a geohash based on Lucene Spatial - * - * @param lat the input latitude Y - * @param lon the input longitude X - * @return - */ - public String geoHash(Double lat, Double lon) { - String encodeLatLon = GeohashUtils.encodeLatLon(lat, lon); - return encodeLatLon; - } - - /** - * Returns the X and Y point for the geohash. Element 0 is the X (longitude) - * element 1 is the Y (latitude) - * - * @param geohash - * @return - */ - public double[] geoHashToPoint(String geohash) { - Point decode = GeohashUtils.decode(geohash, SpatialContext.GEO); - double[] coords = new double[]{decode.getX(), decode.getY()}; - return coords; - } - - /** - * Returns the X and Y point for the geohash. Element 0 is the X (longitude) - * element 1 is the Y (latitude) - * - * @param geohash - * @return - */ - public String geoHashToPointStr(String geohash) { - Point decode = GeohashUtils.decode(geohash, SpatialContext.GEO); - String point = decode.getX() + "," + decode.getY(); - return point; - } - - - private Double normalize(Double valueToNormalize, Double minimum, Double maximum) { - Double d = (double) ((1 - 0) * (valueToNormalize - minimum)) / (maximum - minimum) + 0; - return d; - } -} +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.addons.geoentitylinker.scoring; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import com.spatial4j.core.context.SpatialContext; +import com.spatial4j.core.io.GeohashUtils; +import com.spatial4j.core.shape.Point; + +import opennlp.addons.geoentitylinker.GazetteerEntry; + +/** + * + * Clusters a list of lat long points using a simple geohashing/boxing approach + */ +public class PointClustering { + + /** + * Clusters a set of points from the gazateers. The idea is that locations + * that matched a name that are closer to each other, the more likely the + * toponym is to be accurate + * + * @param entries + * @param precision + * @return + */ + public Map<String, List<GazetteerEntry>> cluster(List<GazetteerEntry> entries, int precision) { + Map<String, List<GazetteerEntry>> map = new HashMap<>(); + for (int i = 0; i < entries.size(); i++) { + GazetteerEntry entry = entries.get(i); + Double latw = entry.getLatitude(); + Double lonw = entry.getLongitude(); + + String key = geoHash(latw, lonw).substring(0, precision); + if (map.containsKey(key)) { + map.get(key).add(entry); + } else { + List<GazetteerEntry> newlist = new ArrayList<>(); + newlist.add(entry); + map.put(key, newlist); + } + } + return map; + } + + public void scoreClusters(Map<String, List<GazetteerEntry>> clusters) { + Double min = 0d; + Double max = -1d; + for (String key : clusters.keySet()) { + int size = clusters.get(key).size(); + if (size > max) { + max = Double.valueOf(size); + } + } + for (String key : clusters.keySet()) { + int size = clusters.get(key).size(); + Double score = normalize(Double.valueOf(size), min, max); + for (GazetteerEntry entry : clusters.get(key)) { + entry.getScoreMap().put("geohashbin", score); + } + } + + } + + /** + * Returns a geohash based on Lucene Spatial + * + * @param lat the input latitude Y + * @param lon the input longitude X + * @return + */ + public String geoHash(Double lat, Double lon) { + String encodeLatLon = GeohashUtils.encodeLatLon(lat, lon); + return encodeLatLon; + } + + /** + * Returns the X and Y point for the geohash. Element 0 is the X (longitude) + * element 1 is the Y (latitude) + * + * @param geohash + * @return + */ + public double[] geoHashToPoint(String geohash) { + Point decode = GeohashUtils.decode(geohash, SpatialContext.GEO); + double[] coords = new double[]{decode.getX(), decode.getY()}; + return coords; + } + + /** + * Returns the X and Y point for the geohash. Element 0 is the X (longitude) + * element 1 is the Y (latitude) + * + * @param geohash + * @return + */ + public String geoHashToPointStr(String geohash) { + Point decode = GeohashUtils.decode(geohash, SpatialContext.GEO); + String point = decode.getX() + "," + decode.getY(); + return point; + } + + + private Double normalize(Double valueToNormalize, Double minimum, Double maximum) { + Double d = (double) ((1 - 0) * (valueToNormalize - minimum)) / (maximum - minimum) + 0; + return d; + } +} http://git-wip-us.apache.org/repos/asf/opennlp-addons/blob/9adc2525/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ProvinceProximityScorer.java ---------------------------------------------------------------------- diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ProvinceProximityScorer.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ProvinceProximityScorer.java index d9e7d19..6e10343 100644 --- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ProvinceProximityScorer.java +++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ProvinceProximityScorer.java @@ -1,294 +1,296 @@ -/* - * Copyright 2013 The Apache Software Foundation. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package opennlp.addons.geoentitylinker.scoring; - -import java.util.ArrayList; -import java.util.HashMap; -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.TreeSet; -import java.util.regex.Pattern; -import opennlp.addons.geoentitylinker.AdminBoundaryContext; -import opennlp.addons.geoentitylinker.GazetteerEntry; -import opennlp.tools.entitylinker.BaseLink; -import opennlp.tools.entitylinker.EntityLinkerProperties; -import opennlp.tools.entitylinker.LinkedSpan; -import opennlp.tools.util.Span; - -/** - * Scores toponyms based on their proximity to a province mention. Based on the - * heuristic that toponymn mentions are more likely close to their parent - * province mentions. For instance, if the toponym Berlin is mentioned near an - * indicator of Connecticut, it is more likely to be Berlin Connecticut than - * Berlin Germany (if Germany did not exist in, or is mentioned further away in - * the article). - * - * - */ -public class ProvinceProximityScorer implements LinkedEntityScorer<AdminBoundaryContext> { - - private Map<String, Set<String>> nameCodesMap; - String dominantCode = ""; - private Map<String, String> regexMap = new HashMap<>(); - - @Override - public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, AdminBoundaryContext additionalContext) { - if (!additionalContext.getProvHits().isEmpty()) { - regexMap = additionalContext.getProvinceRegexMap(); - score(linkedSpans, additionalContext.getProvMentions(), additionalContext.getNameCodesMap(), docText, sentenceSpans, 1000); - } else { - for (LinkedSpan<BaseLink> span : linkedSpans) { - for (BaseLink link : span.getLinkedEntries()) { - link.getScoreMap().put("provincecontext", 0d); - } - } - } - - } - - /** - * Assigns a score to each BaseLink in each linkedSpan's set of N best - * matches. Currently the scoring indicates the probability that the toponym - * is correct based on the country context in the document - * - * @param linkedData the linked spans, holds the Namefinder results, and the - * list of BaseLink for each - * @param countryHits all the country mentions in the document - * @param nameCodesMap maps a province indicator name to a province code. Used - * to determine if the namefinder found the same exact toponym the country - * context did. If so the score is boosted due to the high probability that - * the NameFinder actually "rediscovered" a country - * @param docText the full text of the document...not used in this default - * implementation - * @param sentences the sentences that correspond to the doc text. - * @param maxAllowedDist a constant that is used to determine which country - * mentions, based on proximity within the text, should be used to score the - * Named Entity. - * @return - */ - public List<LinkedSpan> score(List<LinkedSpan> linkedData, Map<String, Set<Integer>> countryHits, Map<String, Set<String>> nameCodesMap, String docText, Span[] sentences, Integer maxAllowedDist) { - this.nameCodesMap = nameCodesMap; - setDominantCode(countryHits); - for (LinkedSpan<BaseLink> linkedspan : linkedData) { - - linkedspan = simpleProximityAnalysis(sentences, countryHits, linkedspan, maxAllowedDist); - } - return linkedData; - } - - /** - * sets class level variable to a code based on the number of mentions - * - * @param countryHits - */ - private void setDominantCode(Map<String, Set<Integer>> countryHits) { - int hits = -1; - for (String code : countryHits.keySet()) { - if (countryHits.get(code).size() > hits) { - hits = countryHits.get(code).size(); - dominantCode = code; - } - } - } - - /** - * Generates distances from each country mention to the span's location in the - * doc text. Ultimately an attempt to ensure that ambiguously named toponyms - * are resolved to the correct country and coordinate. - * - * @param sentences - * @param countryHits - * @param span - * @return - */ - private LinkedSpan<BaseLink> simpleProximityAnalysis(Span[] sentences, Map<String, Set<Integer>> countryHits, LinkedSpan<BaseLink> span, Integer maxAllowedDistance) { - Double score = 0.0; - /* - * get the index of the actual span, begining of sentence //should generate - * tokens from sentence and create a char offset... //could have large - * sentences due to poor sentence detection or wonky doc text - */ - int sentenceIdx = span.getSentenceid(); - int sentIndexInDoc = sentences[sentenceIdx].getStart(); - /** - * create a map of all the span's proximal country mentions in the document - * Map< countrycode, set of <distances from this NamedEntity>> - */ - Map<String, Set<Integer>> distancesFromCodeMap = new HashMap<String, Set<Integer>>(); - //map = Map<countrycode, Set <of distances this span is from all the mentions of the code>> - for (String cCode : countryHits.keySet()) { -//iterate over all the regex start values and calculate an offset - for (Integer cHit : countryHits.get(cCode)) { - Integer absDist = Math.abs(sentIndexInDoc - cHit); - //only include near mentions based on a heuristic - //TODO make this a property - // if (absDist < maxAllowedDistance) { - if (distancesFromCodeMap.containsKey(cCode)) { - distancesFromCodeMap.get(cCode).add(absDist); - } else { - HashSet<Integer> newset = new HashSet<Integer>(); - newset.add(absDist); - distancesFromCodeMap.put(cCode, newset); - } - } - - //} - } - //we now know how far this named entity is from every country mention in the document - - /** - * the gaz matches that have a country code that have mentions in the doc - * that are closest to the Named Entity should return the best score. - * Analyzemap generates a likelihood score that the toponym from the gaz is - * referring to one of the countries, i.e, Map<countrycode, prob that this - * span is referring to the toponym form this code key> - */ - Map<String, Double> scoreMap = analyzeMap(distancesFromCodeMap, sentences, span); - if (scoreMap.isEmpty()) { - return span; - } - for (BaseLink link : span.getLinkedEntries()) { - //getItemParentId is the country code - GazetteerEntry entry = (GazetteerEntry) link; - String spanCountryCode = entry.getProvinceCode(); - if (scoreMap.containsKey(spanCountryCode)) { - - score = scoreMap.get(spanCountryCode); - ///does the name extracted match a province name? - if (nameCodesMap.containsKey(link.getItemName().toLowerCase()) || regexMatch(link.getItemName(), link.getItemParentID())) { - //if so, is it the correct country code for that name? - if (nameCodesMap.get(entry.getItemName().toLowerCase()).contains(entry.getProvinceCode())) { - //boost the score becuase it is likely that this is the location in the text, so add 50% to the score or set to 1 - //TODO: make this smarter - score = (score + .75) > 1.0 ? 1d : (score + .75); - - if (entry.getProvinceCode().equals(dominantCode)) { - score = (score + .25) > 1.0 ? 1d : (score + .25); - } - } - } - } - link.getScoreMap().put("provincecontext", score); - } - return span; - } - - private boolean regexMatch(String placeName, String countryCode) { - if (regexMap.containsKey(countryCode)) { - String regexForCountry = regexMap.get(countryCode); - - Pattern p = Pattern.compile(regexForCountry, Pattern.DOTALL | Pattern.CASE_INSENSITIVE); - return p.matcher(placeName.trim()).matches(); - } - return false; - } - - /** - * takes a map of distances from the toponym to each province mention and - * generates a map of scores for each province code. The map is then - * correlated to the code of the BaseLink parentid for retrieval. Then the - * score is added to the overall list. - * - * @param distanceMap - * @param sentences - * @param span - * @return - */ - private Map<String, Double> analyzeMap(Map<String, Set<Integer>> distanceMap, Span[] sentences, LinkedSpan<BaseLink> span) { - - Map<String, Double> scoreMap = new HashMap<String, Double>(); - if (distanceMap.isEmpty()) { - return scoreMap; - } - TreeSet<Integer> all = new TreeSet<Integer>(); - for (String key : distanceMap.keySet()) { - all.addAll(distanceMap.get(key)); - } - //get min max for normalization, this could be more efficient - - Integer min = all.first(); - Integer max = all.last(); - if (min == max) { - min = 0; - } - for (String key : distanceMap.keySet()) { - - TreeSet<Double> normalizedDistances = new TreeSet<Double>(); - for (Integer i : distanceMap.get(key)) { - Double norm = normalize(i, min, max); - //reverse the normed distance so low numbers (closer) are better - //this could be improved with a "decaying " function using an imcreaseing negative exponent - Double reverse = Math.abs(norm - 1); - normalizedDistances.add(reverse); - } - - List<Double> doubles = new ArrayList<Double>(normalizedDistances); - scoreMap.put(key, slidingDistanceAverage(doubles)); - } - return scoreMap; - } - - /** - * this method is an attempt to make closer clusters of mentions group - * together to smooth out the average, so one distant outlier does not kill - * the score for an obviously good hit. More elegant solution is possible - * using Math.pow, and making the score decay with distance by using an - * increasing negative exponent - * - * @param normDis the normalized and sorted set of distances as a list - * @return - */ - private Double slidingDistanceAverage(List<Double> normDis) { - List<Double> windowOfAverages = new ArrayList<Double>(); - - if (normDis.size() < 3) { - windowOfAverages.addAll(normDis); - } else { - - for (int i = 0; i < normDis.size() - 1; i++) { - double a = normDis.get(i); - double b = normDis.get(i + 1); - windowOfAverages.add((a + b) / 2); - - } - } - double sum = 0d; - for (double d : windowOfAverages) { - sum += d; - } - double result = sum / windowOfAverages.size(); - //TODO: ++ prob when large amounts of mentions for a code - //System.out.println("avg of window:" + result); - return result; - } - - /** - * transposes a value within one range to a relative value in a different - * range. Used to normalize distances in this class. - * - * @param valueToNormalize the value to place within the new range - * @param minimum the min of the set to be transposed - * @param maximum the max of the set to be transposed - * @return - */ - private Double normalize(int valueToNormalize, int minimum, int maximum) { - Double d = (double) ((1 - 0) * (valueToNormalize - minimum)) / (maximum - minimum) + 0; - d = d == null ? 0d : d; - return d; - } -} +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.addons.geoentitylinker.scoring; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.TreeSet; +import java.util.regex.Pattern; +import opennlp.addons.geoentitylinker.AdminBoundaryContext; +import opennlp.addons.geoentitylinker.GazetteerEntry; +import opennlp.tools.entitylinker.BaseLink; +import opennlp.tools.entitylinker.EntityLinkerProperties; +import opennlp.tools.entitylinker.LinkedSpan; +import opennlp.tools.util.Span; + +/** + * Scores toponyms based on their proximity to a province mention. Based on the + * heuristic that toponymn mentions are more likely close to their parent + * province mentions. For instance, if the toponym Berlin is mentioned near an + * indicator of Connecticut, it is more likely to be Berlin Connecticut than + * Berlin Germany (if Germany did not exist in, or is mentioned further away in + * the article). + * + * + */ +public class ProvinceProximityScorer implements LinkedEntityScorer<AdminBoundaryContext> { + + private Map<String, Set<String>> nameCodesMap; + String dominantCode = ""; + private Map<String, String> regexMap = new HashMap<>(); + + @Override + public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, AdminBoundaryContext additionalContext) { + if (!additionalContext.getProvHits().isEmpty()) { + regexMap = additionalContext.getProvinceRegexMap(); + score(linkedSpans, additionalContext.getProvMentions(), additionalContext.getNameCodesMap(), docText, sentenceSpans, 1000); + } else { + for (LinkedSpan<BaseLink> span : linkedSpans) { + for (BaseLink link : span.getLinkedEntries()) { + link.getScoreMap().put("provincecontext", 0d); + } + } + } + + } + + /** + * Assigns a score to each BaseLink in each linkedSpan's set of N best + * matches. Currently the scoring indicates the probability that the toponym + * is correct based on the country context in the document + * + * @param linkedData the linked spans, holds the Namefinder results, and the + * list of BaseLink for each + * @param countryHits all the country mentions in the document + * @param nameCodesMap maps a province indicator name to a province code. Used + * to determine if the namefinder found the same exact toponym the country + * context did. If so the score is boosted due to the high probability that + * the NameFinder actually "rediscovered" a country + * @param docText the full text of the document...not used in this default + * implementation + * @param sentences the sentences that correspond to the doc text. + * @param maxAllowedDist a constant that is used to determine which country + * mentions, based on proximity within the text, should be used to score the + * Named Entity. + * @return + */ + public List<LinkedSpan> score(List<LinkedSpan> linkedData, Map<String, Set<Integer>> countryHits, Map<String, Set<String>> nameCodesMap, String docText, Span[] sentences, Integer maxAllowedDist) { + this.nameCodesMap = nameCodesMap; + setDominantCode(countryHits); + for (LinkedSpan<BaseLink> linkedspan : linkedData) { + + linkedspan = simpleProximityAnalysis(sentences, countryHits, linkedspan, maxAllowedDist); + } + return linkedData; + } + + /** + * sets class level variable to a code based on the number of mentions + * + * @param countryHits + */ + private void setDominantCode(Map<String, Set<Integer>> countryHits) { + int hits = -1; + for (String code : countryHits.keySet()) { + if (countryHits.get(code).size() > hits) { + hits = countryHits.get(code).size(); + dominantCode = code; + } + } + } + + /** + * Generates distances from each country mention to the span's location in the + * doc text. Ultimately an attempt to ensure that ambiguously named toponyms + * are resolved to the correct country and coordinate. + * + * @param sentences + * @param countryHits + * @param span + * @return + */ + private LinkedSpan<BaseLink> simpleProximityAnalysis(Span[] sentences, Map<String, Set<Integer>> countryHits, LinkedSpan<BaseLink> span, Integer maxAllowedDistance) { + Double score = 0.0; + /* + * get the index of the actual span, begining of sentence //should generate + * tokens from sentence and create a char offset... //could have large + * sentences due to poor sentence detection or wonky doc text + */ + int sentenceIdx = span.getSentenceid(); + int sentIndexInDoc = sentences[sentenceIdx].getStart(); + /** + * create a map of all the span's proximal country mentions in the document + * Map< countrycode, set of <distances from this NamedEntity>> + */ + Map<String, Set<Integer>> distancesFromCodeMap = new HashMap<String, Set<Integer>>(); + //map = Map<countrycode, Set <of distances this span is from all the mentions of the code>> + for (String cCode : countryHits.keySet()) { +//iterate over all the regex start values and calculate an offset + for (Integer cHit : countryHits.get(cCode)) { + Integer absDist = Math.abs(sentIndexInDoc - cHit); + //only include near mentions based on a heuristic + //TODO make this a property + // if (absDist < maxAllowedDistance) { + if (distancesFromCodeMap.containsKey(cCode)) { + distancesFromCodeMap.get(cCode).add(absDist); + } else { + HashSet<Integer> newset = new HashSet<Integer>(); + newset.add(absDist); + distancesFromCodeMap.put(cCode, newset); + } + } + + //} + } + //we now know how far this named entity is from every country mention in the document + + /** + * the gaz matches that have a country code that have mentions in the doc + * that are closest to the Named Entity should return the best score. + * Analyzemap generates a likelihood score that the toponym from the gaz is + * referring to one of the countries, i.e, Map<countrycode, prob that this + * span is referring to the toponym form this code key> + */ + Map<String, Double> scoreMap = analyzeMap(distancesFromCodeMap, sentences, span); + if (scoreMap.isEmpty()) { + return span; + } + for (BaseLink link : span.getLinkedEntries()) { + //getItemParentId is the country code + GazetteerEntry entry = (GazetteerEntry) link; + String spanCountryCode = entry.getProvinceCode(); + if (scoreMap.containsKey(spanCountryCode)) { + + score = scoreMap.get(spanCountryCode); + ///does the name extracted match a province name? + if (nameCodesMap.containsKey(link.getItemName().toLowerCase()) || regexMatch(link.getItemName(), link.getItemParentID())) { + //if so, is it the correct country code for that name? + if (nameCodesMap.get(entry.getItemName().toLowerCase()).contains(entry.getProvinceCode())) { + //boost the score becuase it is likely that this is the location in the text, so add 50% to the score or set to 1 + //TODO: make this smarter + score = (score + .75) > 1.0 ? 1d : (score + .75); + + if (entry.getProvinceCode().equals(dominantCode)) { + score = (score + .25) > 1.0 ? 1d : (score + .25); + } + } + } + } + link.getScoreMap().put("provincecontext", score); + } + return span; + } + + private boolean regexMatch(String placeName, String countryCode) { + if (regexMap.containsKey(countryCode)) { + String regexForCountry = regexMap.get(countryCode); + + Pattern p = Pattern.compile(regexForCountry, Pattern.DOTALL | Pattern.CASE_INSENSITIVE); + return p.matcher(placeName.trim()).matches(); + } + return false; + } + + /** + * takes a map of distances from the toponym to each province mention and + * generates a map of scores for each province code. The map is then + * correlated to the code of the BaseLink parentid for retrieval. Then the + * score is added to the overall list. + * + * @param distanceMap + * @param sentences + * @param span + * @return + */ + private Map<String, Double> analyzeMap(Map<String, Set<Integer>> distanceMap, Span[] sentences, LinkedSpan<BaseLink> span) { + + Map<String, Double> scoreMap = new HashMap<String, Double>(); + if (distanceMap.isEmpty()) { + return scoreMap; + } + TreeSet<Integer> all = new TreeSet<Integer>(); + for (String key : distanceMap.keySet()) { + all.addAll(distanceMap.get(key)); + } + //get min max for normalization, this could be more efficient + + Integer min = all.first(); + Integer max = all.last(); + if (min == max) { + min = 0; + } + for (String key : distanceMap.keySet()) { + + TreeSet<Double> normalizedDistances = new TreeSet<Double>(); + for (Integer i : distanceMap.get(key)) { + Double norm = normalize(i, min, max); + //reverse the normed distance so low numbers (closer) are better + //this could be improved with a "decaying " function using an imcreaseing negative exponent + Double reverse = Math.abs(norm - 1); + normalizedDistances.add(reverse); + } + + List<Double> doubles = new ArrayList<Double>(normalizedDistances); + scoreMap.put(key, slidingDistanceAverage(doubles)); + } + return scoreMap; + } + + /** + * this method is an attempt to make closer clusters of mentions group + * together to smooth out the average, so one distant outlier does not kill + * the score for an obviously good hit. More elegant solution is possible + * using Math.pow, and making the score decay with distance by using an + * increasing negative exponent + * + * @param normDis the normalized and sorted set of distances as a list + * @return + */ + private Double slidingDistanceAverage(List<Double> normDis) { + List<Double> windowOfAverages = new ArrayList<Double>(); + + if (normDis.size() < 3) { + windowOfAverages.addAll(normDis); + } else { + + for (int i = 0; i < normDis.size() - 1; i++) { + double a = normDis.get(i); + double b = normDis.get(i + 1); + windowOfAverages.add((a + b) / 2); + + } + } + double sum = 0d; + for (double d : windowOfAverages) { + sum += d; + } + double result = sum / windowOfAverages.size(); + //TODO: ++ prob when large amounts of mentions for a code + //System.out.println("avg of window:" + result); + return result; + } + + /** + * transposes a value within one range to a relative value in a different + * range. Used to normalize distances in this class. + * + * @param valueToNormalize the value to place within the new range + * @param minimum the min of the set to be transposed + * @param maximum the max of the set to be transposed + * @return + */ + private Double normalize(int valueToNormalize, int minimum, int maximum) { + Double d = (double) ((1 - 0) * (valueToNormalize - minimum)) / (maximum - minimum) + 0; + d = d == null ? 0d : d; + return d; + } +}
