http://git-wip-us.apache.org/repos/asf/opennlp-addons/blob/9adc2525/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/RegionProcessor.java ---------------------------------------------------------------------- diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/RegionProcessor.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/RegionProcessor.java index f457822..027efc2 100644 --- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/RegionProcessor.java +++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/RegionProcessor.java @@ -1,113 +1,115 @@ -/* - * Copyright 2014 The Apache Software Foundation. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package opennlp.addons.geoentitylinker.indexing; - -import java.io.BufferedReader; -import java.io.File; -import java.io.FileReader; -import java.io.FileWriter; -import java.util.ArrayList; -import java.util.List; -import org.apache.lucene.document.Document; -import org.apache.lucene.document.Field; -import org.apache.lucene.document.StringField; -import org.apache.lucene.document.TextField; -import org.apache.lucene.index.IndexWriter; - -public class RegionProcessor { - - public static void main(String[] args) { - RegionProcessor.process(new File("C:\\temp\\gazetteers\\regions.txt"), new File("C:\\temp\\gazetteers\\testRegionContext.txt"), null); - } - - /** - * - * @param regionsFile the file that stores Region references. the format of - * this file is tab delimitted text with index 0 as the name of the region, - * index 1 as the longitude, and index 2 as the latitude - * @param outputCountryContextfile this is the country context files shared by - * all indexing processors - * @param w - */ - public static void process(File regionsFile, File outputCountryContextfile, IndexWriter w) { - try { - readFile(regionsFile, outputCountryContextfile, w); - } catch (Exception ex) { - ex.printStackTrace(); - } - } - - public static void readFile(File gazateerInputData, File outputCountryContextfile, IndexWriter w) throws Exception { - List<String> ccfileentries = new ArrayList<>(); - BufferedReader reader = new BufferedReader(new FileReader(gazateerInputData)); - List<String> fields = new ArrayList<>(); - int counter = 0; - System.out.println("reading gazetteer data from Regions file..........."); - String line = ""; - while ((line = reader.readLine()) != null) { - - String[] values = line.split("\t"); - if (counter == 0) { - - } else { - Document doc = new Document(); - for (int i = 0; i < fields.size() - 1; i++) { - doc.add(new TextField(fields.get(i), values[i].trim(), Field.Store.YES)); - } - String placeName = values[0]; - String lat = values[2]; - String lon = values[1]; - String dsg = "region"; - String id = "rg" + counter; - - String hierarchy = placeName; - - doc.add(new TextField("hierarchy", hierarchy, Field.Store.YES)); - doc.add(new TextField("placename", placeName, Field.Store.YES)); - doc.add(new StringField("latitude", lat, Field.Store.YES)); - doc.add(new StringField("longitude", lon, Field.Store.YES)); - doc.add(new StringField("loctype", dsg, Field.Store.YES)); - doc.add(new StringField("admincode", "", Field.Store.YES)); - doc.add(new StringField("countrycode", id, Field.Store.YES)); - doc.add(new StringField("countycode", "", Field.Store.YES)); - - doc.add(new StringField("locid", id, Field.Store.YES)); - doc.add(new StringField("gazsource", "region", Field.Store.YES)); - //countrycontext file format - // US KY 131 United States Kentucky Leslie - - ccfileentries.add(id + "\t" + id + "\t" + id + "\t" + placeName + "\t" + "NO_DATA_FOUND" + "\t" + "NO_DATA_FOUND" + "\t" + "(" - + placeName + ")" + "\t" + "NO_DATA_FOUND" + "\t" + "NO_DATA_FOUND" + "\n"); - if (w != null) { - w.addDocument(doc); - } - } - counter++; - - } - if (w != null) { - w.commit(); - } - FileWriter writer = new FileWriter(outputCountryContextfile, true); - for (String string : ccfileentries) { - writer.write(string); - } - System.out.println("successfully wrote Region entries to country oontext file"); - writer.close(); - System.out.println("Completed indexing regions!"); - } - -} +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.addons.geoentitylinker.indexing; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.io.FileWriter; +import java.util.ArrayList; +import java.util.List; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.StringField; +import org.apache.lucene.document.TextField; +import org.apache.lucene.index.IndexWriter; + +public class RegionProcessor { + + public static void main(String[] args) { + RegionProcessor.process(new File("C:\\temp\\gazetteers\\regions.txt"), new File("C:\\temp\\gazetteers\\testRegionContext.txt"), null); + } + + /** + * + * @param regionsFile the file that stores Region references. the format of + * this file is tab delimitted text with index 0 as the name of the region, + * index 1 as the longitude, and index 2 as the latitude + * @param outputCountryContextfile this is the country context files shared by + * all indexing processors + * @param w + */ + public static void process(File regionsFile, File outputCountryContextfile, IndexWriter w) { + try { + readFile(regionsFile, outputCountryContextfile, w); + } catch (Exception ex) { + ex.printStackTrace(); + } + } + + public static void readFile(File gazateerInputData, File outputCountryContextfile, IndexWriter w) throws Exception { + List<String> ccfileentries = new ArrayList<>(); + BufferedReader reader = new BufferedReader(new FileReader(gazateerInputData)); + List<String> fields = new ArrayList<>(); + int counter = 0; + System.out.println("reading gazetteer data from Regions file..........."); + String line = ""; + while ((line = reader.readLine()) != null) { + + String[] values = line.split("\t"); + if (counter == 0) { + + } else { + Document doc = new Document(); + for (int i = 0; i < fields.size() - 1; i++) { + doc.add(new TextField(fields.get(i), values[i].trim(), Field.Store.YES)); + } + String placeName = values[0]; + String lat = values[2]; + String lon = values[1]; + String dsg = "region"; + String id = "rg" + counter; + + String hierarchy = placeName; + + doc.add(new TextField("hierarchy", hierarchy, Field.Store.YES)); + doc.add(new TextField("placename", placeName, Field.Store.YES)); + doc.add(new StringField("latitude", lat, Field.Store.YES)); + doc.add(new StringField("longitude", lon, Field.Store.YES)); + doc.add(new StringField("loctype", dsg, Field.Store.YES)); + doc.add(new StringField("admincode", "", Field.Store.YES)); + doc.add(new StringField("countrycode", id, Field.Store.YES)); + doc.add(new StringField("countycode", "", Field.Store.YES)); + + doc.add(new StringField("locid", id, Field.Store.YES)); + doc.add(new StringField("gazsource", "region", Field.Store.YES)); + //countrycontext file format + // US KY 131 United States Kentucky Leslie + + ccfileentries.add(id + "\t" + id + "\t" + id + "\t" + placeName + "\t" + "NO_DATA_FOUND" + "\t" + "NO_DATA_FOUND" + "\t" + "(" + + placeName + ")" + "\t" + "NO_DATA_FOUND" + "\t" + "NO_DATA_FOUND" + "\n"); + if (w != null) { + w.addDocument(doc); + } + } + counter++; + + } + if (w != null) { + w.commit(); + } + FileWriter writer = new FileWriter(outputCountryContextfile, true); + for (String string : ccfileentries) { + writer.write(string); + } + System.out.println("successfully wrote Region entries to country oontext file"); + writer.close(); + System.out.println("Completed indexing regions!"); + } + +}
http://git-wip-us.apache.org/repos/asf/opennlp-addons/blob/9adc2525/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/USGSProcessor.java ---------------------------------------------------------------------- diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/USGSProcessor.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/USGSProcessor.java index fcd61c1..61b2120 100644 --- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/USGSProcessor.java +++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/USGSProcessor.java @@ -1,251 +1,254 @@ -/* - * Copyright 2014 The Apache Software Foundation. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package opennlp.addons.geoentitylinker.indexing; - -import java.io.BufferedReader; -import java.io.File; -import java.io.FileReader; -import java.io.FileWriter; -import java.io.IOException; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.logging.Level; -import java.util.logging.Logger; -import opennlp.addons.geoentitylinker.AdminBoundary; -import org.apache.lucene.document.Document; -import org.apache.lucene.document.Field; -import org.apache.lucene.document.StringField; -import org.apache.lucene.document.TextField; - -import org.apache.lucene.index.IndexWriter; - -public class USGSProcessor { - - public static void main(String[] args) { - try { - Map<String, AdminBoundary> provData = getProvData(new File("C:\\temp\\gazetteers\\usgsdata\\GOVT_UNITS_20140601.txt"), GazetteerIndexer.GazType.USGS); - process(new File("C:\\temp\\gazetteers\\usgsdata\\GOVT_UNITS_20140601.txt"), new File("C:\\temp\\gazetteers\\usgsdata\\NationalFile_20140601.txt"), null, null); - } catch (Exception ex) { - Logger.getLogger(USGSProcessor.class.getName()).log(Level.SEVERE, null, ex); - } - } - - public static void process(File lookupData, File usgsGazDataFile, File outputCountryContextfile, IndexWriter w) throws Exception { - Map<String, AdminBoundary> provData = getProvData(lookupData, GazetteerIndexer.GazType.USGS); - readFile(usgsGazDataFile, w, GazetteerIndexer.GazType.USGS, provData); - writeCountryContextFile(outputCountryContextfile, provData); - } - - public static void readFile(File gazateerInputData, IndexWriter w, GazetteerIndexer.GazType type, Map<String, AdminBoundary> lookupMap) throws Exception { - - Map<String, StateCentroid> states = new HashMap<>(); - BufferedReader reader = new BufferedReader(new FileReader(gazateerInputData)); - List<String> fields = new ArrayList<>(); - int counter = 0; - System.out.println("reading gazetteer data from USGS file..........."); - String line = ""; - while ((line = reader.readLine()) != null) { - - String[] values = line.split(type.getSeparator()); - if (counter == 0) { - for (String columnName : values) { - fields.add(columnName.replace("»¿", "").trim()); - } - - } else { - Document doc = new Document(); - for (int i = 0; i < fields.size() - 1; i++) { - doc.add(new TextField(fields.get(i), values[i].trim(), Field.Store.YES)); - } - String placeName = values[1]; - String lat = values[9]; - String lon = values[10]; - String dsg = values[2]; - String id = values[0]; - - String ccode = values[6]; - String admincode = values[3]; - AdminBoundary get = lookupMap.get(admincode + "." + ccode); - String countyname = ""; - if (get == null) { - System.out.println("null...continuing to index" + " ccode: " + ccode + " , admincode: " + admincode + " , placename: " + placeName); - continue; - - } - String countyCode = get.getCountyCode(); - - if (!get.getCountyName().equals("NO_DATA_FOUND_VALUE")) { - countyname = get.getCountyName(); - } - if (!get.getCountyCode().equals("NO_DATA_FOUND_VALUE")) { - countyCode = get.getCountyCode(); - } - String hierarchy = get.getCountryName() + ", " + get.getProvinceName() + ", " + countyname + ", " + placeName; - - if (states.containsKey(get.getProvinceName())) { - StateCentroid entry = states.get(get.getProvinceName()); - entry.count++; - entry.latSum += Double.valueOf(lat); - entry.longSum += Double.valueOf(lon); - } else { - StateCentroid centroid = new StateCentroid(); - centroid.statecode = get.getProvCode(); - centroid.count = 1; - centroid.latSum = Double.valueOf(lat); - centroid.longSum = Double.valueOf(lon); - states.put(get.getProvinceName(), centroid); - } - - doc.add(new TextField("hierarchy", hierarchy, Field.Store.YES)); - doc.add(new TextField("placename", placeName, Field.Store.YES)); - doc.add(new TextField("latitude", lat, Field.Store.YES)); - doc.add(new TextField("longitude", lon, Field.Store.YES)); - doc.add(new StringField("loctype", dsg, Field.Store.YES)); - doc.add(new StringField("admincode", (get.getCountryCode() + "." + get.getProvCode()).toLowerCase(), Field.Store.YES)); - doc.add(new StringField("countrycode", get.getCountryCode().toLowerCase(), Field.Store.YES)); - doc.add(new StringField("countycode", (get.getCountryCode() + "." + get.getProvCode() + "." + countyCode).toLowerCase(), Field.Store.YES)); - - doc.add(new StringField("locid", id, Field.Store.YES)); - doc.add(new StringField("gazsource", "usgs", Field.Store.YES)); - w.addDocument(doc); - } - counter++; - if (counter % 100000 == 0) { - w.commit(); - System.out.println(counter + " .........USGS entries committed to index.............."); - } - - } - - for (String state : states.keySet()) { - StateCentroid get = states.get(state); - Document doc = new Document(); - doc.add(new TextField("hierarchy", "united states, " + state, Field.Store.YES)); - doc.add(new TextField("placename", state, Field.Store.YES)); - //calculate a centroid for all the points that were in the state - doc.add(new TextField("latitude", (get.latSum / get.count) + "", Field.Store.YES)); - doc.add(new TextField("longitude", (get.longSum / get.count) + "", Field.Store.YES)); - doc.add(new StringField("loctype", "adm1", Field.Store.YES)); - doc.add(new StringField("admincode", get.statecode, Field.Store.YES)); - doc.add(new StringField("countrycode", "us", Field.Store.YES)); - doc.add(new StringField("countycode", "", Field.Store.YES)); - - doc.add(new StringField("locid", "us_state:" + state, Field.Store.YES)); - doc.add(new StringField("gazsource", "usgs", Field.Store.YES)); - w.addDocument(doc); - - // System.out.println(get.statecode + "," + (get.latSum / get.count) + "," + (get.longSum / get.count)); - } - Document doc = new Document(); - doc.add(new TextField("hierarchy", "united states", Field.Store.YES)); - doc.add(new TextField("placename", "united states", Field.Store.YES)); - //calculate a centroid for all the points that were in the state - doc.add(new TextField("latitude", 39.0 + "", Field.Store.YES)); - doc.add(new TextField("longitude", -103.0 + "", Field.Store.YES)); - doc.add(new StringField("loctype", "pcli", Field.Store.YES)); - doc.add(new StringField("admincode", "", Field.Store.YES)); - doc.add(new StringField("countrycode", "us", Field.Store.YES)); - doc.add(new StringField("countycode", "", Field.Store.YES)); - - doc.add(new StringField("locid", "us_centroid" + "unitedstates", Field.Store.YES)); - doc.add(new StringField("gazsource", "usgs", Field.Store.YES)); - //System.out.println("uscentroid," + (sumofLatSums / sumOfCounts) + "," + (sumofLonSums / sumOfCounts)); - - w.addDocument(doc); - w.commit(); - - System.out.println("Completed indexing USGS gaz!"); - } - - private static class StateCentroid { - - double latSum; - double longSum; - String statecode; - int count; - } - - private static Map<String, AdminBoundary> getProvData(File govUnitsFile, GazetteerIndexer.GazType type) { - System.out.println("Attempting to read USGS province (State) data from: " + govUnitsFile.getPath()); - Map<String, AdminBoundary> outmap = new HashMap<>(); - BufferedReader reader; - - try { - - reader = new BufferedReader(new FileReader(govUnitsFile)); - int i = 0; - String line = ""; - String[] fields = null; - while ((line = reader.readLine()) != null) { - - String[] values = line.split(type.getSeparator()); - if (i == 0) { - fields = values; - i++; - continue; - } - i++; - // System.out.println(i); - String countyCode = values[2]; - String countyName = values[3]; - String stateCode = values[5]; - String stateName = values[6]; - String countryCode = values[7]; - String countryName = values[8]; - AdminBoundary adminBoundary = new AdminBoundary(countryCode, countryName, stateCode, stateName, countyCode, countyName, null, null, null); - outmap.put(stateCode + "." + countyCode, adminBoundary); - // System.out.println(adminBoundary); - - } - reader.close(); - } catch (IOException ex) { - ex.printStackTrace(); - } - System.out.println("Successfully read USGS province (State) data from: " + govUnitsFile.getPath()); - - return outmap; - - } - - public static void writeCountryContextFile(File outfile, Map<String, AdminBoundary> adms) { - // FileWriter writer = null; - try (FileWriter writer = new FileWriter(outfile, true)) { - - for (String admkey : adms.keySet()) { - AdminBoundary adm = adms.get(admkey); - if (adm == null) { - continue; - } - String province = adm.getProvinceName(); - String country = adm.getCountryName(); - /** - * this is the standard format of the country context file... Geonames - * data will have an empty string for the county - */ - String line = adm.getCountryCode() + "\t" + adm.getProvCode() + "\t" + adm.getCountyCode() + "\t" + country + "\t" + province + "\t" + adm.getCountyName() + "\t" - + "(U\\.S\\.[ $]|U\\.S\\.A\\.[ $]|United States|the US[ $]|a us[ $])" + "\t" + adm.getProvinceName() + "\t" + adm.getCountyName() + "\n"; - writer.write(line); - /// System.out.println(line); - - } - } catch (IOException ex) { - Logger.getLogger(GeonamesProcessor.class.getName()).log(Level.SEVERE, null, ex); - } - System.out.println("successfully wrote USGS entries to country context file"); - } -} +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.addons.geoentitylinker.indexing; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.io.FileWriter; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.logging.Level; +import java.util.logging.Logger; + +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.StringField; +import org.apache.lucene.document.TextField; +import org.apache.lucene.index.IndexWriter; + +import opennlp.addons.geoentitylinker.AdminBoundary; + +public class USGSProcessor { + + public static void main(String[] args) { + try { + Map<String, AdminBoundary> provData = getProvData(new File("C:\\temp\\gazetteers\\usgsdata\\GOVT_UNITS_20140601.txt"), GazetteerIndexer.GazType.USGS); + process(new File("C:\\temp\\gazetteers\\usgsdata\\GOVT_UNITS_20140601.txt"), new File("C:\\temp\\gazetteers\\usgsdata\\NationalFile_20140601.txt"), null, null); + } catch (Exception ex) { + Logger.getLogger(USGSProcessor.class.getName()).log(Level.SEVERE, null, ex); + } + } + + public static void process(File lookupData, File usgsGazDataFile, File outputCountryContextfile, IndexWriter w) throws Exception { + Map<String, AdminBoundary> provData = getProvData(lookupData, GazetteerIndexer.GazType.USGS); + readFile(usgsGazDataFile, w, GazetteerIndexer.GazType.USGS, provData); + writeCountryContextFile(outputCountryContextfile, provData); + } + + public static void readFile(File gazateerInputData, IndexWriter w, GazetteerIndexer.GazType type, Map<String, AdminBoundary> lookupMap) throws Exception { + + Map<String, StateCentroid> states = new HashMap<>(); + BufferedReader reader = new BufferedReader(new FileReader(gazateerInputData)); + List<String> fields = new ArrayList<>(); + int counter = 0; + System.out.println("reading gazetteer data from USGS file..........."); + String line = ""; + while ((line = reader.readLine()) != null) { + + String[] values = line.split(type.getSeparator()); + if (counter == 0) { + for (String columnName : values) { + fields.add(columnName.replace("»¿", "").trim()); + } + + } else { + Document doc = new Document(); + for (int i = 0; i < fields.size() - 1; i++) { + doc.add(new TextField(fields.get(i), values[i].trim(), Field.Store.YES)); + } + String placeName = values[1]; + String lat = values[9]; + String lon = values[10]; + String dsg = values[2]; + String id = values[0]; + + String ccode = values[6]; + String admincode = values[3]; + AdminBoundary get = lookupMap.get(admincode + "." + ccode); + String countyname = ""; + if (get == null) { + System.out.println("null...continuing to index" + " ccode: " + ccode + " , admincode: " + admincode + " , placename: " + placeName); + continue; + + } + String countyCode = get.getCountyCode(); + + if (!get.getCountyName().equals("NO_DATA_FOUND_VALUE")) { + countyname = get.getCountyName(); + } + if (!get.getCountyCode().equals("NO_DATA_FOUND_VALUE")) { + countyCode = get.getCountyCode(); + } + String hierarchy = get.getCountryName() + ", " + get.getProvinceName() + ", " + countyname + ", " + placeName; + + if (states.containsKey(get.getProvinceName())) { + StateCentroid entry = states.get(get.getProvinceName()); + entry.count++; + entry.latSum += Double.valueOf(lat); + entry.longSum += Double.valueOf(lon); + } else { + StateCentroid centroid = new StateCentroid(); + centroid.statecode = get.getProvCode(); + centroid.count = 1; + centroid.latSum = Double.valueOf(lat); + centroid.longSum = Double.valueOf(lon); + states.put(get.getProvinceName(), centroid); + } + + doc.add(new TextField("hierarchy", hierarchy, Field.Store.YES)); + doc.add(new TextField("placename", placeName, Field.Store.YES)); + doc.add(new TextField("latitude", lat, Field.Store.YES)); + doc.add(new TextField("longitude", lon, Field.Store.YES)); + doc.add(new StringField("loctype", dsg, Field.Store.YES)); + doc.add(new StringField("admincode", (get.getCountryCode() + "." + get.getProvCode()).toLowerCase(), Field.Store.YES)); + doc.add(new StringField("countrycode", get.getCountryCode().toLowerCase(), Field.Store.YES)); + doc.add(new StringField("countycode", (get.getCountryCode() + "." + get.getProvCode() + "." + countyCode).toLowerCase(), Field.Store.YES)); + + doc.add(new StringField("locid", id, Field.Store.YES)); + doc.add(new StringField("gazsource", "usgs", Field.Store.YES)); + w.addDocument(doc); + } + counter++; + if (counter % 100000 == 0) { + w.commit(); + System.out.println(counter + " .........USGS entries committed to index.............."); + } + + } + + for (String state : states.keySet()) { + StateCentroid get = states.get(state); + Document doc = new Document(); + doc.add(new TextField("hierarchy", "united states, " + state, Field.Store.YES)); + doc.add(new TextField("placename", state, Field.Store.YES)); + //calculate a centroid for all the points that were in the state + doc.add(new TextField("latitude", (get.latSum / get.count) + "", Field.Store.YES)); + doc.add(new TextField("longitude", (get.longSum / get.count) + "", Field.Store.YES)); + doc.add(new StringField("loctype", "adm1", Field.Store.YES)); + doc.add(new StringField("admincode", get.statecode, Field.Store.YES)); + doc.add(new StringField("countrycode", "us", Field.Store.YES)); + doc.add(new StringField("countycode", "", Field.Store.YES)); + + doc.add(new StringField("locid", "us_state:" + state, Field.Store.YES)); + doc.add(new StringField("gazsource", "usgs", Field.Store.YES)); + w.addDocument(doc); + + // System.out.println(get.statecode + "," + (get.latSum / get.count) + "," + (get.longSum / get.count)); + } + Document doc = new Document(); + doc.add(new TextField("hierarchy", "united states", Field.Store.YES)); + doc.add(new TextField("placename", "united states", Field.Store.YES)); + //calculate a centroid for all the points that were in the state + doc.add(new TextField("latitude", 39.0 + "", Field.Store.YES)); + doc.add(new TextField("longitude", -103.0 + "", Field.Store.YES)); + doc.add(new StringField("loctype", "pcli", Field.Store.YES)); + doc.add(new StringField("admincode", "", Field.Store.YES)); + doc.add(new StringField("countrycode", "us", Field.Store.YES)); + doc.add(new StringField("countycode", "", Field.Store.YES)); + + doc.add(new StringField("locid", "us_centroid" + "unitedstates", Field.Store.YES)); + doc.add(new StringField("gazsource", "usgs", Field.Store.YES)); + //System.out.println("uscentroid," + (sumofLatSums / sumOfCounts) + "," + (sumofLonSums / sumOfCounts)); + + w.addDocument(doc); + w.commit(); + + System.out.println("Completed indexing USGS gaz!"); + } + + private static class StateCentroid { + + double latSum; + double longSum; + String statecode; + int count; + } + + private static Map<String, AdminBoundary> getProvData(File govUnitsFile, GazetteerIndexer.GazType type) { + System.out.println("Attempting to read USGS province (State) data from: " + govUnitsFile.getPath()); + Map<String, AdminBoundary> outmap = new HashMap<>(); + BufferedReader reader; + + try { + + reader = new BufferedReader(new FileReader(govUnitsFile)); + int i = 0; + String line = ""; + String[] fields = null; + while ((line = reader.readLine()) != null) { + + String[] values = line.split(type.getSeparator()); + if (i == 0) { + fields = values; + i++; + continue; + } + i++; + // System.out.println(i); + String countyCode = values[2]; + String countyName = values[3]; + String stateCode = values[5]; + String stateName = values[6]; + String countryCode = values[7]; + String countryName = values[8]; + AdminBoundary adminBoundary = new AdminBoundary(countryCode, countryName, stateCode, stateName, countyCode, countyName, null, null, null); + outmap.put(stateCode + "." + countyCode, adminBoundary); + // System.out.println(adminBoundary); + + } + reader.close(); + } catch (IOException ex) { + ex.printStackTrace(); + } + System.out.println("Successfully read USGS province (State) data from: " + govUnitsFile.getPath()); + + return outmap; + + } + + public static void writeCountryContextFile(File outfile, Map<String, AdminBoundary> adms) { + // FileWriter writer = null; + try (FileWriter writer = new FileWriter(outfile, true)) { + + for (String admkey : adms.keySet()) { + AdminBoundary adm = adms.get(admkey); + if (adm == null) { + continue; + } + String province = adm.getProvinceName(); + String country = adm.getCountryName(); + /** + * this is the standard format of the country context file... Geonames + * data will have an empty string for the county + */ + String line = adm.getCountryCode() + "\t" + adm.getProvCode() + "\t" + adm.getCountyCode() + "\t" + country + "\t" + province + "\t" + adm.getCountyName() + "\t" + + "(U\\.S\\.[ $]|U\\.S\\.A\\.[ $]|United States|the US[ $]|a us[ $])" + "\t" + adm.getProvinceName() + "\t" + adm.getCountyName() + "\n"; + writer.write(line); + /// System.out.println(line); + + } + } catch (IOException ex) { + Logger.getLogger(GeonamesProcessor.class.getName()).log(Level.SEVERE, null, ex); + } + System.out.println("successfully wrote USGS entries to country context file"); + } +} http://git-wip-us.apache.org/repos/asf/opennlp-addons/blob/9adc2525/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/CountryProximityScorer.java ---------------------------------------------------------------------- diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/CountryProximityScorer.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/CountryProximityScorer.java index aea8f9b..98c9715 100644 --- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/CountryProximityScorer.java +++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/CountryProximityScorer.java @@ -1,281 +1,283 @@ -/* - * Copyright 2013 The Apache Software Foundation. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package opennlp.addons.geoentitylinker.scoring; - -import java.util.ArrayList; -import java.util.HashMap; -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.TreeSet; -import java.util.regex.Pattern; -import opennlp.addons.geoentitylinker.AdminBoundaryContext; -import opennlp.tools.entitylinker.EntityLinkerProperties; -import opennlp.tools.entitylinker.BaseLink; -import opennlp.tools.entitylinker.LinkedSpan; -import opennlp.tools.util.Span; - -/** - * Scores toponyms based on their proximity to a country mention. Based on the - * heuristic that toponymn mentions are more likely close to their parent - * country mentions. For instance, if the toponym Berlin is mentioned near an - * indicator of Germany, it is more likely to be Berlin Germany than Berlin - * Connecticut (if Connecticut is mentioned further down in the article). - * - * - */ -public class CountryProximityScorer implements LinkedEntityScorer<AdminBoundaryContext> { - - private Map<String, Set<String>> nameCodesMap; - String dominantCode = ""; - private Map<String, String> regexMap = new HashMap<>(); - - @Override - public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, AdminBoundaryContext additionalContext) { - - regexMap = additionalContext.getCountryRegexMap(); - score(linkedSpans, additionalContext.getCountryMentions(), additionalContext.getNameCodesMap(), docText, sentenceSpans, 1000); - - } - - /** - * Assigns a score to each BaseLink in each linkedSpan's set of N best - * matches. Currently the scoring indicates the probability that the toponym - * is correct based on the country context in the document - * - * @param linkedData the linked spans, holds the Namefinder results, and the - * list of BaseLink for each - * @param countryHits all the country mentions in the document - * @param nameCodesMap maps a country indicator name to a country code. Used - * to determine if the namefinder found the same exact toponym the country - * context did. If so the score is boosted due to the high probability that - * the NameFinder actually "rediscovered" a country - * @param docText the full text of the document...not used in this default - * implementation - * @param sentences the sentences that correspond to the doc text. - * @param maxAllowedDist a constant that is used to determine which country - * mentions, based on proximity within the text, should be used to score the - * Named Entity. - * @return - */ - public List<LinkedSpan> score(List<LinkedSpan> linkedData, Map<String, Set<Integer>> countryHits, Map<String, Set<String>> nameCodesMap, String docText, Span[] sentences, Integer maxAllowedDist) { - this.nameCodesMap = nameCodesMap; - setDominantCode(countryHits); - for (LinkedSpan<BaseLink> linkedspan : linkedData) { - - linkedspan = simpleProximityAnalysis(sentences, countryHits, linkedspan, maxAllowedDist); - } - return linkedData; - } - - /** - * sets class level variable to a code based on the number of mentions - * - * @param countryHits - */ - private void setDominantCode(Map<String, Set<Integer>> countryHits) { - int hits = -1; - for (String code : countryHits.keySet()) { - if (countryHits.get(code).size() > hits) { - hits = countryHits.get(code).size(); - dominantCode = code; - } - } - } - - /** - * Generates distances from each country mention to the span's location in the - * doc text. Ultimately an attempt to ensure that ambiguously named toponyms - * are resolved to the correct country and coordinate. - * - * @param sentences - * @param countryHits - * @param span - * @return - */ - private LinkedSpan<BaseLink> simpleProximityAnalysis(Span[] sentences, Map<String, Set<Integer>> countryHits, LinkedSpan<BaseLink> span, Integer maxAllowedDistance) { - Double score = 0.0; - /* - * get the index of the actual span, begining of sentence //should generate - * tokens from sentence and create a char offset... //could have large - * sentences due to poor sentence detection or wonky doc text - */ - int sentenceIdx = span.getSentenceid(); - int sentIndexInDoc = sentences[sentenceIdx].getStart(); - /** - * create a map of all the span's proximal country mentions in the document - * Map< countrycode, set of <distances from this NamedEntity>> - */ - Map<String, Set<Integer>> distancesFromCodeMap = new HashMap<String, Set<Integer>>(); - //map = Map<countrycode, Set <of distances this span is from all the mentions of the code>> - for (String cCode : countryHits.keySet()) { -//iterate over all the regex start values and calculate an offset - for (Integer cHit : countryHits.get(cCode)) { - Integer absDist = Math.abs(sentIndexInDoc - cHit); - //only include near mentions based on a heuristic - //TODO make this a property - // if (absDist < maxAllowedDistance) { - if (distancesFromCodeMap.containsKey(cCode)) { - distancesFromCodeMap.get(cCode).add(absDist); - } else { - HashSet<Integer> newset = new HashSet<Integer>(); - newset.add(absDist); - distancesFromCodeMap.put(cCode, newset); - } - } - - //} - } - //we now know how far this named entity is from every country mention in the document - - /** - * the gaz matches that have a country code that have mentions in the doc - * that are closest to the Named Entity should return the best score. - * Analyzemap generates a likelihood score that the toponym from the gaz is - * referring to one of the countries, i.e, Map<countrycode, prob that this - * span is referring to the toponym form this code key> - */ - Map<String, Double> scoreMap = analyzeMap(distancesFromCodeMap, sentences, span); - for (BaseLink link : span.getLinkedEntries()) { - //getItemParentId is the country code - String spanCountryCode = link.getItemParentID(); - if (scoreMap.containsKey(spanCountryCode)) { - - score = scoreMap.get(spanCountryCode); - ///does the name extracted match a country name? - if (nameCodesMap.containsKey(link.getItemName().toLowerCase()) || regexMatch(link.getItemName(), link.getItemParentID())) { - //if so, is it the correct country code for that name? - if (nameCodesMap.get(link.getItemName().toLowerCase()).contains(link.getItemParentID())) { - //boost the score becuase it is likely that this is the location in the text, so add 50% to the score or set to 1 - score = (score + .75) > 1.0 ? 1d : (score + .75); - - if (link.getItemParentID().equals(dominantCode)) { - score = (score + .25) > 1.0 ? 1d : (score + .25); - } - } - } - } - - link.getScoreMap().put("countrycontext", score); - } - return span; - } - - /** - * takes a map of distances from the toponym to each country mention and - * generates a map of scores for each country code. The map is then correlated - * to the code of the BaseLink parentid for retrieval. Then the score is added - * to the overall list. - * - * @param distanceMap - * @param sentences - * @param span - * @return - */ - private Map<String, Double> analyzeMap(Map<String, Set<Integer>> distanceMap, Span[] sentences, LinkedSpan<BaseLink> span) { - - Map<String, Double> scoreMap = new HashMap<String, Double>(); - if (distanceMap.isEmpty()) { - return scoreMap; - } - TreeSet<Integer> all = new TreeSet<Integer>(); - for (String key : distanceMap.keySet()) { - all.addAll(distanceMap.get(key)); - } - //get min max for normalization, this could be more efficient - - Integer min = all.first(); - Integer max = all.last(); - if (min == max) { - min = 0; - } - for (String key : distanceMap.keySet()) { - - TreeSet<Double> normalizedDistances = new TreeSet<Double>(); - for (Integer i : distanceMap.get(key)) { - Double norm = normalize(i, min, max); - //reverse the normed distance so low numbers (closer) are better - //this could be improved with a "decaying " function using an imcreaseing negative exponent - Double reverse = Math.abs(norm - 1); - normalizedDistances.add(reverse); - } - - List<Double> doubles = new ArrayList<Double>(normalizedDistances); - scoreMap.put(key, slidingDistanceAverage(doubles)); - } - return scoreMap; - } - - private boolean regexMatch(String placeName, String countryCode) { - if (regexMap.containsKey(countryCode)) { - String regexForCountry = regexMap.get(countryCode); - - Pattern p = Pattern.compile(regexForCountry,Pattern.DOTALL|Pattern.CASE_INSENSITIVE); - return p.matcher(placeName.trim()).matches(); - } - return false; - } - - /** - * this method is an attempt to make closer clusters of mentions group - * together to smooth out the average, so one distant outlier does not kill - * the score for an obviously good hit. More elegant solution is possible - * using Math.pow, and making the score decay with distance by using an - * increasing negative exponent (I think) - * - * @param normDis the normalized and sorted set of distances as a list - * @return - */ - private Double slidingDistanceAverage(List<Double> normDis) { - List<Double> windowOfAverages = new ArrayList<Double>(); - - if (normDis.size() < 3) { - windowOfAverages.addAll(normDis); - } else { - - for (int i = 0; i < normDis.size() - 1; i++) { - double a = normDis.get(i); - double b = normDis.get(i + 1); - windowOfAverages.add((a + b) / 2); - - } - } - double sum = 0d; - for (double d : windowOfAverages) { - sum += d; - } - double result = sum / windowOfAverages.size(); - //TODO: ++ prob when large amounts of mentions for a code - //System.out.println("avg of window:" + result); - return result; - } - - /** - * transposes a value within one range to a relative value in a different - * range. Used to normalize distances in this class. - * - * @param valueToNormalize the value to place within the new range - * @param minimum the min of the set to be transposed - * @param maximum the max of the set to be transposed - * @return - */ - private Double normalize(int valueToNormalize, int minimum, int maximum) { - Double d = (double) ((1 - 0) * (valueToNormalize - minimum)) / (maximum - minimum) + 0; - d = d == null ? 0d : d; - return d; - } -} +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.addons.geoentitylinker.scoring; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.TreeSet; +import java.util.regex.Pattern; +import opennlp.addons.geoentitylinker.AdminBoundaryContext; +import opennlp.tools.entitylinker.EntityLinkerProperties; +import opennlp.tools.entitylinker.BaseLink; +import opennlp.tools.entitylinker.LinkedSpan; +import opennlp.tools.util.Span; + +/** + * Scores toponyms based on their proximity to a country mention. Based on the + * heuristic that toponymn mentions are more likely close to their parent + * country mentions. For instance, if the toponym Berlin is mentioned near an + * indicator of Germany, it is more likely to be Berlin Germany than Berlin + * Connecticut (if Connecticut is mentioned further down in the article). + * + * + */ +public class CountryProximityScorer implements LinkedEntityScorer<AdminBoundaryContext> { + + private Map<String, Set<String>> nameCodesMap; + String dominantCode = ""; + private Map<String, String> regexMap = new HashMap<>(); + + @Override + public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, AdminBoundaryContext additionalContext) { + + regexMap = additionalContext.getCountryRegexMap(); + score(linkedSpans, additionalContext.getCountryMentions(), additionalContext.getNameCodesMap(), docText, sentenceSpans, 1000); + + } + + /** + * Assigns a score to each BaseLink in each linkedSpan's set of N best + * matches. Currently the scoring indicates the probability that the toponym + * is correct based on the country context in the document + * + * @param linkedData the linked spans, holds the Namefinder results, and the + * list of BaseLink for each + * @param countryHits all the country mentions in the document + * @param nameCodesMap maps a country indicator name to a country code. Used + * to determine if the namefinder found the same exact toponym the country + * context did. If so the score is boosted due to the high probability that + * the NameFinder actually "rediscovered" a country + * @param docText the full text of the document...not used in this default + * implementation + * @param sentences the sentences that correspond to the doc text. + * @param maxAllowedDist a constant that is used to determine which country + * mentions, based on proximity within the text, should be used to score the + * Named Entity. + * @return + */ + public List<LinkedSpan> score(List<LinkedSpan> linkedData, Map<String, Set<Integer>> countryHits, Map<String, Set<String>> nameCodesMap, String docText, Span[] sentences, Integer maxAllowedDist) { + this.nameCodesMap = nameCodesMap; + setDominantCode(countryHits); + for (LinkedSpan<BaseLink> linkedspan : linkedData) { + + linkedspan = simpleProximityAnalysis(sentences, countryHits, linkedspan, maxAllowedDist); + } + return linkedData; + } + + /** + * sets class level variable to a code based on the number of mentions + * + * @param countryHits + */ + private void setDominantCode(Map<String, Set<Integer>> countryHits) { + int hits = -1; + for (String code : countryHits.keySet()) { + if (countryHits.get(code).size() > hits) { + hits = countryHits.get(code).size(); + dominantCode = code; + } + } + } + + /** + * Generates distances from each country mention to the span's location in the + * doc text. Ultimately an attempt to ensure that ambiguously named toponyms + * are resolved to the correct country and coordinate. + * + * @param sentences + * @param countryHits + * @param span + * @return + */ + private LinkedSpan<BaseLink> simpleProximityAnalysis(Span[] sentences, Map<String, Set<Integer>> countryHits, LinkedSpan<BaseLink> span, Integer maxAllowedDistance) { + Double score = 0.0; + /* + * get the index of the actual span, begining of sentence //should generate + * tokens from sentence and create a char offset... //could have large + * sentences due to poor sentence detection or wonky doc text + */ + int sentenceIdx = span.getSentenceid(); + int sentIndexInDoc = sentences[sentenceIdx].getStart(); + /** + * create a map of all the span's proximal country mentions in the document + * Map< countrycode, set of <distances from this NamedEntity>> + */ + Map<String, Set<Integer>> distancesFromCodeMap = new HashMap<String, Set<Integer>>(); + //map = Map<countrycode, Set <of distances this span is from all the mentions of the code>> + for (String cCode : countryHits.keySet()) { +//iterate over all the regex start values and calculate an offset + for (Integer cHit : countryHits.get(cCode)) { + Integer absDist = Math.abs(sentIndexInDoc - cHit); + //only include near mentions based on a heuristic + //TODO make this a property + // if (absDist < maxAllowedDistance) { + if (distancesFromCodeMap.containsKey(cCode)) { + distancesFromCodeMap.get(cCode).add(absDist); + } else { + HashSet<Integer> newset = new HashSet<Integer>(); + newset.add(absDist); + distancesFromCodeMap.put(cCode, newset); + } + } + + //} + } + //we now know how far this named entity is from every country mention in the document + + /** + * the gaz matches that have a country code that have mentions in the doc + * that are closest to the Named Entity should return the best score. + * Analyzemap generates a likelihood score that the toponym from the gaz is + * referring to one of the countries, i.e, Map<countrycode, prob that this + * span is referring to the toponym form this code key> + */ + Map<String, Double> scoreMap = analyzeMap(distancesFromCodeMap, sentences, span); + for (BaseLink link : span.getLinkedEntries()) { + //getItemParentId is the country code + String spanCountryCode = link.getItemParentID(); + if (scoreMap.containsKey(spanCountryCode)) { + + score = scoreMap.get(spanCountryCode); + ///does the name extracted match a country name? + if (nameCodesMap.containsKey(link.getItemName().toLowerCase()) || regexMatch(link.getItemName(), link.getItemParentID())) { + //if so, is it the correct country code for that name? + if (nameCodesMap.get(link.getItemName().toLowerCase()).contains(link.getItemParentID())) { + //boost the score becuase it is likely that this is the location in the text, so add 50% to the score or set to 1 + score = (score + .75) > 1.0 ? 1d : (score + .75); + + if (link.getItemParentID().equals(dominantCode)) { + score = (score + .25) > 1.0 ? 1d : (score + .25); + } + } + } + } + + link.getScoreMap().put("countrycontext", score); + } + return span; + } + + /** + * takes a map of distances from the toponym to each country mention and + * generates a map of scores for each country code. The map is then correlated + * to the code of the BaseLink parentid for retrieval. Then the score is added + * to the overall list. + * + * @param distanceMap + * @param sentences + * @param span + * @return + */ + private Map<String, Double> analyzeMap(Map<String, Set<Integer>> distanceMap, Span[] sentences, LinkedSpan<BaseLink> span) { + + Map<String, Double> scoreMap = new HashMap<String, Double>(); + if (distanceMap.isEmpty()) { + return scoreMap; + } + TreeSet<Integer> all = new TreeSet<Integer>(); + for (String key : distanceMap.keySet()) { + all.addAll(distanceMap.get(key)); + } + //get min max for normalization, this could be more efficient + + Integer min = all.first(); + Integer max = all.last(); + if (min == max) { + min = 0; + } + for (String key : distanceMap.keySet()) { + + TreeSet<Double> normalizedDistances = new TreeSet<Double>(); + for (Integer i : distanceMap.get(key)) { + Double norm = normalize(i, min, max); + //reverse the normed distance so low numbers (closer) are better + //this could be improved with a "decaying " function using an imcreaseing negative exponent + Double reverse = Math.abs(norm - 1); + normalizedDistances.add(reverse); + } + + List<Double> doubles = new ArrayList<Double>(normalizedDistances); + scoreMap.put(key, slidingDistanceAverage(doubles)); + } + return scoreMap; + } + + private boolean regexMatch(String placeName, String countryCode) { + if (regexMap.containsKey(countryCode)) { + String regexForCountry = regexMap.get(countryCode); + + Pattern p = Pattern.compile(regexForCountry,Pattern.DOTALL|Pattern.CASE_INSENSITIVE); + return p.matcher(placeName.trim()).matches(); + } + return false; + } + + /** + * this method is an attempt to make closer clusters of mentions group + * together to smooth out the average, so one distant outlier does not kill + * the score for an obviously good hit. More elegant solution is possible + * using Math.pow, and making the score decay with distance by using an + * increasing negative exponent (I think) + * + * @param normDis the normalized and sorted set of distances as a list + * @return + */ + private Double slidingDistanceAverage(List<Double> normDis) { + List<Double> windowOfAverages = new ArrayList<Double>(); + + if (normDis.size() < 3) { + windowOfAverages.addAll(normDis); + } else { + + for (int i = 0; i < normDis.size() - 1; i++) { + double a = normDis.get(i); + double b = normDis.get(i + 1); + windowOfAverages.add((a + b) / 2); + + } + } + double sum = 0d; + for (double d : windowOfAverages) { + sum += d; + } + double result = sum / windowOfAverages.size(); + //TODO: ++ prob when large amounts of mentions for a code + //System.out.println("avg of window:" + result); + return result; + } + + /** + * transposes a value within one range to a relative value in a different + * range. Used to normalize distances in this class. + * + * @param valueToNormalize the value to place within the new range + * @param minimum the min of the set to be transposed + * @param maximum the max of the set to be transposed + * @return + */ + private Double normalize(int valueToNormalize, int minimum, int maximum) { + Double d = (double) ((1 - 0) * (valueToNormalize - minimum)) / (maximum - minimum) + 0; + d = d == null ? 0d : d; + return d; + } +} http://git-wip-us.apache.org/repos/asf/opennlp-addons/blob/9adc2525/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/FuzzyStringMatchScorer.java ---------------------------------------------------------------------- diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/FuzzyStringMatchScorer.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/FuzzyStringMatchScorer.java index e9634d9..abe5438 100644 --- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/FuzzyStringMatchScorer.java +++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/FuzzyStringMatchScorer.java @@ -1,123 +1,125 @@ -/* - * Copyright 2013 The Apache Software Foundation. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package opennlp.addons.geoentitylinker.scoring; - -import java.util.ArrayList; -import java.util.HashSet; -import java.util.List; -import java.util.Set; -import opennlp.addons.geoentitylinker.AdminBoundaryContext; -import opennlp.addons.geoentitylinker.GazetteerEntry; -import opennlp.tools.entitylinker.EntityLinkerProperties; -import opennlp.tools.entitylinker.BaseLink; -import opennlp.tools.entitylinker.LinkedSpan; -import opennlp.tools.util.Span; - -/** - * - * Generates scores based on string comparisons levenstein and dice - */ -public class FuzzyStringMatchScorer implements LinkedEntityScorer<AdminBoundaryContext> { - - @Override - public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, AdminBoundaryContext additionalContext) { - for (LinkedSpan<BaseLink> linkedSpan : linkedSpans) { - for (BaseLink link : linkedSpan.getLinkedEntries()) { - if (link instanceof GazetteerEntry) { - GazetteerEntry entry = (GazetteerEntry) link; - String hierarchy = entry.getHierarchy(); - if (hierarchy != null) { - Double dice = getDiceCoefficient(linkedSpan.getSearchTerm().toLowerCase(), hierarchy.toLowerCase(), 2); - link.getScoreMap().put("hierarchydicecoef", dice); - Double ld = (double) getLevenshteinDistance(linkedSpan.getSearchTerm().toLowerCase(), hierarchy.toLowerCase()); - link.getScoreMap().put("hierarchylevenshtein", ld); - } - String placename = entry.getItemName().toLowerCase(); - if (placename != null) { - Double dice = getDiceCoefficient(linkedSpan.getSearchTerm().toLowerCase(), placename, 2); - link.getScoreMap().put("placenamedicecoef", dice); - - } - } - } - } - - } - - /** - * Generates a score based on an overlap of nGrams between two strings using - * the DiceCoefficient technique. - * - * @param s1 first string - * @param s2 second string - * @param nGrams number of chars in each gram - * @return - */ - public double getDiceCoefficient(String s1, String s2, int nGrams) { - if (s1.isEmpty() || s2.isEmpty()) { - return 0d; - } - List<String> s1Grams = new ArrayList<>(); - List<String> s2Grams = new ArrayList<>(); - String[] split1 = s1.split("[ ,]"); - for (String token : split1) { - if (token.trim().equals("")) { - continue; - } - s1Grams.add(token); - } - String[] split2 = s2.split("[ ,]"); - for (String token : split2) { - if (token.trim().equals("")) { - continue; - } - s2Grams.add(token); - } - - Set<String> overlap = new HashSet<String>(s1Grams); - overlap.retainAll(s2Grams); - double totcombigrams = overlap.size(); - - return (2 * totcombigrams) / (s1Grams.size() + s2Grams.size()); - } - - private int minimum(int a, int b, int c) { - return Math.min(Math.min(a, b), c); - } - - public int getLevenshteinDistance(CharSequence str1, - CharSequence str2) { - int[][] distance = new int[str1.length() + 1][str2.length() + 1]; - - for (int i = 0; i <= str1.length(); i++) { - distance[i][0] = i; - } - for (int j = 1; j <= str2.length(); j++) { - distance[0][j] = j; - } - - for (int i = 1; i <= str1.length(); i++) { - for (int j = 1; j <= str2.length(); j++) { - distance[i][j] = minimum( - distance[i - 1][j] + 1, - distance[i][j - 1] + 1, - distance[i - 1][j - 1] + ((str1.charAt(i - 1) == str2.charAt(j - 1)) ? 0 : 1)); - } - } - - return distance[str1.length()][str2.length()]; - } -} +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.addons.geoentitylinker.scoring; + +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import opennlp.addons.geoentitylinker.AdminBoundaryContext; +import opennlp.addons.geoentitylinker.GazetteerEntry; +import opennlp.tools.entitylinker.EntityLinkerProperties; +import opennlp.tools.entitylinker.BaseLink; +import opennlp.tools.entitylinker.LinkedSpan; +import opennlp.tools.util.Span; + +/** + * + * Generates scores based on string comparisons levenstein and dice + */ +public class FuzzyStringMatchScorer implements LinkedEntityScorer<AdminBoundaryContext> { + + @Override + public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, AdminBoundaryContext additionalContext) { + for (LinkedSpan<BaseLink> linkedSpan : linkedSpans) { + for (BaseLink link : linkedSpan.getLinkedEntries()) { + if (link instanceof GazetteerEntry) { + GazetteerEntry entry = (GazetteerEntry) link; + String hierarchy = entry.getHierarchy(); + if (hierarchy != null) { + Double dice = getDiceCoefficient(linkedSpan.getSearchTerm().toLowerCase(), hierarchy.toLowerCase(), 2); + link.getScoreMap().put("hierarchydicecoef", dice); + Double ld = (double) getLevenshteinDistance(linkedSpan.getSearchTerm().toLowerCase(), hierarchy.toLowerCase()); + link.getScoreMap().put("hierarchylevenshtein", ld); + } + String placename = entry.getItemName().toLowerCase(); + if (placename != null) { + Double dice = getDiceCoefficient(linkedSpan.getSearchTerm().toLowerCase(), placename, 2); + link.getScoreMap().put("placenamedicecoef", dice); + + } + } + } + } + + } + + /** + * Generates a score based on an overlap of nGrams between two strings using + * the DiceCoefficient technique. + * + * @param s1 first string + * @param s2 second string + * @param nGrams number of chars in each gram + * @return + */ + public double getDiceCoefficient(String s1, String s2, int nGrams) { + if (s1.isEmpty() || s2.isEmpty()) { + return 0d; + } + List<String> s1Grams = new ArrayList<>(); + List<String> s2Grams = new ArrayList<>(); + String[] split1 = s1.split("[ ,]"); + for (String token : split1) { + if (token.trim().equals("")) { + continue; + } + s1Grams.add(token); + } + String[] split2 = s2.split("[ ,]"); + for (String token : split2) { + if (token.trim().equals("")) { + continue; + } + s2Grams.add(token); + } + + Set<String> overlap = new HashSet<String>(s1Grams); + overlap.retainAll(s2Grams); + double totcombigrams = overlap.size(); + + return (2 * totcombigrams) / (s1Grams.size() + s2Grams.size()); + } + + private int minimum(int a, int b, int c) { + return Math.min(Math.min(a, b), c); + } + + public int getLevenshteinDistance(CharSequence str1, + CharSequence str2) { + int[][] distance = new int[str1.length() + 1][str2.length() + 1]; + + for (int i = 0; i <= str1.length(); i++) { + distance[i][0] = i; + } + for (int j = 1; j <= str2.length(); j++) { + distance[0][j] = j; + } + + for (int i = 1; i <= str1.length(); i++) { + for (int j = 1; j <= str2.length(); j++) { + distance[i][j] = minimum( + distance[i - 1][j] + 1, + distance[i][j - 1] + 1, + distance[i - 1][j - 1] + ((str1.charAt(i - 1) == str2.charAt(j - 1)) ? 0 : 1)); + } + } + + return distance[str1.length()][str2.length()]; + } +} http://git-wip-us.apache.org/repos/asf/opennlp-addons/blob/9adc2525/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/GeoHashBinningScorer.java ---------------------------------------------------------------------- diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/GeoHashBinningScorer.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/GeoHashBinningScorer.java index d3494e0..98bad74 100644 --- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/GeoHashBinningScorer.java +++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/GeoHashBinningScorer.java @@ -1,62 +1,64 @@ -/* - * Copyright 2013 The Apache Software Foundation. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package opennlp.addons.geoentitylinker.scoring; - -import java.util.ArrayList; -import java.util.List; -import java.util.Map; -import opennlp.addons.geoentitylinker.AdminBoundaryContext; -import opennlp.addons.geoentitylinker.GazetteerEntry; -import opennlp.tools.entitylinker.EntityLinkerProperties; -import opennlp.tools.entitylinker.BaseLink; -import opennlp.tools.entitylinker.LinkedSpan; -import opennlp.tools.util.Span; - -/** - * Scores toponymns based on geographic point binning. Based on the heuristic - * that docs are generally about a small amount of locations, so one can detect - * outliers by finding those points that are not near the majority - * - */ -public class GeoHashBinningScorer implements LinkedEntityScorer<AdminBoundaryContext> { - - private final PointClustering CLUSTERER = new PointClustering(); - private int PRECISION = 3; - - @Override - public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, AdminBoundaryContext additionalContext) { - //Map<Double, Double> latLongs = new HashMap<Double, Double>(); - List<GazetteerEntry> allGazEntries = new ArrayList<>(); - - /** - * collect all the gaz entry references - */ - for (LinkedSpan<BaseLink> ls : linkedSpans) { - for (BaseLink bl : ls.getLinkedEntries()) { - if (bl instanceof GazetteerEntry) { - allGazEntries.add((GazetteerEntry) bl); - } - } - } - /** - * use the point clustering to score each hit - */ - Map<String, List<GazetteerEntry>> cluster = CLUSTERER.cluster(allGazEntries, PRECISION); - CLUSTERER.scoreClusters(cluster); - - } - -} +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.addons.geoentitylinker.scoring; + +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import opennlp.addons.geoentitylinker.AdminBoundaryContext; +import opennlp.addons.geoentitylinker.GazetteerEntry; +import opennlp.tools.entitylinker.EntityLinkerProperties; +import opennlp.tools.entitylinker.BaseLink; +import opennlp.tools.entitylinker.LinkedSpan; +import opennlp.tools.util.Span; + +/** + * Scores toponymns based on geographic point binning. Based on the heuristic + * that docs are generally about a small amount of locations, so one can detect + * outliers by finding those points that are not near the majority + * + */ +public class GeoHashBinningScorer implements LinkedEntityScorer<AdminBoundaryContext> { + + private final PointClustering CLUSTERER = new PointClustering(); + private int PRECISION = 3; + + @Override + public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, AdminBoundaryContext additionalContext) { + //Map<Double, Double> latLongs = new HashMap<Double, Double>(); + List<GazetteerEntry> allGazEntries = new ArrayList<>(); + + /** + * collect all the gaz entry references + */ + for (LinkedSpan<BaseLink> ls : linkedSpans) { + for (BaseLink bl : ls.getLinkedEntries()) { + if (bl instanceof GazetteerEntry) { + allGazEntries.add((GazetteerEntry) bl); + } + } + } + /** + * use the point clustering to score each hit + */ + Map<String, List<GazetteerEntry>> cluster = CLUSTERER.cluster(allGazEntries, PRECISION); + CLUSTERER.scoreClusters(cluster); + + } + +} http://git-wip-us.apache.org/repos/asf/opennlp-addons/blob/9adc2525/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/LinkedEntityScorer.java ---------------------------------------------------------------------- diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/LinkedEntityScorer.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/LinkedEntityScorer.java index 5fb9c5d..843d9b8 100644 --- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/LinkedEntityScorer.java +++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/LinkedEntityScorer.java @@ -1,40 +1,42 @@ -/* - * Copyright 2013 The Apache Software Foundation. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package opennlp.addons.geoentitylinker.scoring; - -import java.util.List; -import opennlp.tools.entitylinker.EntityLinkerProperties; -import opennlp.tools.entitylinker.LinkedSpan; -import opennlp.tools.util.Span; - -/** - * Structure for scoring linked entities. The Map logically represents a pair : - * "Score type" to the "actual Score." - * @param <T> a generic for providing additional context - */ -public interface LinkedEntityScorer<T> { - -/** - * Scores a collection of linked entities. Implementations should populate the scoreMap in the list of BaseLink for each linkedSpan - * this method internally affects the reference to linkedSpans that was passed in - * @param linkedSpans the spans that have been linked to some external source and have all the data they need to be scored - * @param docText the full text of the document. - * @param sentenceSpans the sentence spans the correspond to the document text - * @param properties the entitylinker properties config file - * @param additionalContext any additional data required to perform the scoring operation - */ - void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, T additionalContext); -} +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.addons.geoentitylinker.scoring; + +import java.util.List; +import opennlp.tools.entitylinker.EntityLinkerProperties; +import opennlp.tools.entitylinker.LinkedSpan; +import opennlp.tools.util.Span; + +/** + * Structure for scoring linked entities. The Map logically represents a pair : + * "Score type" to the "actual Score." + * @param <T> a generic for providing additional context + */ +public interface LinkedEntityScorer<T> { + +/** + * Scores a collection of linked entities. Implementations should populate the scoreMap in the list of BaseLink for each linkedSpan + * this method internally affects the reference to linkedSpans that was passed in + * @param linkedSpans the spans that have been linked to some external source and have all the data they need to be scored + * @param docText the full text of the document. + * @param sentenceSpans the sentence spans the correspond to the document text + * @param properties the entitylinker properties config file + * @param additionalContext any additional data required to perform the scoring operation + */ + void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, T additionalContext); +} http://git-wip-us.apache.org/repos/asf/opennlp-addons/blob/9adc2525/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ModelBasedScorer.java ---------------------------------------------------------------------- diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ModelBasedScorer.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ModelBasedScorer.java index 01b3269..034c526 100644 --- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ModelBasedScorer.java +++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ModelBasedScorer.java @@ -1,160 +1,163 @@ -/* - * Copyright 2013 The Apache Software Foundation. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package opennlp.addons.geoentitylinker.scoring; - -import java.io.File; -import java.io.FileNotFoundException; -import java.io.IOException; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import opennlp.addons.geoentitylinker.AdminBoundaryContext; -import opennlp.tools.doccat.DoccatModel; -import opennlp.tools.doccat.DocumentCategorizerME; -import opennlp.tools.entitylinker.EntityLinkerProperties; -import opennlp.tools.entitylinker.BaseLink; -import opennlp.tools.entitylinker.LinkedSpan; -import opennlp.tools.util.Span; -import org.apache.log4j.Logger; - -/** - * - * Utilizes a doccat model to score toponyms based on surrounding context - */ -public class ModelBasedScorer implements LinkedEntityScorer<AdminBoundaryContext> { - - private static final Logger LOGGER = Logger.getLogger(ModelBasedScorer.class); - DocumentCategorizerME documentCategorizerME; - DoccatModel doccatModel; - public static final int RADIUS = 200; - boolean modelexists = false; - - @Override - public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, AdminBoundaryContext additionalContext) { - try { - if (doccatModel == null) { - String path = properties.getProperty("opennlp.geoentitylinker.modelbasedscorer.modelpath", ""); - if (path.equals("")) { - return; - } - modelexists = true; - doccatModel = new DoccatModel(new File(path)); - documentCategorizerME = new DocumentCategorizerME(doccatModel); - } - Map<Integer, String> proximalFeatures = generateProximalFeatures(linkedSpans, sentenceSpans, docText, RADIUS); - for (Map.Entry<Integer, String> entry : proximalFeatures.entrySet()) { - Map<String, Double> scores = this.getScore(entry.getValue()); - for (BaseLink link : (List<BaseLink>) linkedSpans.get(entry.getKey()).getLinkedEntries()) { - double score = 0d; - if (scores.containsKey(link.getItemParentID())) { - score = scores.get(link.getItemParentID()); - } - link.getScoreMap().put("countrymodel", score); - } - } - - } catch (FileNotFoundException ex) { - LOGGER.error(ex); - } catch (IOException ex) { - LOGGER.error(ex); - } catch (Exception ex) { - LOGGER.error(ex); - } - } - - /** - * generates features using a BagOfWordsfeatureGenerator that are within the - * radius of a mention within the doctext - * - * @param linkedSpans - * @param sentenceSpans - * @param docText - * @param radius - * @return a map of the index of the linked span to the string of surrounding - * text: Map<indexofspan,surrounding text> - */ - public Map<Integer, String> generateProximalFeatures(List<LinkedSpan> linkedSpans, Span[] sentenceSpans, String docText, int radius) { - Map<Integer, String> featureBags = new HashMap<>(); - Map<Integer, Integer> nameMentionMap = new HashMap<>(); - /** - * iterator over the map that contains a mapping of every country code to - * all of its mentions in the document - */ - for (int i = 0; i < linkedSpans.size(); i++) { - LinkedSpan span = linkedSpans.get(i); - if (span.getLinkedEntries().isEmpty()) { - //don't care about spans that did not get linked to anything at all; nothing to work with - continue; - } - /** - * get the sentence the name span was found in, the beginning of the - * sentence will suffice as a centroid for feature generation around the - * named entity - */ - Integer mentionIdx = sentenceSpans[span.getSentenceid()].getStart(); - nameMentionMap.put(i, mentionIdx); - } - /** - * now associate each span to a string that will be used for categorization - * against the model. - */ - for (Map.Entry<Integer, Integer> entry : nameMentionMap.entrySet()) { - featureBags.put(entry.getKey(), getTextChunk(entry.getValue(), docText, radius)); - } - - return featureBags; - } - - public String getTextChunk(int mentionIdx, String docText, int radius) { - int docSize = docText.length(); - int left = 0, right = 0; - left = (mentionIdx - radius < 0) ? 0 : mentionIdx - radius; - right = (mentionIdx + radius > docSize) ? docSize : mentionIdx + radius; - String chunk = ""; - if (right <= left) { - chunk = ""; - } else { - /** - * don't want to chop any words in half, so take fron the first space to - * the last space in the chunk string - */ - chunk = docText.substring(left, right); - if (left != 0) { - left = chunk.indexOf(" "); - } - right = chunk.lastIndexOf(" "); - /** - * now get the substring again with only whole words - */ - if (left < right) { - chunk = chunk.substring(left, right); - } - } - - return chunk; - } - - private Map<String, Double> getScore(String text) throws Exception { - Map<String, Double> scoreMap = new HashMap<>(); - double[] categorize = documentCategorizerME.categorize(text); - int catSize = documentCategorizerME.getNumberOfCategories(); - for (int i = 0; i < catSize; i++) { - String category = documentCategorizerME.getCategory(i); - scoreMap.put(category, categorize[documentCategorizerME.getIndex(category)]); - } - return scoreMap; - } -} +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.addons.geoentitylinker.scoring; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.apache.log4j.Logger; + +import opennlp.addons.geoentitylinker.AdminBoundaryContext; +import opennlp.tools.doccat.DoccatModel; +import opennlp.tools.doccat.DocumentCategorizerME; +import opennlp.tools.entitylinker.BaseLink; +import opennlp.tools.entitylinker.EntityLinkerProperties; +import opennlp.tools.entitylinker.LinkedSpan; +import opennlp.tools.util.Span; + +/** + * Utilizes a doccat model to score toponyms based on surrounding context + */ +public class ModelBasedScorer implements LinkedEntityScorer<AdminBoundaryContext> { + + private static final Logger LOGGER = Logger.getLogger(ModelBasedScorer.class); + DocumentCategorizerME documentCategorizerME; + DoccatModel doccatModel; + public static final int RADIUS = 200; + boolean modelexists = false; + + @Override + public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, AdminBoundaryContext additionalContext) { + try { + if (doccatModel == null) { + String path = properties.getProperty("opennlp.geoentitylinker.modelbasedscorer.modelpath", ""); + if (path.equals("")) { + return; + } + modelexists = true; + doccatModel = new DoccatModel(new File(path)); + documentCategorizerME = new DocumentCategorizerME(doccatModel); + } + Map<Integer, String> proximalFeatures = generateProximalFeatures(linkedSpans, sentenceSpans, docText, RADIUS); + for (Map.Entry<Integer, String> entry : proximalFeatures.entrySet()) { + Map<String, Double> scores = this.getScore(entry.getValue()); + for (BaseLink link : (List<BaseLink>) linkedSpans.get(entry.getKey()).getLinkedEntries()) { + double score = 0d; + if (scores.containsKey(link.getItemParentID())) { + score = scores.get(link.getItemParentID()); + } + link.getScoreMap().put("countrymodel", score); + } + } + + } catch (FileNotFoundException ex) { + LOGGER.error(ex); + } catch (IOException ex) { + LOGGER.error(ex); + } catch (Exception ex) { + LOGGER.error(ex); + } + } + + /** + * generates features using a BagOfWordsfeatureGenerator that are within the + * radius of a mention within the doctext + * + * @param linkedSpans + * @param sentenceSpans + * @param docText + * @param radius + * @return a map of the index of the linked span to the string of surrounding + * text: Map<indexofspan,surrounding text> + */ + public Map<Integer, String> generateProximalFeatures(List<LinkedSpan> linkedSpans, Span[] sentenceSpans, String docText, int radius) { + Map<Integer, String> featureBags = new HashMap<>(); + Map<Integer, Integer> nameMentionMap = new HashMap<>(); + /** + * iterator over the map that contains a mapping of every country code to + * all of its mentions in the document + */ + for (int i = 0; i < linkedSpans.size(); i++) { + LinkedSpan span = linkedSpans.get(i); + if (span.getLinkedEntries().isEmpty()) { + //don't care about spans that did not get linked to anything at all; nothing to work with + continue; + } + /** + * get the sentence the name span was found in, the beginning of the + * sentence will suffice as a centroid for feature generation around the + * named entity + */ + Integer mentionIdx = sentenceSpans[span.getSentenceid()].getStart(); + nameMentionMap.put(i, mentionIdx); + } + /** + * now associate each span to a string that will be used for categorization + * against the model. + */ + for (Map.Entry<Integer, Integer> entry : nameMentionMap.entrySet()) { + featureBags.put(entry.getKey(), getTextChunk(entry.getValue(), docText, radius)); + } + + return featureBags; + } + + public String getTextChunk(int mentionIdx, String docText, int radius) { + int docSize = docText.length(); + int left = 0, right = 0; + left = (mentionIdx - radius < 0) ? 0 : mentionIdx - radius; + right = (mentionIdx + radius > docSize) ? docSize : mentionIdx + radius; + String chunk = ""; + if (right <= left) { + chunk = ""; + } else { + /** + * don't want to chop any words in half, so take fron the first space to + * the last space in the chunk string + */ + chunk = docText.substring(left, right); + if (left != 0) { + left = chunk.indexOf(" "); + } + right = chunk.lastIndexOf(" "); + /** + * now get the substring again with only whole words + */ + if (left < right) { + chunk = chunk.substring(left, right); + } + } + + return chunk; + } + + private Map<String, Double> getScore(String text) throws Exception { + Map<String, Double> scoreMap = new HashMap<>(); + double[] categorize = documentCategorizerME.categorize(text); + int catSize = documentCategorizerME.getNumberOfCategories(); + for (int i = 0; i < catSize; i++) { + String category = documentCategorizerME.getCategory(i); + scoreMap.put(category, categorize[documentCategorizerME.getIndex(category)]); + } + return scoreMap; + } +}
