http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/apps/object_dedup/SimilarityAccessorBase.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/apps/object_dedup/SimilarityAccessorBase.java b/opennlp-similarity/src/main/java/opennlp/tools/apps/object_dedup/SimilarityAccessorBase.java new file mode 100755 index 0000000..c8156ea --- /dev/null +++ b/opennlp-similarity/src/main/java/opennlp/tools/apps/object_dedup/SimilarityAccessorBase.java @@ -0,0 +1,739 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.apps.object_dedup; + +import java.sql.ResultSet; +import java.sql.SQLException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import java.util.regex.Pattern; + +import opennlp.tools.similarity.apps.BingQueryRunner; +import opennlp.tools.similarity.apps.HitBase; +import opennlp.tools.similarity.apps.utils.LevensteinDistanceFinder; +import opennlp.tools.similarity.apps.utils.StringDistanceMeasurer; +import opennlp.tools.similarity.apps.utils.Utils; +import opennlp.tools.textsimilarity.TextProcessor; + +import org.apache.commons.lang.StringUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/* This is a template class for deduplicator */ + +public class SimilarityAccessorBase +{ + private static final Logger LOG = LoggerFactory.getLogger(SimilarityAccessorBase.class); + + public static final int MAX_EV_TO_RECOMM = 6; + + private List<String> namesBothSides; + + protected static final String[] englishPrepositions = new String[] { "a", "aboard", "about", "above", "absent", + "across", "after", "against", "along", "alongside", "among", "around", "as", "at", "before", "behind", "below", + "beneath", "between", "beyond", "but", "by", "despite", "down", "during", "except", "excluding", "failing", + "following", "for", "from", "in", "including", "inside", "into", "like", "near", "next", "of", "off", "on", + "onto", "only", "opposite", "out", "outside", "over", "pace", "past", "per", "since", "than", "through", "and", + "thru", "till", "to", "toward", "under", "up", "upon", "versus", "with", "within", "you", "must", "know", + "when" }; + + protected List<String> commonWordsInEventTitles = Arrays.asList(new String[] { "community", "party", "film", + "music", "exhibition", "kareoke", "guitar", "quartet", "reggae", "r&b", "band", "dj ", "piano", "pray", + "worship", "god", "training", "class", "development", "training", "class", "course", "our", "comedy", ",fun", + "musical", "group", "alliance", "session", "feeding", "introduction", "school", "conversation", "learning", + "nursery", "unity", "trivia", "chat", 
"conference", "tuition", "technology", "teen", "communication", + "reception", "management", "beginner", "beginning", "collabora", "reuninon", "political", "course", "age", + "ages", "through", "grade", "networking", "workshop", "demonstration", "tuning", "program", "summit", + "convention", "day", "night", "one", "two", "outfest", "three", "online", "writing", "seminar", "coach", + ",expo", "advanced", "beginner", "intermediate", "earn", "free", "ii", "iii", "skills", "skill", "artist", + "summer", "winter", "autumn", "spring", "camp", "vacation", "miscrosoft", "kid", "child", "kids", "children", + "every", "everyone", "dancer", "dancers", "senior", "seniors", "basic", "elementary", "outfest", "2008", + "2009", "2010", "2011", "2012", "monday", "tuesday", "wednesday", "thirsday", "friday", "saturday", "sunday", + "mondays", "tuesdays", "wednesdays", "thirsdays", "fridays", "saturdays", "sundays", "men" // ? + }); + + private BingQueryRunner webSearch = new BingQueryRunner(); + + private StringDistanceMeasurer stringDistanceMeasurer = new StringDistanceMeasurer(); + + + public SimilarityAccessorBase() + { + } + + + public void init() + { + namesBothSides = getWordsThatShouldBeOnBothSidesEvents(); + } + + protected List<String> removeDollarWordAndNonAlphaFromList(List<String> list) + { + List<String> result = new ArrayList<String>(); + Pattern p = Pattern.compile("^\\$(\\d{1,3}(\\,\\d{3})*|(\\d+))(\\.\\d{2})?$"); + for (String w : list) + { + if (!(p.matcher(w).find()) && StringUtils.isAlphanumeric(w) && (w.length() >= 3 || !StringUtils.isAlpha(w))) + result.add(w); + } + return result; + } + + + public List<String> getWordsThatShouldBeOnBothSidesEvents() + { +/* + names.addAll(Arrays.asList(new String[] { "woman", "man", "women", "men", "womans", "mans", "womens", "mens", + "boy", "girl", "boys", "girls", "men's", "women's", "woman's", "ice", // for disney + "flight", "intermediate", "advanced", "beginner", + // "tour", TODO special consideration + "helicopter", "sexual", "junior", "jr" })); + */ + return null; + + } + + protected Boolean applySemanticNameSimilarityRule(Object es1, + Object es2) + { + + //TODO check attributes of objects + /* + if (!(es1.getVenueName().endsWith(es2.getVenueName()) || es2.getVenueName().endsWith(es1.getVenueName()))) + return false; + if (Math.abs(es1.getStarttime().getTime() - es2.getStarttime().getTime()) > 100000) + return false; + */ + + return true; + + } + + // this rule extract "OF" part and treats it as a whole expression + protected void applySubPhraseExtractionRule(List<String> name1Tokens, List<String> name2Tokens) + { + if (name1Tokens.indexOf("of") > 0 && name2Tokens.indexOf("of") > 0) + { + name1Tokens = extractMainNounPhrase(name1Tokens); + name2Tokens = extractMainNounPhrase(name2Tokens); + } + } + + private Boolean attemptShortTitlesSimilarityInWebSpace(String name1, String name2) + { + + // first delimeter processing + String name1v = name1.replace("'", "").replace("-", " "); + String name2v = name2.replace("'", "").replace("-", " "); + String name1vv = name1.replace("'", ""); + String name2vv = name2.replace("'", ""); + String name1vvv = name1.replace("-", " "); + String name2vvv = name2.replace("-", " "); + + if (name1.startsWith(name2) || name1vv.startsWith(name2) || name1.startsWith(name2v) + || name1.startsWith(name2vv) || name1.startsWith(name2vvv) || name1v.startsWith(name2v) + || name1v.startsWith(name2vv) || name2.startsWith(name1) || name2vv.startsWith(name1) + || name2.startsWith(name1v) || name2vvv.startsWith(name1vv) || 
name2.startsWith(name1vvv) + || name2v.startsWith(name1v) || name2v.startsWith(name1vv) || name1.endsWith(name2) + || name1vv.endsWith(name2) || name1.endsWith(name2v) || name1.endsWith(name2vv) || name1.endsWith(name2vvv) + || name1v.endsWith(name2v) || name1v.endsWith(name2vv) || name2.endsWith(name1) || name2vv.endsWith(name1) + || name2.endsWith(name1v) || name1vvv.endsWith(name2vv) || name2.endsWith(name1vvv) + || name2v.endsWith(name1v) || name2v.endsWith(name1vv)) + { + LOG.info("Found fuzzy substring of name1 and name2"); + return true; + } + if (name1.length() > 12 && name2.length() > 12) + return false; + + return areNamesSemanticallyCloseInWebSearchSpace(name1, name2, 0.8f, false).isDecision(); + + } + + public Boolean applyBothSidesRuleEvent(String name1, String name2) + { + List<String> name1Tokens = TextProcessor.fastTokenize(name1.toLowerCase(), false); + List<String> name2Tokens = TextProcessor.fastTokenize(name2.toLowerCase(), false); + // get unique names + List<String> name1TokensC = new ArrayList<String>(name1Tokens), name2TokensC = new ArrayList<String>( + name2Tokens); + ; + name1TokensC.removeAll(name2Tokens); + name2TokensC.removeAll(name1Tokens); + // get all unique names + name1TokensC.addAll(name2TokensC); + + name1TokensC.retainAll(namesBothSides); + name1Tokens.retainAll(name2Tokens); + + if ((name1TokensC.size() > 0 && name1Tokens.size() < 3) || (name1TokensC.size() > 1 && name1Tokens.size() < 5)) + { // 'mens == men; case !(name1TokensC.size()==2 && (name1TokensC.get(0).indexOf(name1TokensC.get(1))>-1 || + // name1TokensC.get(1).indexOf(name1TokensC.get(0))>-1 ))){ + LOG.info("Found required common word present on one side and not on the other: " + name1TokensC.toString() + + " and less than 3 keywords overlap (or >1 common words and less than 5 overl"); + return false; + } + else + return true; + } + + protected List<String> tokenizeAndStem(String input) + { + + List<String> results = new ArrayList<String>(); + List<String> toks = TextProcessor.fastTokenize(input.toLowerCase(), false); + for (String word : toks) + { + try + { + if (word.equals("theatre")) + word = "theater"; + results.add(word); + } + catch (Exception e) + { + results.add(word); + } + } + return results; + } + + protected List<String> stemList(List<String> toks) + { + + List<String> results = new ArrayList<String>(); + for (String word : toks) + { + try + { + if (word.equals("theatre")) + word = "theater"; + results.add(word); + } + catch (Exception e) + { + results.add(word); + } + } + return results; + } + + public List<String> removeVenuePart(ArrayList<String> toks) + { + List<String> results = new ArrayList<String>(); + boolean bVenuePart = false; + for (String word : toks) + { + // beginning of venue part + if (word.equals("at") || word.equals("@")) + bVenuePart = true; + // end of venue part + if (!StringUtils.isAlphanumeric(word) || word.startsWith("<punc")) + bVenuePart = false; + + if (!bVenuePart && !word.startsWith("<punc")) + results.add(word); + + } + return results; + } + + protected boolean isCapitalized(String lookup) + { + String[] titleWords = lookup.split(" "); + int count = 0; + for (String word : titleWords) + { + if (word.length() < 2) // '-', '|', ':' + break; + + if (word.equals(word.toLowerCase()) && (!Arrays.asList(englishPrepositions).contains(word)) + && word.length() > 3 && StringUtils.isAlphanumeric(word)) + continue; // was return false; + if (count > 3) + break; + count++; + } + return true; + } + + protected List<String> extractMainNounPhrase(List<String> 
name1Tokens) + { + List<String> results = new ArrayList<String>(); + int ofPos = name1Tokens.indexOf("of"); + List<String> ofList = name1Tokens.subList(ofPos + 1, name1Tokens.size() - 1); + // now iterate till next preposition towards the end of noun phrase + for (String preposCand : ofList) + { + if (Arrays.asList(englishPrepositions).contains(preposCand)) + break; + results.add(preposCand); + } + return results; + + } + + public boolean verifyEventAttributesPost(List<String> name1Tokens, List<String> name2Tokens) + { + String[] attributeNamesPost = { "age", "ages", "game", "games", "grade", "grades", "level", "levels", "vs", + "vs.", "versus", "pottery", "competition", "contest", "skill", "skills", "day", "only", "basic", "class", + "completed", + // "tour", ? + "advanced", "beginner", "intermediate", "flight", "workshop", "latin", "adobe", "ballet", "dinner", + "breakfast", "lunch", "summer", // "canyon" + "tfestival", "festival", "mfestival" }; + try + { + for (String attr : attributeNamesPost) + { + + int agePos1 = name1Tokens.indexOf(attr); + int agePos2 = name2Tokens.indexOf(attr); + if (agePos1 > -1 && agePos2 > -1 && agePos1 < name1Tokens.size() - 1 + && agePos2 < name2Tokens.size() - 1) + { + double dist = LevensteinDistanceFinder.levensteinDistance(name1Tokens.get(agePos1 + 1), + name2Tokens.get(agePos2 + 1), 1, 10, 1, 10); + if (!name1Tokens.get(agePos1 + 1).equalsIgnoreCase(name2Tokens.get(agePos2 + 1)) + && (dist > 2.99 || name1Tokens.get(agePos1 + 1).length() < 4)) + { + LOG.info("Found disagreement in the attrib value for " + attr + " value = " + + name1Tokens.get(agePos1 + 1) + " <=> " + name2Tokens.get(agePos2 + 1)); + return false; + } + } + } + } + catch (Exception e) + { + // TODO Auto-generated catch block + e.printStackTrace(); + } + return true; + } + + public boolean verifyEventAttributesPre(List<String> name1Tokens, List<String> name2Tokens) + { + + String[] attributeNamesPre = { "hour", "vs", "vs.", "versus", "pottery", "program", "day", "only", + // dance styles followed by a param + "swing", "rumba", "samba", "doble", + "violence", // + // "level", + "class", "classes", "kid", "kids", "test", "west", "summer_camp", "session", "tfestival", "festival", + "mfestival" }; + try + { + for (String attr : attributeNamesPre) + { + int agePos1 = name1Tokens.indexOf(attr); + int agePos2 = name2Tokens.indexOf(attr); + if (agePos1 > 0 && agePos2 > 0) + { // not the first word is attr name + if (!name1Tokens.get(agePos1 - 1).equalsIgnoreCase(name2Tokens.get(agePos2 - 1)) + && (agePos1 < 2 || !name1Tokens.get(agePos1 - 2).equalsIgnoreCase(name2Tokens.get(agePos2 - 1))) + && + // ((agePos1<2 && agePos2 <2) || !name1Tokens.get(agePos1 - + // 2).equalsIgnoreCase(name2Tokens.get(agePos2 - 2 ))) && + (agePos2 < 2 || !name1Tokens.get(agePos1 - 1).equalsIgnoreCase(name2Tokens.get(agePos2 - 2))) + + ) + { + LOG.info("Found disagreement in the attrib value for " + attr + " value = " + + name1Tokens.get(agePos1 - 1) + " and " + name2Tokens.get(agePos2 - 1)); + return false; + } + } + } + } + catch (Exception e) + { + e.printStackTrace(); + } + return true; + } + + protected boolean bDifferentGroupOneSubnameOfAnother(String name1, String name2) + { + // first check a special case that both name1 and name2 are DIFFERENT groups at last.fm + Map<String, Integer> map1 = null; //LastFM_APIManager.extractTagsForArtist(name1); + Map<String, Integer> map2 = null; //LastFM_APIManager.extractTagsForArtist(name2); + if (map1 != null && map2 != null && map1.size() > 0 && map2.size() > 0) + 
map1.entrySet().removeAll(map2.entrySet()); + if (map1.size() > 0) // same or subset of tags => different groups + return true; + + return false; + } + + public boolean applyBothSidesRule(String name1, String name2) + { + List<String> name1Tokens = TextProcessor.fastTokenize(name1.toLowerCase(), false); + List<String> name2Tokens = TextProcessor.fastTokenize(name2.toLowerCase(), false); + // get unique names + List<String> name1TokensC = new ArrayList<String>(name1Tokens), name2TokensC = new ArrayList<String>( + name2Tokens); + ; + name1TokensC.removeAll(name2Tokens); + name2TokensC.removeAll(name1Tokens); + // get all unique names + name1TokensC.addAll(name2TokensC); + + name1TokensC.retainAll(namesBothSides); + if (name1TokensC.size() > 0) + return false; + else + return true; + } + + private boolean succeededMenWomenSportsRule(String name1, String name2) + { + List<String> name1Tokens = TextProcessor.fastTokenize(name1.toLowerCase(), false); + List<String> name2Tokens = TextProcessor.fastTokenize(name2.toLowerCase(), false); + if (name1Tokens.contains("men") || name2Tokens.contains("men") || name1Tokens.contains("women") + || name2Tokens.contains("women") || name1Tokens.contains("disney") || name2Tokens.contains("disney")) + { // all words should be the + // same + name1Tokens.removeAll(name2Tokens); + name1Tokens.removeAll(Arrays.asList(englishPrepositions)); + name1Tokens.removeAll(Arrays.asList(commonWordsInEventTitles)); + if (name1Tokens.size() < 1) + return true; + + return false; + } + else + return true; + + } + + private boolean succeededSpecialGroupsSymphoniesRule(String name1, String name2) + { + List<String> name1Tokens = TextProcessor.fastTokenize(name1.toLowerCase(), false); + List<String> name2Tokens = TextProcessor.fastTokenize(name2.toLowerCase(), false); + if (name1Tokens.contains("orchestra") || name2Tokens.contains("symphony") || name2Tokens.contains("orchestra") + || name1Tokens.contains("symphony") || name2Tokens.contains("band") || name1Tokens.contains("band") + || name2Tokens.contains("trio") || name1Tokens.contains("trio") || name1Tokens.contains("soleil") + || name2Tokens.contains("soleil") || name1Tokens.contains("disney") || name2Tokens.contains("disney") + || name1Tokens.contains("lang") || name2Tokens.contains("lang")) // special group 'lang lang' + { // all words should be the + // same + List<String> name1TokensClone = new ArrayList<String>(name1Tokens); + name1Tokens.removeAll(name2Tokens); + name2Tokens.removeAll(name1TokensClone); + name1Tokens.addAll(name2Tokens); + name1Tokens.removeAll(Arrays.asList(this.englishPrepositions)); + // name1Tokens.removeAll(Arrays.asList(this.commonWordsInEventTitles)); + if (name1Tokens.size() < 1) + return true; + + return false; + } + else + return true; + + } + + public int getAttemptedNameMerge(String name1, String name2) + { + name1 = name1.replaceAll("[a-z][A-Z]", "$0&$0").replaceAll(".&.", " "); + ; // suspected word merge if higher case is in the middle of word + name2 = name2.replaceAll("[a-z][A-Z]", "$0&$0").replaceAll(".&.", " "); + + name1 = name1.toLowerCase(); + name2 = name2.toLowerCase(); + if (name1.equals(name2) || name1.startsWith(name2) || name2.startsWith(name1) || name1.endsWith(name2) + || name1.endsWith(name2) || name1.indexOf(name2) > -1 || name1.indexOf(name2) > -1) // ?? 
+ return 2; + String name2r = name2.replace(" ", ""); + if (name1.equals(name2r) || name1.startsWith(name2r) || name1.startsWith(name2r) || name1.endsWith(name2r) + || name1.endsWith(name2r)) + return 1; + String name1r = name1.replace(" ", ""); + if (name1r.equals(name2r) || name1r.startsWith(name2r) || name1r.startsWith(name2) || name1r.endsWith(name2r) + || name1r.endsWith(name2r) || name2r.equals(name1r) || name2r.startsWith(name1r) + || name2r.startsWith(name1) || name2r.endsWith(name1r) || name2r.endsWith(name2) + + ) + return 1; + + if (stringDistanceMeasurer.measureStringDistance(name1, name2) > 0.95) + return 2; + if (stringDistanceMeasurer.measureStringDistance(name1, name2) > 0.70) + return 1; + return 0; + } + + private String normalizeGenderAndOtherAttributes(String name1) + { + name1 = Utils.convertToASCII(name1.replace("/", " ").replace("w/", "with ")).replace('!', ' ').toLowerCase(); + + name1 = name1.replace("woman", "women").replace("womans", "women").replace("womens", "women") + .replace("women's", "women").replace("woman's", "women"); + name1 = name1.replace(" man ", " men ").replace(" mans ", " men ").replace(" men's ", " men ") + .replace(" man's ", " men ").replace(" mens ", " men ").replace("summer camp", "summer_camp") + .replace("gaea theatre festival", "tfestival"); // need regexp for this + return name1; + } + + /* + * Main semantic similarity function which applies boundary cases rule and focus on web mining rule The main + * criteria for a commonality between titles: to form an entity, searchable on the web + */ + public DedupResult areNamesSemanticallyCloseWebMineCommonPart(String name1, String name2, String venue) + { + // normalize gender + name1 = normalizeGenderAndOtherAttributes(name1); + name2 = normalizeGenderAndOtherAttributes(name2); + + Boolean bShortTitlesSimilarInWebSpace = attemptShortTitlesSimilarityInWebSpace(name1, name2); + if (bShortTitlesSimilarInWebSpace) + return new DedupResult("Accepted as short title by web mining", 2, true); + + StringBuffer reason = new StringBuffer(); + List<String> venueToks = removeVenuePart(TextProcessor.fastTokenize(venue.toLowerCase(), false)); + + LOG.info("\nComputing similarity between name = '" + name1 + "' and name = '" + name2 + "'"); + // convert titles into token lists + List<String> name1Tokens = removeVenuePart(TextProcessor.fastTokenize(name1.toLowerCase(), true)); + List<String> name2Tokens = removeVenuePart(TextProcessor.fastTokenize(name2.toLowerCase(), true)); + // applySubPhraseExtractionRule() + Boolean bSameAttrib = verifyEventAttributesPost(name1Tokens, name2Tokens) + && verifyEventAttributesPre(name1Tokens, name2Tokens); + if (!bSameAttrib) + { + LOG.info("similar events but different attributes"); + return new DedupResult("similar events but different attributes", 0, false); + } + + boolean bothSodesSuccess = applyBothSidesRuleEvent(name1, name2); + if (!bothSodesSuccess) + { + return new DedupResult("Failed common words test for sports", 0, false); + } + + float dist = (float) LevensteinDistanceFinder.levensteinDistance(name1, name2, 1, 10, 1, 10); + if (dist < 5.1) + { + LOG.info("Found low LevensteinDistance for name1 and name2"); + return new DedupResult("Found low LevensteinDistance", 2, true); + } + + int nameMergeScore = getAttemptedNameMerge(name1, name2); + if (nameMergeScore > 0) + { + LOG.info("Found low NameMerge Distance for name1 and name2"); + return new DedupResult("Found low NameMerge Distance", 2, true); + } + + // todo take into account order + // form common sub-list of 
tokens + name1Tokens.retainAll(name2Tokens); + name1Tokens.removeAll(venueToks); + + name1Tokens.removeAll(commonWordsInEventTitles); + name1Tokens.removeAll(Arrays.asList(englishPrepositions)); + name1Tokens = removeDollarWordAndNonAlphaFromList(name1Tokens); + // todo : to use full string measure + // boundary case: too many words => just do counts + float commonPortion = (float) name1Tokens.size() / (float) name2Tokens.size(); + if (commonPortion > 0.8 || name1Tokens.size() >= 4) + { // after typical + // title words + // are revomed 4 + // looks OK + LOG.info("Accepted since substantial common part"); + return new DedupResult("Accepted since substantial common part", Math.max((int) (commonPortion * 5.0), 2), + true); + } + // boundary case: no overlap + if (name1Tokens.size() < 1) + { + LOG.info("Rejected since nothing in common"); + return new DedupResult("Rejected since nothing in common", 0, false); + } + // get from list of tokens back to words to get search expression + String entityExpression = name1Tokens.toString().replace('[', ' ').replace(']', ' ').replace(',', ' ') + .replace(" ", " ").trim(); + /* + * // now try name merge reduced strings String entityExpression1 = name1TokensC.toString().replace('[', + * ' ').replace(']', ' ').replace(',', ' ') .replace(" ", " ").trim(); String entityExpression2 = + * name2Tokens.toString().replace('[', ' ').replace(']', ' ').replace(',', ' ') .replace(" ", " ").trim(); + * + * nameMergeScore = getAttemptedNameMerge(entityExpression1, entityExpression2); if (nameMergeScore>0){ + * LOG.info("Found low NameMerge Distance for REDUCED name1 and name2"); return new + * DedupResult("Found low NameMerge Distance REDUCED", 2, true); + * + * } + */ + + // Before doing web mining, make sure overlap between titles is NOT a + // set of common english words (use the vocabulary) + // if all words are common, then NOT an entity + if (name1Tokens.size() < 2) + { + boolean bCommonEnglishWord = false; + for (String word : name1Tokens) + { + // if (stopList.isCommonWord(word) /*&& mostFrequent1000Words.isMostFrequent1000Word(word)*/) + // bCommonEnglishWord = true; + } + + if (bCommonEnglishWord) + { + LOG.info("Rejected common entity: common word = " + entityExpression); + return new DedupResult("Rejected since common entity is common English word = " + entityExpression, 0, + false); + } + } + // accept common expression + LOG.info("Formed common entity = " + entityExpression); + reason.append("Formed common entity = " + entityExpression + "\n"); + // now go to the web / bing api with this common expression + List<HitBase> searchResult = webSearch.runSearch(entityExpression); + float entityScore = 0f; + if (searchResult != null) + { + int count = 0; + for (HitBase item : searchResult) + { + String lookup = item.getTitle(); + LOG.info("Bing hit title = '" + lookup + "'"); + reason.append("Bing hit title = '" + lookup + "'\n"); + if (count > 4) + break; + count++; + // if occurrence is not capitalized then rejected, do not take + // into account in score + if (!isCapitalized(lookup)) + { + LOG.info("Rejected hit title since not capitalized"); + reason.append("Rejected hit title since not capitalized\n"); + continue; + } + + /* + * if (lookup.indexOf('-')>0 ){ lookup = lookup.split("-")[0]; } + */ + // now compute overlap between what found on the web for hit's + // title and the common expression between events + List<String> lookupTokens = tokenizeAndStem(lookup); + lookupTokens.retainAll(stemList(name1Tokens)); + if (lookupTokens.size() >= 
name1Tokens.size()) + // increment score if found hit title is acceptable + entityScore += 1.0; + else + { + LOG.info("Found hit title " + lookupTokens + " does not cover comonality expr = " + name1Tokens); + entityScore += 0.25; + + } + + } + } + return new DedupResult(reason.toString(), (int) entityScore, entityScore > 1.0); + } + + public DedupResult areNamesSemanticallyCloseInWebSearchSpace(String name1, String name2, Float thresh, boolean bStem) + { + + if (thresh == null || thresh == 0f) + { + thresh = 0.8f; + } + + // normalize gender + name1 = normalizeGenderAndOtherAttributes(name1); + name2 = normalizeGenderAndOtherAttributes(name2); + + StringBuffer reason = new StringBuffer(); + + boolean bSportsOrOrchestra = !succeededMenWomenSportsRule(name1, name2); + if (bSportsOrOrchestra) + return new DedupResult("Sports rule: different teams or teams of different venues", 0, false); + + bSportsOrOrchestra = !succeededSpecialGroupsSymphoniesRule(name1, name2); + if (bSportsOrOrchestra) + return new DedupResult("SpecialGroupsSymphoniesRule: different circus/band", 0, false); + + LOG.info("\nComputing similarity between name = '" + name1 + "' and name = '" + name2 + "'"); + + List<String> name1Tokens = TextProcessor.fastTokenize(name1.toLowerCase(), true); + List<String> name2Tokens = TextProcessor.fastTokenize(name2.toLowerCase(), true); + Boolean bSameAttrib = verifyEventAttributesPost(name1Tokens, name2Tokens) + && verifyEventAttributesPre(name1Tokens, name2Tokens); + if (!bSameAttrib) + { + LOG.info("similar events but different attributes"); + return new DedupResult("similar events but different attributes", 0, false); + } + + List<HitBase> searchResult1 = webSearch.runSearch(name1); + List<HitBase> searchResult2 = webSearch.runSearch(name2); + int score = 0; + if (searchResult1 != null && searchResult2 != null) + { + for (HitBase item1 : searchResult1) + { + if (item1.getUrl().indexOf("myspace") > -1 || item1.getUrl().indexOf("wiki") > -1) + continue; + for (HitBase item2 : searchResult2) + { + String lookup1 = item1.getTitle().replace("Facebook", "").replace("LinkedIn", "") + .replace("MySpace", ""); + String lookup2 = item2.getTitle().replace("Facebook", "").replace("LinkedIn", "") + .replace("MySpace", ""); + double d = 0; + if (bStem) + d = stringDistanceMeasurer.measureStringDistance(lookup1, lookup2); + else + d = stringDistanceMeasurer.measureStringDistanceNoStemming(lookup1, lookup2); + if (d > thresh) // 0.8) + { + + reason.append("Found common search result title for group names '" + lookup1 + " < > " + + lookup2 + " sim = " + d + "\n"); + LOG.info(("Found common search result title for group names '" + lookup1 + " < > " + lookup2 + + " sim = " + d)); + score++; + } + + } + } + } + + Boolean bothSidesSuccess = applyBothSidesRule(name1, name2); + if (!bothSidesSuccess) + { + score = 1; + reason.append("Failed common words test for sports"); + } + if (score > 0) + { + Boolean bDifferentGroup = bDifferentGroupOneSubnameOfAnother(name1, name2); + if (bDifferentGroup) + { + score = 1; + reason.append("Failed common words test for sports"); + } + } + return new DedupResult(reason.toString(), score, score > 1); + } +}
http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier.zip ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier.zip b/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier.zip new file mode 100644 index 0000000..2227504 Binary files /dev/null and b/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier.zip differ http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/ClassifierTrainingSetIndexer.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/ClassifierTrainingSetIndexer.java b/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/ClassifierTrainingSetIndexer.java new file mode 100644 index 0000000..ad851e3 --- /dev/null +++ b/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/ClassifierTrainingSetIndexer.java @@ -0,0 +1,266 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package opennlp.tools.doc_classifier; + +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import opennlp.tools.jsmlearning.ProfileReaderWriter; + +import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.StringField; +import org.apache.lucene.document.TextField; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.FSDirectory; +import org.apache.lucene.util.Version; +import org.apache.tika.Tika; + +public class ClassifierTrainingSetIndexer { + public static String resourceDir = new File(".").getAbsolutePath().replace("/.", "") + "/src/main/resources"; + public static String INDEX_PATH = "/classif", + CLASSIF_TRAINING_CORPUS_PATH = "/training_corpus"; + protected ArrayList<File> queue = new ArrayList<File>(); + Tika tika = new Tika(); + + IndexWriter indexWriter = null; + protected static String[] domains = new String[] { "legal", "health", + "computing", "engineering", "business" }; + private String absolutePathTrainingSet=null; + + public ClassifierTrainingSetIndexer() { + + try { + initIndexWriter(resourceDir); + } catch (Exception e) { + e.printStackTrace(); + } + } + + public ClassifierTrainingSetIndexer(String absolutePathTrainingSet) { + this.absolutePathTrainingSet = absolutePathTrainingSet; + try { + initIndexWriter(resourceDir); + } catch (Exception e) { + e.printStackTrace(); + } + } + + public void indexTrainingSet() { + + try { + if (absolutePathTrainingSet==null) + indexFileOrDirectory(resourceDir + + CLASSIF_TRAINING_CORPUS_PATH); + else + indexFileOrDirectory( + this.absolutePathTrainingSet); + + } catch (IOException e1) { + e1.printStackTrace(); + } + try { + indexWriter.commit(); + } catch (IOException e) { + e.printStackTrace(); + } + } +/* + private void indexTrainingSample(String text, String flag, int id) + throws IOException { + + Document doc = new Document(); + doc.add(new StringField("id", new Integer(id).toString(), + Field.Store.YES)); + doc.add(new TextField("text", text.toLowerCase(), Field.Store.YES)); + doc.add(new StringField("class", flag.toLowerCase(), Field.Store.YES)); + indexWriter.addDocument(doc); + + } +*/ + private void addFiles(File file) { + + if (!file.exists()) { + System.out.println(file + " does not exist."); + } + if (file.isDirectory()) { + for (File f : file.listFiles()) { + if (f.getName().startsWith(".")) + continue; + addFiles(f); + System.out.println(f.getName()); + } + } else { + queue.add(file); + + } + } + + // index last folder name, before filename itself + + public void indexFileOrDirectory(String fileName) throws IOException { + addFiles(new File(fileName)); + + List<File> files = new ArrayList<File>(queue); + for (File f : files) { + if (!f.getName().endsWith(".xml")) { + + try { + Document doc = new Document(); + + String name = f.getPath(); + String className = null; + for (String d : domains) { + if (name.indexOf(d) > -1) { + className = d; + break; + } + } + + try { + doc.add(new TextField("text", tika.parse(f))); + } catch (Exception e1) { + e1.printStackTrace(); + } + + doc.add(new StringField("path", f.getPath(), + Field.Store.YES)); + doc.add(new StringField("class", className, Field.Store.YES)); + try { + + indexWriter.addDocument(doc); + + } catch (Exception e) { + 
e.printStackTrace(); + System.out.println("Could not add: " + f); + } + } catch (Exception ee) { + ee.printStackTrace(); + } + } else { // for xml files + try { + Document doc = new Document(); + + String name = new String(f.getPath()); + String[] nparts = name.split("/"); + int len = nparts.length; + name = nparts[len - 2]; + + FileReader fr = new FileReader(f); + doc.add(new TextField("text", fr)); + + doc.add(new StringField("path", f.getPath(), + Field.Store.YES)); + doc.add(new StringField("class", name, Field.Store.YES)); + try { + + indexWriter.addDocument(doc); + + } catch (Exception e) { + e.printStackTrace(); + System.out.println("Could not add: " + f); + } finally { + fr.close(); + } + } catch (Exception ee) { + ee.printStackTrace(); + } + } + + queue.clear(); + } + } + + public static String getIndexDir() { + try { + return new File(".").getCanonicalPath() + INDEX_PATH; + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + return null; + } + } + + private void initIndexWriter(String dir) throws Exception { + + Directory indexDir = null; + + try { + indexDir = FSDirectory.open(new File(dir + INDEX_PATH)); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + + Version luceneVersion = Version.LUCENE_46; + IndexWriterConfig luceneConfig = new IndexWriterConfig(luceneVersion, + new StandardAnalyzer(luceneVersion)); + luceneConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE); + + indexWriter = new IndexWriter(indexDir, luceneConfig); + + } + + void close() { + try { + indexWriter.commit(); + indexWriter.close(); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + + public static String getCategoryFromFilePath(String path){ + String className = null; + for (String d : domains) { + if (path.indexOf("/"+d+"/") > -1) { + className = d; + break; + } + } + return className; + } + + public static void main(String[] args) { + ClassifierTrainingSetIndexer indexer = null; + if (args!=null && args.length==1){ + String relativeDirWithTrainingCorpus = args[0]; + // expect corpus relative to 'resource' directory, such as 'training_corpus' + if (!relativeDirWithTrainingCorpus.startsWith("/")) + relativeDirWithTrainingCorpus = "/"+relativeDirWithTrainingCorpus; + indexer = new ClassifierTrainingSetIndexer(relativeDirWithTrainingCorpus); + } else { + // expect corpus in the default location, "/training_corpus" in the resource directory + indexer = new ClassifierTrainingSetIndexer(); + } + try { + indexer.indexTrainingSet(); + } catch (Exception e) { + e.printStackTrace(); + } + indexer.close(); + } + +} http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/DocClassifier.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/DocClassifier.java b/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/DocClassifier.java new file mode 100644 index 0000000..b4abbb9 --- /dev/null +++ b/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/DocClassifier.java @@ -0,0 +1,306 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package opennlp.tools.doc_classifier; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Scanner; + +import opennlp.tools.similarity.apps.utils.CountItemsList; +import opennlp.tools.similarity.apps.utils.ValueSortMap; +import opennlp.tools.textsimilarity.TextProcessor; + +import org.apache.commons.lang.StringUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.queryparser.classic.ParseException; +import org.apache.lucene.queryparser.classic.QueryParser; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.ScoreDoc; +import org.apache.lucene.search.TopDocs; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.FSDirectory; +import org.apache.lucene.util.Version; +import org.json.JSONObject; + +public class DocClassifier { + public static final String DOC_CLASSIFIER_KEY = "doc_class"; + public static String resourceDir = null; + public static final Log logger = LogFactory.getLog(DocClassifier.class); + private Map<String, Float> scoredClasses = new HashMap<String, Float>(); + + + public static Float MIN_TOTAL_SCORE_FOR_CATEGORY = 0.3f; //3.0f; + protected static IndexReader indexReader = null; + protected static IndexSearcher indexSearcher = null; + // resource directory plus the index folder + private static final String INDEX_PATH = resourceDir + + ClassifierTrainingSetIndexer.INDEX_PATH; + + // http://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm + private static final int MAX_DOCS_TO_USE_FOR_CLASSIFY = 10, // 10 similar + // docs for + // nearest + // neighbor + // settings + + MAX_CATEG_RESULTS = 2; + private static final float BEST_TO_NEX_BEST_RATIO = 2.0f; + // to accumulate classif results + private CountItemsList<String> localCats = new CountItemsList<String>(); + private int MAX_TOKENS_TO_FORM = 30; + private String CAT_COMPUTING = "computing"; + public static final String DOC_CLASSIFIER_MAP = "doc_classifier_map"; + private static final int MIN_SENTENCE_LENGTH_TO_CATEGORIZE = 60; // if + // sentence + // is + // shorter, + // should + // not + // be + // used + // for + // classification + private static final int MIN_CHARS_IN_QUERY = 30; // if combination of + // keywords is shorter, + // should not be used + // for classification + + // these are categories from the index + public static final String[] categories = new String[] { "legal", "health", + "finance", "computing", "engineering", "business" }; + + static { + synchronized (DocClassifier.class) { + Directory 
indexDirectory = null; + + try { + indexDirectory = FSDirectory.open(new File(INDEX_PATH)); + } catch (IOException e2) { + logger.error("problem opening index " + e2); + } + try { + indexReader = DirectoryReader.open(indexDirectory); + indexSearcher = new IndexSearcher(indexReader); + } catch (IOException e2) { + logger.error("problem reading index \n" + e2); + } + } + } + + public DocClassifier(String inputFilename, JSONObject inputJSON) { + scoredClasses = new HashMap<String, Float>(); + } + + /* returns the class name for a sentence */ + private List<String> classifySentence(String queryStr) { + + List<String> results = new ArrayList<String>(); + // too short of a query + if (queryStr.length() < MIN_CHARS_IN_QUERY) { + return results; + } + + Analyzer std = new StandardAnalyzer(Version.LUCENE_46); + QueryParser parser = new QueryParser(Version.LUCENE_46, "text", std); + parser.setDefaultOperator(QueryParser.Operator.OR); + Query query = null; + try { + query = parser.parse(queryStr); + + } catch (ParseException e2) { + + return results; + } + TopDocs hits = null; // TopDocs search(Query query, int n) + // Finds the top n hits for query. + try { + hits = indexSearcher + .search(query, MAX_DOCS_TO_USE_FOR_CLASSIFY + 2); + } catch (IOException e1) { + logger.error("problem searching index \n" + e1); + } + logger.debug("Found " + hits.totalHits + " hits for " + queryStr); + int count = 0; + + + for (ScoreDoc scoreDoc : hits.scoreDocs) { + Document doc = null; + try { + doc = indexSearcher.doc(scoreDoc.doc); + } catch (IOException e) { + logger.error("Problem searching training set for classif \n" + + e); + continue; + } + String flag = doc.get("class"); + + Float scoreForClass = scoredClasses.get(flag); + if (scoreForClass == null) + scoredClasses.put(flag, scoreDoc.score); + else + scoredClasses.put(flag, scoreForClass + scoreDoc.score); + + logger.debug(" <<categorized as>> " + flag + " | score=" + + scoreDoc.score + " \n text =" + doc.get("text") + "\n"); + + if (count > MAX_DOCS_TO_USE_FOR_CLASSIFY) { + break; + } + count++; + } + try { + scoredClasses = ValueSortMap.sortMapByValue(scoredClasses, false); + List<String> resultsAll = new ArrayList<String>( + scoredClasses.keySet()), resultsAboveThresh = new ArrayList<String>(); + for (String key : resultsAll) { + if (scoredClasses.get(key) > MIN_TOTAL_SCORE_FOR_CATEGORY) + resultsAboveThresh.add(key); + else + logger.debug("Too low score of " + scoredClasses.get(key) + + " for category = " + key); + } + + int len = resultsAboveThresh.size(); + if (len > MAX_CATEG_RESULTS) + results = resultsAboveThresh.subList(0, MAX_CATEG_RESULTS); // get + // maxRes + // elements + else + results = resultsAboveThresh; + } catch (Exception e) { + logger.error("Problem aggregating search results\n" + e); + } + if (results.size() < 2) + return results; + + // if two categories, one is very high and another is relatively low + if (scoredClasses.get(results.get(0)) + / scoredClasses.get(results.get(1)) > BEST_TO_NEX_BEST_RATIO) // second + // best + // is + // much + // worse + return results.subList(0, 1); + else + return results; + + } + + + + + public static String formClassifQuery(String pageContentReader, int maxRes) { + + // We want to control which delimiters we substitute. 
For example '_' & + // \n we retain + pageContentReader = pageContentReader.replaceAll("[^A-Za-z0-9 _\\n]", + ""); + + Scanner in = new Scanner(pageContentReader); + in.useDelimiter("\\s+"); + Map<String, Integer> words = new HashMap<String, Integer>(); + + while (in.hasNext()) { + String word = in.next(); + if (!StringUtils.isAlpha(word) || word.length() < 4) + continue; + + if (!words.containsKey(word)) { + words.put(word, 1); + } else { + words.put(word, words.get(word) + 1); + } + } + in.close(); + words = ValueSortMap.sortMapByValue(words, false); + List<String> resultsAll = new ArrayList<String>(words.keySet()), results = null; + + int len = resultsAll.size(); + if (len > maxRes) + results = resultsAll.subList(len - maxRes, len - 1); // get maxRes + // elements + else + results = resultsAll; + + return results.toString().replaceAll("(\\[|\\]|,)", " ").trim(); + } + + public void close() { + try { + indexReader.close(); + } catch (IOException e) { + logger.error("Problem closing index \n" + e); + } + } + + + /* + * Main entry point for classifying sentences + */ + + public List<String> getEntityOrClassFromText(String content) { + + List<String> sentences = TextProcessor.splitToSentences(content); + List<String> classifResults; + + try { + for (String sentence : sentences) { + // If sentence is too short, there is a chance it is not form a + // main text area, + // but from somewhere else, so it is safer not to use this + // portion of text for classification + + if (sentence.length() < MIN_SENTENCE_LENGTH_TO_CATEGORIZE) + continue; + String query = formClassifQuery(sentence, MAX_TOKENS_TO_FORM); + classifResults = classifySentence(query); + if (classifResults != null && classifResults.size() > 0) { + for (String c : classifResults) { + localCats.add(c); + } + logger.debug(sentence + " => " + classifResults); + } + } + + } catch (Exception e) { + logger.error("Problem classifying sentence\n " + e); + } + + List<String> aggrResults = new ArrayList<String>(); + try { + + aggrResults = localCats.getFrequentTags(); + + logger.debug(localCats.getFrequentTags()); + } catch (Exception e) { + logger.error("Problem aggregating search results\n" + e); + } + return aggrResults; + } + +} http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/DocClassifierTrainingSetMultilingualExtender.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/DocClassifierTrainingSetMultilingualExtender.java b/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/DocClassifierTrainingSetMultilingualExtender.java new file mode 100644 index 0000000..73b0d43 --- /dev/null +++ b/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/DocClassifierTrainingSetMultilingualExtender.java @@ -0,0 +1,284 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package opennlp.tools.doc_classifier; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.net.MalformedURLException; +import java.net.URL; +import java.nio.channels.Channels; +import java.nio.channels.ReadableByteChannel; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; + +import opennlp.tools.similarity.apps.BingQueryRunner; +import opennlp.tools.similarity.apps.utils.PageFetcher; + +import org.apache.commons.io.FileUtils; +import org.apache.commons.lang.StringUtils; +import org.apache.tika.Tika; +import org.apache.tika.exception.TikaException; +import org.json.JSONObject; + +/* + * This utility gets 'training_corpus' as input and creates a new version of training_corpus with verified files. + * Verified => classified by existing training set as only belonging to its target category, no other categories, not empty. + */ +public class DocClassifierTrainingSetMultilingualExtender { + private static final String LANG_TEMPL = "l_a_n_g"; + private String wikiUrlsTemplate = "https://"+LANG_TEMPL+".wikipedia.org/wiki/"; + + public static String projectHome = new File(".").getAbsolutePath().replace("contentinspection/.", ""); + public static String resourceDir = new File(".").getAbsolutePath().replace("/.", "") + "/src/main/resources"; + DocClassifier classifier = null; + private String sourceDir = null, destinationDir = null; + //interwiki-fr"><a href="http://fr.wikipedia.org/wiki/Niveau_d%27%C3%A9nergie" title="Niveau d'énergie â French" lang="fr" + private static String[][] multilingualTokens = new String[][]{ + {"interwiki-fr\"><a href=\"", "lang=\"fr\""}, + {"interwiki-es\"><a href=\"", "lang=\"es\""}, + {"interwiki-de\"><a href=\"", "lang=\"de\""}, + }; + + private static String[] langs = new String[]{ "fr", "es", "de"}; + + protected ArrayList<File> queue = new ArrayList<File>(); + + protected Tika tika = new Tika(); + public DocClassifierTrainingSetMultilingualExtender(String resource) { + + classifier = new DocClassifier("", new JSONObject()); + + } + private int FRAGMENT_LENGTH = 500; + + + protected void addFiles(File file) { + + try { + if (!file.exists()) { + System.out.println(file + " does not exist."); + } + if (file.isDirectory()) { + for (File f : file.listFiles()) { + try { + addFiles(f); + } catch (Exception e) { + } + } + } else { + queue.add(file); + } + } catch (Exception e) { + + } + } + + public List<String> extractEntriesFromSpecial_Export(String filename){ + List<String> filteredEntries = new ArrayList<String>(); + String content=null; + try { + content = FileUtils.readFileToString(new File(filename)); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + String[] entries = StringUtils.substringsBetween(content, "[[", "]]"); + for(String e: entries){ + if (e.startsWith("Kategorie") || e.startsWith("Category") || e.startsWith("d:") || e.startsWith("User") + ||e.startsWith("Portal") ) + continue; + if (e.indexOf(':')>-1) + continue; + + if 
(e.indexOf(":")>-1) + continue; + int endofEntry = e.indexOf('|'); + if (endofEntry>-1) e = e.substring(0, endofEntry); + filteredEntries.add(e); + } + + filteredEntries = new ArrayList<String> (new HashSet<String>(filteredEntries)); + return filteredEntries; + } + + public void processDirectory(String fileName) throws IOException { + List<String[]> report = new ArrayList<String[]>(); + report.add(new String[] { "filename", "category", + "confirmed?" , + }); + + addFiles(new File(fileName)); + // FileUtils.deleteDirectory(new File(destinationDir)); + // FileUtils.forceMkdir(new File(destinationDir)); + + + for (File f : queue) { + String content = null; + try {// should be wiki page + //if (f.getName().toString().toLowerCase().indexOf(" wiki")<0 && + + // if ( f.getAbsolutePath().indexOf("wiki-new")<0) + // continue; + // should not be a page already derived by a link + if (f.getName().toString().toLowerCase().indexOf(".html_")>-1) + continue; + + System.out.println("processing "+f.getName()); + content = FileUtils.readFileToString(f, "utf-8"); + int langIndex =0; + for(String[] begEnd: multilingualTokens){ + String urlDirty = StringUtils.substringBetween(content, begEnd[0], begEnd[1]); + String url = StringUtils.substringBefore(urlDirty, "\""); + + if (url!=null){ + if (!url.startsWith("http:")) + url = "https:"+url; + + String[] parts = url.split("/"); + String multilingualName = parts[parts.length-1]; + String destFileName = f.getAbsolutePath().replace(sourceDir, destinationDir).replace(" - Wikipedia, the free encyclopedia.html", "-wiki")+"."+langs[langIndex]+"." + +"_"+multilingualName+".html"; + if (!new File(destFileName).exists()){ + saveDocFromTheWeb(url, destFileName); + System.out.println(f.getName()+ " => "+destFileName); + } + } else { + System.out.println("Unable to extract multilingual urls for'" +langs[langIndex] +"' from file "+ f.getCanonicalPath()); + } + langIndex++; + } + } catch (Exception ee) { + ee.printStackTrace(); + } + } + + + queue.clear(); + } + + private void copyURLToFile(URL url, File file) { + ReadableByteChannel rbc=null; + try { + rbc = Channels.newChannel(url.openStream()); + } catch (IOException e1) { + // TODO Auto-generated catch block + e1.printStackTrace(); + } + FileOutputStream fos=null; + try { + fos = new FileOutputStream(file.getAbsolutePath()); + } catch (FileNotFoundException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + try { + fos.getChannel().transferFrom(rbc, 0, Long.MAX_VALUE); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + + } + + public void crawlWikiOnTopic( String filename, String lang, String destinationDir){ + List<String> entries = extractEntriesFromSpecial_Export(filename); + for(String e: entries){ + String url = wikiUrlsTemplate.replace(LANG_TEMPL, lang) + e; + saveDocFromTheWeb(url, destinationDir+e.replace(' ', '_')+".html"); + } + } + + public static void saveDocFromTheWeb(String docUrl, String destinationFile) { + try { + URL url = new URL(docUrl); + InputStream is = url.openStream(); + if (!new File(destinationFile).exists()) { + new File(destinationFile).createNewFile(); + } + + OutputStream os = new FileOutputStream(destinationFile); + + + byte[] b = new byte[2048]; + int length; + + while ((length = is.read(b)) != -1) { + os.write(b, 0, length); + } + + is.close(); + os.close(); + } catch (MalformedURLException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } catch (FileNotFoundException e) { + // TODO Auto-generated catch block 
+ e.printStackTrace(); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + + + public static void main(String[] args) { + if (args.length < 2) { + System.err + .println("Verifier accepts two arguments: [0] - input 'training_corpus' folder, " + + "[1] - output 'training_corpus' folder . " + + "All paths should include category name as a part of full path string, such as '/computing/' " ); + System.exit(0); + } + + DocClassifierTrainingSetMultilingualExtender runner = new DocClassifierTrainingSetMultilingualExtender(null); + + if (args.length==2) { + runner.sourceDir = args[0]; runner.destinationDir = args[1]; + runner.sourceDir = + "/Users/borisgalitsky/Documents/svm_tk_july2015/milkyway/training_corpus_multilingual_verif"; + runner.destinationDir = + "/Users/borisgalitsky/Documents/new_corpus/milkyway/training_corpus_new_multilingual_refined"; + + try { + runner.processDirectory( runner.sourceDir); + } catch (IOException e) { + e.printStackTrace(); + } + } else { + runner.crawlWikiOnTopic("/Users/borisgalitsky/Downloads/Wikipedia-20150730124756.xml", + //Wikipedia-20150730053619.xml", + ////Wikipedia-20150730044602.xml", + //Wikipedia-20150729103933.xml", + //Wikipedia-20150729103933.xml", + // "Wikipedia-20150728193126.xml", + //Wikipedia-20150728183128.xml", + "en", + "/Users/borisgalitsky/Documents/merged_svm_tk/milkyway/training_corpus_new_multilingual/business/wiki/wiki-new/"); + } + + + } +} + +/* +/Users/borisgalitsky/Documents/workspace/deepContentInspection/src/main/resources/docs/netflix + */ \ No newline at end of file http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/DocClassifierTrainingSetVerifier.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/DocClassifierTrainingSetVerifier.java b/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/DocClassifierTrainingSetVerifier.java new file mode 100644 index 0000000..98debe6 --- /dev/null +++ b/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/DocClassifierTrainingSetVerifier.java @@ -0,0 +1,163 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package opennlp.tools.doc_classifier; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import opennlp.tools.jsmlearning.ProfileReaderWriter; + +import org.apache.commons.io.FileUtils; +import org.apache.tika.Tika; +import org.apache.tika.exception.TikaException; +import org.json.JSONObject; + +/* + * This utility gets 'training_corpus' as input and creates a new version of training_corpus with verified files. 
http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/DocClassifierTrainingSetVerifier.java
----------------------------------------------------------------------
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/DocClassifierTrainingSetVerifier.java b/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/DocClassifierTrainingSetVerifier.java
new file mode 100644
index 0000000..98debe6
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/DocClassifierTrainingSetVerifier.java
@@ -0,0 +1,163 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.doc_classifier;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import opennlp.tools.jsmlearning.ProfileReaderWriter;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.tika.Tika;
+import org.apache.tika.exception.TikaException;
+import org.json.JSONObject;
+
+/*
+ * This utility gets a 'training_corpus' folder as input and creates a new version of the training corpus
+ * containing only verified files. Verified => classified by the existing training set as belonging to its
+ * target category only (no other categories) and not empty.
+ */
+public class DocClassifierTrainingSetVerifier {
+    public static String projectHome = new File(".").getAbsolutePath();
+    public static String resourceDir = new File(".").getAbsolutePath().replace("/.", "") + "/src/main/resources";
+
+    private static final int FRAGMENT_LENGTH = 500;
+
+    DocClassifier classifier = null;
+    private String sourceDir = null, destinationDir = null;
+
+    protected ArrayList<File> queue = new ArrayList<File>();
+    protected Tika tika = new Tika();
+
+    public DocClassifierTrainingSetVerifier(String resource) {
+        // the resource argument is currently unused
+        classifier = new DocClassifier("", new JSONObject());
+    }
+
+    protected void addFiles(File file) {
+        try {
+            if (!file.exists()) {
+                System.out.println(file + " does not exist.");
+                return;
+            }
+            if (file.isDirectory()) {
+                for (File f : file.listFiles()) {
+                    try {
+                        addFiles(f);
+                    } catch (Exception e) {
+                        // skip unreadable directory entries
+                    }
+                }
+            } else {
+                queue.add(file);
+            }
+        } catch (Exception e) {
+            // skip files we cannot stat
+        }
+    }
+
+    public void processDirectory(String fileName) throws IOException {
+        List<String[]> report = new ArrayList<String[]>();
+        report.add(new String[] { "filename", "classified as", "expected category", "confirmed?", "fragment" });
+
+        addFiles(new File(fileName));
+        // FileUtils.deleteDirectory(new File(destinationDir));
+        // FileUtils.forceMkdir(new File(destinationDir));
+
+        for (File f : queue) {
+            String content = null;
+            try {
+                System.out.println("processing "+f.getName());
+
+                // if (f.getName().indexOf(".html")<0)
+                //     continue;
+
+                // re-create the classifier for each file to reset its state
+                classifier = new DocClassifier("", new JSONObject());
+
+                content = tika.parseToString(f);
+
+                // classifier.runExpressionsOnContent(content);
+                List<String> resultsClassif = classifier.getEntityOrClassFromText(content);
+                String expectedCategory = ClassifierTrainingSetIndexer.getCategoryFromFilePath(f.getAbsolutePath());
+                boolean bConfirmed = false;
+                if (resultsClassif.size()==1 && resultsClassif.get(0).equals(expectedCategory)){
+                    String destFileName = f.getAbsolutePath().replace(sourceDir, destinationDir);
+                    FileUtils.copyFile(f, new File(destFileName));
+                    bConfirmed = true;
+                } else {
+                    System.out.println("File "+ f.getAbsolutePath() + "\n classified as "+
+                            resultsClassif.toString() + " but should be " + expectedCategory);
+                }
+                String fragment = content;
+                if (content.length() > FRAGMENT_LENGTH)
+                    fragment = content.substring(0, FRAGMENT_LENGTH);
+                fragment = fragment.replaceAll("\n", " ").trim();
+                report.add(new String[] { f.getName(), resultsClassif.toString(), expectedCategory,
+                        Boolean.toString(bConfirmed), fragment });
+                // re-written each iteration so partial results survive an aborted run
+                ProfileReaderWriter.writeReport(report, "DocClassifierMultiLingRpt.csv");
+            } catch (TikaException e) {
+                System.out.println("Tika problem with file " + f.getAbsolutePath());
+            } catch (Exception ee) {
+                ee.printStackTrace();
+            }
+        }
+
+        queue.clear();
+    }
+
+    public static void main(String[] args) {
+        if (args.length < 2) {
+            System.err.println("Verifier accepts two arguments: [0] - input 'training_corpus' folder, "
+                    + "[1] - output 'training_corpus' folder. "
+                    + "All paths should include the category name as a part of the full path string, such as '/computing/'");
+            System.exit(1);
+        }
+
+        DocClassifierTrainingSetVerifier runner = new DocClassifierTrainingSetVerifier(null);
+        runner.sourceDir = args[0];
+        runner.destinationDir = args[1];
+        // hard-coded developer defaults, kept for reference; they used to override the arguments above:
+        // runner.sourceDir = "/Users/borisgalitsky/Documents/merged_svm_tk/milkyway/training_corpus_new_multilingual";
+        //   (alternative: "/Users/borisgalitsky/Documents/svm_tk_july2015/milkyway/eval_corpus_multiling")
+        // runner.destinationDir = "/Users/borisgalitsky/Documents/svm_tk_july2015/milkyway/training_corpus_multilingual_verif";
+        //   (alternative: "/Users/borisgalitsky/Documents/svm_tk_july2015/milkyway/eval_corpus_multiling_bogus")
+
+        try {
+            runner.processDirectory(runner.sourceDir);
+        } catch (IOException e) {
+            e.printStackTrace();
+        }
+    }
+}
+
+/*
+/Users/borisgalitsky/Documents/workspace/deepContentInspection/src/main/resources/docs/netflix
+ */
\ No newline at end of file
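For orientation, a minimal sketch of the classifier call the verifier is built around; the constructor and getEntityOrClassFromText are used exactly as in the loop above, while the sample sentence is invented and the printed categories depend entirely on the bundled training set.

import java.util.List;

import org.json.JSONObject;

import opennlp.tools.doc_classifier.DocClassifier;

public class ClassifierCallSketch {
    public static void main(String[] args) {
        // same construction as in the verifier loop
        DocClassifier classifier = new DocClassifier("", new JSONObject());
        List<String> categories = classifier.getEntityOrClassFromText(
                "Cloud computing is the on-demand delivery of IT resources over the Internet.");
        // a file counts as confirmed when this list is a singleton equal to its folder-derived category
        System.out.println(categories);
    }
}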
" + + "All paths should include category name as a part of full path string, such as '/computing/' " ); + System.exit(0); + } + + DocClassifierTrainingSetVerifier runner = new DocClassifierTrainingSetVerifier(null); + runner.sourceDir = args[0]; runner.destinationDir = args[1]; + runner.sourceDir = + // "/Users/borisgalitsky/Documents/svm_tk_july2015/milkyway/eval_corpus_multiling"; + "/Users/borisgalitsky/Documents/merged_svm_tk/milkyway/training_corpus_new_multilingual"; + runner.destinationDir = + "/Users/borisgalitsky/Documents/svm_tk_july2015/milkyway/training_corpus_multilingual_verif"; + // "/Users/borisgalitsky/Documents/svm_tk_july2015/milkyway/eval_corpus_multiling_bogus"; + + try { + runner.processDirectory( runner.sourceDir); + } catch (IOException e) { + e.printStackTrace(); + } + + + } +} + +/* +/Users/borisgalitsky/Documents/workspace/deepContentInspection/src/main/resources/docs/netflix + */ \ No newline at end of file http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/enron_email_recognizer/EmailNormalizer.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/enron_email_recognizer/EmailNormalizer.java b/opennlp-similarity/src/main/java/opennlp/tools/enron_email_recognizer/EmailNormalizer.java new file mode 100644 index 0000000..01aaa12 --- /dev/null +++ b/opennlp-similarity/src/main/java/opennlp/tools/enron_email_recognizer/EmailNormalizer.java @@ -0,0 +1,100 @@ +package opennlp.tools.enron_email_recognizer; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; + +import org.apache.commons.io.FileUtils; + +public class EmailNormalizer { + protected ArrayList<File> queue = new ArrayList<File>(); + + protected void addFilesPos(File file) { + + if (!file.exists()) { + System.out.println(file + " does not exist."); + } + if (file.isDirectory()) { + for (File f : file.listFiles()) { + addFilesPos(f); + System.out.println(f.getName()); + } + } else { + queue.add(file); + } + } + + public static final String[] headers = new String[] { + "Message-ID:", + "Date:", + "From:", + "To:", + "Subject:", + "Mime-Version:", + "Content-T", + "X-From:", + "X-To:", + "X-cc:", + "X-bcc:", + "X-Folder:", + "X-Origin:", + "X-FileName", + "cc:", + "----", + }; + + public static final String[] prohibitedStrings = new String[] { + "@", "<", ">" + }; + + private String OrigFolder = "maildir_ENRON_EMAILS", NewFolder = "data"; + + + + public void normalizeAndWriteIntoANewFile(File f){ + String content=""; + try { + content = FileUtils.readFileToString(f); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + String[] lines = content.split("\n"); + StringBuffer buf = new StringBuffer(); + for(String l: lines){ + boolean bAccept = true; + for(String h: headers){ + if (l.startsWith(h)){ + bAccept = false; + } + } + for(String h: prohibitedStrings){ + if (l.indexOf(h)>0){ + bAccept = false; + } + } + if (bAccept) + buf.append(l+"\n"); + } + String directoryNew = f.getAbsolutePath().replace(OrigFolder, NewFolder); + try { + String fullFileNameNew = directoryNew +"txt"; + FileUtils.writeStringToFile(new File(fullFileNameNew), buf.toString()); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + + public void normalizeDirectory(File f){ + addFilesPos(f); + for(File e: queue){ + normalizeAndWriteIntoANewFile(e); + } + } + + public static void 
http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/enron_email_recognizer/EmailTrainingSetFormer.java
----------------------------------------------------------------------
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/enron_email_recognizer/EmailTrainingSetFormer.java b/opennlp-similarity/src/main/java/opennlp/tools/enron_email_recognizer/EmailTrainingSetFormer.java
new file mode 100644
index 0000000..9cb713f
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/enron_email_recognizer/EmailTrainingSetFormer.java
@@ -0,0 +1,67 @@
+package opennlp.tools.enron_email_recognizer;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.List;
+
+import org.apache.commons.io.FileUtils;
+
+public class EmailTrainingSetFormer {
+    static String dataDir = "/Users/bgalitsky/Downloads/",
+            //enron_with_categories/",
+            fileListFile = "cats4_11-17.txt",
+            destinationDir = "/Users/bgalitsky/Documents/ENRON/data11_17/";
+
+    // each line of the file list looks like: enron_with_categories/5/70665.cats:4,10,1
+    public static void createPosTrainingSet(){
+        try {
+            List<String> lines = FileUtils.readLines(new File(dataDir+fileListFile));
+            for(String l: lines){
+                int endOfFname = l.indexOf('.'),
+                    startOfFname = l.lastIndexOf('/');
+                String filenameOld = dataDir + l.substring(0, endOfFname)+".txt";
+
+                String content = normalize(new File(filenameOld));
+
+                String filenameNew = destinationDir + l.substring(startOfFname+1, endOfFname)+".txt";
+                // FileUtils.copyFile(new File(filenameOld), new File(filenameNew));
+                FileUtils.writeStringToFile(new File(filenameNew), content);
+            }
+        } catch (Exception e) {
+            e.printStackTrace();
+        }
+    }
+
+    private String origFolder = "maildir_ENRON_EMAILS", newFolder = "data11_17";
+
+    // same line filter as EmailNormalizer, but returns the normalized text instead of writing it
+    public static String normalize(File f){
+        String content = "";
+        try {
+            content = FileUtils.readFileToString(f);
+        } catch (IOException e) {
+            e.printStackTrace();
+        }
+        String[] lines = content.split("\n");
+        StringBuilder buf = new StringBuilder();
+        for(String l: lines){
+            boolean bAccept = true;
+            for(String h: EmailNormalizer.headers){
+                if (l.startsWith(h)){
+                    bAccept = false;
+                }
+            }
+            for(String h: EmailNormalizer.prohibitedStrings){
+                // note: a match at position 0 is not caught by this check
+                if (l.indexOf(h)>0){
+                    bAccept = false;
+                }
+            }
+            if (bAccept)
+                buf.append(l).append("\n");
+        }
+        return buf.toString();
+    }
+
+    public static void main(String[] args){
+        EmailTrainingSetFormer.createPosTrainingSet();
+    }
+}
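Finally, a small self-contained check of the filename derivation in createPosTrainingSet, using the listing-line format quoted in the comment above; the /data prefixes stand in for dataDir and destinationDir.

public class CatsLineSketch {
    public static void main(String[] args) {
        String l = "enron_with_categories/5/70665.cats:4,10,1";
        int endOfFname = l.indexOf('.');
        int startOfFname = l.lastIndexOf('/');
        // the .cats suffix and category ids are cut off, leaving the path of the message text
        String filenameOld = "/data/" + l.substring(0, endOfFname) + ".txt";
        // only the bare message number is kept for the destination file name
        String filenameNew = "/data/out/" + l.substring(startOfFname + 1, endOfFname) + ".txt";
        System.out.println(filenameOld); // /data/enron_with_categories/5/70665.txt
        System.out.println(filenameNew); // /data/out/70665.txt
    }
}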
