http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/apps/object_dedup/SimilarityAccessorBase.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/apps/object_dedup/SimilarityAccessorBase.java b/opennlp-similarity/src/main/java/opennlp/tools/apps/object_dedup/SimilarityAccessorBase.java new file mode 100755 index 0000000..c8156ea --- /dev/null +++ b/opennlp-similarity/src/main/java/opennlp/tools/apps/object_dedup/SimilarityAccessorBase.java @@ -0,0 +1,739 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.apps.object_dedup; + +import java.sql.ResultSet; +import java.sql.SQLException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import java.util.regex.Pattern; + +import opennlp.tools.similarity.apps.BingQueryRunner; +import opennlp.tools.similarity.apps.HitBase; +import opennlp.tools.similarity.apps.utils.LevensteinDistanceFinder; +import opennlp.tools.similarity.apps.utils.StringDistanceMeasurer; +import opennlp.tools.similarity.apps.utils.Utils; +import opennlp.tools.textsimilarity.TextProcessor; + +import org.apache.commons.lang.StringUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/* This is a template class for deduplicator */ + +public class SimilarityAccessorBase +{ + private static final Logger LOG = LoggerFactory.getLogger(SimilarityAccessorBase.class); + + public static final int MAX_EV_TO_RECOMM = 6; + + private List<String> namesBothSides; + + protected static final String[] englishPrepositions = new String[] { "a", "aboard", "about", "above", "absent", + "across", "after", "against", "along", "alongside", "among", "around", "as", "at", "before", "behind", "below", + "beneath", "between", "beyond", "but", "by", "despite", "down", "during", "except", "excluding", "failing", + "following", "for", "from", "in", "including", "inside", "into", "like", "near", "next", "of", "off", "on", + "onto", "only", "opposite", "out", "outside", "over", "pace", "past", "per", "since", "than", "through", "and", + "thru", "till", "to", "toward", "under", "up", "upon", "versus", "with", "within", "you", "must", "know", + "when" }; + + protected List<String> commonWordsInEventTitles = Arrays.asList(new String[] { "community", "party", "film", + "music", "exhibition", "kareoke", "guitar", "quartet", "reggae", "r&b", "band", "dj ", "piano", "pray", + "worship", "god", "training", "class", "development", "training", "class", "course", "our", "comedy", ",fun", + "musical", "group", "alliance", "session", "feeding", "introduction", "school", "conversation", "learning", + "nursery", "unity", "trivia", "chat", 
"conference", "tuition", "technology", "teen", "communication", + "reception", "management", "beginner", "beginning", "collabora", "reuninon", "political", "course", "age", + "ages", "through", "grade", "networking", "workshop", "demonstration", "tuning", "program", "summit", + "convention", "day", "night", "one", "two", "outfest", "three", "online", "writing", "seminar", "coach", + ",expo", "advanced", "beginner", "intermediate", "earn", "free", "ii", "iii", "skills", "skill", "artist", + "summer", "winter", "autumn", "spring", "camp", "vacation", "miscrosoft", "kid", "child", "kids", "children", + "every", "everyone", "dancer", "dancers", "senior", "seniors", "basic", "elementary", "outfest", "2008", + "2009", "2010", "2011", "2012", "monday", "tuesday", "wednesday", "thirsday", "friday", "saturday", "sunday", + "mondays", "tuesdays", "wednesdays", "thirsdays", "fridays", "saturdays", "sundays", "men" // ? + }); + + private BingQueryRunner webSearch = new BingQueryRunner(); + + private StringDistanceMeasurer stringDistanceMeasurer = new StringDistanceMeasurer(); + + + public SimilarityAccessorBase() + { + } + + + public void init() + { + namesBothSides = getWordsThatShouldBeOnBothSidesEvents(); + } + + protected List<String> removeDollarWordAndNonAlphaFromList(List<String> list) + { + List<String> result = new ArrayList<String>(); + Pattern p = Pattern.compile("^\\$(\\d{1,3}(\\,\\d{3})*|(\\d+))(\\.\\d{2})?$"); + for (String w : list) + { + if (!(p.matcher(w).find()) && StringUtils.isAlphanumeric(w) && (w.length() >= 3 || !StringUtils.isAlpha(w))) + result.add(w); + } + return result; + } + + + public List<String> getWordsThatShouldBeOnBothSidesEvents() + { +/* + names.addAll(Arrays.asList(new String[] { "woman", "man", "women", "men", "womans", "mans", "womens", "mens", + "boy", "girl", "boys", "girls", "men's", "women's", "woman's", "ice", // for disney + "flight", "intermediate", "advanced", "beginner", + // "tour", TODO special consideration + "helicopter", "sexual", "junior", "jr" })); + */ + return null; + + } + + protected Boolean applySemanticNameSimilarityRule(Object es1, + Object es2) + { + + //TODO check attributes of objects + /* + if (!(es1.getVenueName().endsWith(es2.getVenueName()) || es2.getVenueName().endsWith(es1.getVenueName()))) + return false; + if (Math.abs(es1.getStarttime().getTime() - es2.getStarttime().getTime()) > 100000) + return false; + */ + + return true; + + } + + // this rule extract "OF" part and treats it as a whole expression + protected void applySubPhraseExtractionRule(List<String> name1Tokens, List<String> name2Tokens) + { + if (name1Tokens.indexOf("of") > 0 && name2Tokens.indexOf("of") > 0) + { + name1Tokens = extractMainNounPhrase(name1Tokens); + name2Tokens = extractMainNounPhrase(name2Tokens); + } + } + + private Boolean attemptShortTitlesSimilarityInWebSpace(String name1, String name2) + { + + // first delimeter processing + String name1v = name1.replace("'", "").replace("-", " "); + String name2v = name2.replace("'", "").replace("-", " "); + String name1vv = name1.replace("'", ""); + String name2vv = name2.replace("'", ""); + String name1vvv = name1.replace("-", " "); + String name2vvv = name2.replace("-", " "); + + if (name1.startsWith(name2) || name1vv.startsWith(name2) || name1.startsWith(name2v) + || name1.startsWith(name2vv) || name1.startsWith(name2vvv) || name1v.startsWith(name2v) + || name1v.startsWith(name2vv) || name2.startsWith(name1) || name2vv.startsWith(name1) + || name2.startsWith(name1v) || name2vvv.startsWith(name1vv) || 
name2.startsWith(name1vvv) + || name2v.startsWith(name1v) || name2v.startsWith(name1vv) || name1.endsWith(name2) + || name1vv.endsWith(name2) || name1.endsWith(name2v) || name1.endsWith(name2vv) || name1.endsWith(name2vvv) + || name1v.endsWith(name2v) || name1v.endsWith(name2vv) || name2.endsWith(name1) || name2vv.endsWith(name1) + || name2.endsWith(name1v) || name1vvv.endsWith(name2vv) || name2.endsWith(name1vvv) + || name2v.endsWith(name1v) || name2v.endsWith(name1vv)) + { + LOG.info("Found fuzzy substring of name1 and name2"); + return true; + } + if (name1.length() > 12 && name2.length() > 12) + return false; + + return areNamesSemanticallyCloseInWebSearchSpace(name1, name2, 0.8f, false).isDecision(); + + } + + public Boolean applyBothSidesRuleEvent(String name1, String name2) + { + List<String> name1Tokens = TextProcessor.fastTokenize(name1.toLowerCase(), false); + List<String> name2Tokens = TextProcessor.fastTokenize(name2.toLowerCase(), false); + // get unique names + List<String> name1TokensC = new ArrayList<String>(name1Tokens), name2TokensC = new ArrayList<String>( + name2Tokens); + ; + name1TokensC.removeAll(name2Tokens); + name2TokensC.removeAll(name1Tokens); + // get all unique names + name1TokensC.addAll(name2TokensC); + + name1TokensC.retainAll(namesBothSides); + name1Tokens.retainAll(name2Tokens); + + if ((name1TokensC.size() > 0 && name1Tokens.size() < 3) || (name1TokensC.size() > 1 && name1Tokens.size() < 5)) + { // 'mens == men; case !(name1TokensC.size()==2 && (name1TokensC.get(0).indexOf(name1TokensC.get(1))>-1 || + // name1TokensC.get(1).indexOf(name1TokensC.get(0))>-1 ))){ + LOG.info("Found required common word present on one side and not on the other: " + name1TokensC.toString() + + " and less than 3 keywords overlap (or >1 common words and less than 5 overl"); + return false; + } + else + return true; + } + + protected List<String> tokenizeAndStem(String input) + { + + List<String> results = new ArrayList<String>(); + List<String> toks = TextProcessor.fastTokenize(input.toLowerCase(), false); + for (String word : toks) + { + try + { + if (word.equals("theatre")) + word = "theater"; + results.add(word); + } + catch (Exception e) + { + results.add(word); + } + } + return results; + } + + protected List<String> stemList(List<String> toks) + { + + List<String> results = new ArrayList<String>(); + for (String word : toks) + { + try + { + if (word.equals("theatre")) + word = "theater"; + results.add(word); + } + catch (Exception e) + { + results.add(word); + } + } + return results; + } + + public List<String> removeVenuePart(ArrayList<String> toks) + { + List<String> results = new ArrayList<String>(); + boolean bVenuePart = false; + for (String word : toks) + { + // beginning of venue part + if (word.equals("at") || word.equals("@")) + bVenuePart = true; + // end of venue part + if (!StringUtils.isAlphanumeric(word) || word.startsWith("<punc")) + bVenuePart = false; + + if (!bVenuePart && !word.startsWith("<punc")) + results.add(word); + + } + return results; + } + + protected boolean isCapitalized(String lookup) + { + String[] titleWords = lookup.split(" "); + int count = 0; + for (String word : titleWords) + { + if (word.length() < 2) // '-', '|', ':' + break; + + if (word.equals(word.toLowerCase()) && (!Arrays.asList(englishPrepositions).contains(word)) + && word.length() > 3 && StringUtils.isAlphanumeric(word)) + continue; // was return false; + if (count > 3) + break; + count++; + } + return true; + } + + protected List<String> extractMainNounPhrase(List<String> 
name1Tokens) + { + List<String> results = new ArrayList<String>(); + int ofPos = name1Tokens.indexOf("of"); + List<String> ofList = name1Tokens.subList(ofPos + 1, name1Tokens.size() - 1); + // now iterate till next preposition towards the end of noun phrase + for (String preposCand : ofList) + { + if (Arrays.asList(englishPrepositions).contains(preposCand)) + break; + results.add(preposCand); + } + return results; + + } + + public boolean verifyEventAttributesPost(List<String> name1Tokens, List<String> name2Tokens) + { + String[] attributeNamesPost = { "age", "ages", "game", "games", "grade", "grades", "level", "levels", "vs", + "vs.", "versus", "pottery", "competition", "contest", "skill", "skills", "day", "only", "basic", "class", + "completed", + // "tour", ? + "advanced", "beginner", "intermediate", "flight", "workshop", "latin", "adobe", "ballet", "dinner", + "breakfast", "lunch", "summer", // "canyon" + "tfestival", "festival", "mfestival" }; + try + { + for (String attr : attributeNamesPost) + { + + int agePos1 = name1Tokens.indexOf(attr); + int agePos2 = name2Tokens.indexOf(attr); + if (agePos1 > -1 && agePos2 > -1 && agePos1 < name1Tokens.size() - 1 + && agePos2 < name2Tokens.size() - 1) + { + double dist = LevensteinDistanceFinder.levensteinDistance(name1Tokens.get(agePos1 + 1), + name2Tokens.get(agePos2 + 1), 1, 10, 1, 10); + if (!name1Tokens.get(agePos1 + 1).equalsIgnoreCase(name2Tokens.get(agePos2 + 1)) + && (dist > 2.99 || name1Tokens.get(agePos1 + 1).length() < 4)) + { + LOG.info("Found disagreement in the attrib value for " + attr + " value = " + + name1Tokens.get(agePos1 + 1) + " <=> " + name2Tokens.get(agePos2 + 1)); + return false; + } + } + } + } + catch (Exception e) + { + // TODO Auto-generated catch block + e.printStackTrace(); + } + return true; + } + + public boolean verifyEventAttributesPre(List<String> name1Tokens, List<String> name2Tokens) + { + + String[] attributeNamesPre = { "hour", "vs", "vs.", "versus", "pottery", "program", "day", "only", + // dance styles followed by a param + "swing", "rumba", "samba", "doble", + "violence", // + // "level", + "class", "classes", "kid", "kids", "test", "west", "summer_camp", "session", "tfestival", "festival", + "mfestival" }; + try + { + for (String attr : attributeNamesPre) + { + int agePos1 = name1Tokens.indexOf(attr); + int agePos2 = name2Tokens.indexOf(attr); + if (agePos1 > 0 && agePos2 > 0) + { // not the first word is attr name + if (!name1Tokens.get(agePos1 - 1).equalsIgnoreCase(name2Tokens.get(agePos2 - 1)) + && (agePos1 < 2 || !name1Tokens.get(agePos1 - 2).equalsIgnoreCase(name2Tokens.get(agePos2 - 1))) + && + // ((agePos1<2 && agePos2 <2) || !name1Tokens.get(agePos1 - + // 2).equalsIgnoreCase(name2Tokens.get(agePos2 - 2 ))) && + (agePos2 < 2 || !name1Tokens.get(agePos1 - 1).equalsIgnoreCase(name2Tokens.get(agePos2 - 2))) + + ) + { + LOG.info("Found disagreement in the attrib value for " + attr + " value = " + + name1Tokens.get(agePos1 - 1) + " and " + name2Tokens.get(agePos2 - 1)); + return false; + } + } + } + } + catch (Exception e) + { + e.printStackTrace(); + } + return true; + } + + protected boolean bDifferentGroupOneSubnameOfAnother(String name1, String name2) + { + // first check a special case that both name1 and name2 are DIFFERENT groups at last.fm + Map<String, Integer> map1 = null; //LastFM_APIManager.extractTagsForArtist(name1); + Map<String, Integer> map2 = null; //LastFM_APIManager.extractTagsForArtist(name2); + if (map1 != null && map2 != null && map1.size() > 0 && map2.size() > 0) + 
map1.entrySet().removeAll(map2.entrySet()); + if (map1.size() > 0) // same or subset of tags => different groups + return true; + + return false; + } + + public boolean applyBothSidesRule(String name1, String name2) + { + List<String> name1Tokens = TextProcessor.fastTokenize(name1.toLowerCase(), false); + List<String> name2Tokens = TextProcessor.fastTokenize(name2.toLowerCase(), false); + // get unique names + List<String> name1TokensC = new ArrayList<String>(name1Tokens), name2TokensC = new ArrayList<String>( + name2Tokens); + ; + name1TokensC.removeAll(name2Tokens); + name2TokensC.removeAll(name1Tokens); + // get all unique names + name1TokensC.addAll(name2TokensC); + + name1TokensC.retainAll(namesBothSides); + if (name1TokensC.size() > 0) + return false; + else + return true; + } + + private boolean succeededMenWomenSportsRule(String name1, String name2) + { + List<String> name1Tokens = TextProcessor.fastTokenize(name1.toLowerCase(), false); + List<String> name2Tokens = TextProcessor.fastTokenize(name2.toLowerCase(), false); + if (name1Tokens.contains("men") || name2Tokens.contains("men") || name1Tokens.contains("women") + || name2Tokens.contains("women") || name1Tokens.contains("disney") || name2Tokens.contains("disney")) + { // all words should be the + // same + name1Tokens.removeAll(name2Tokens); + name1Tokens.removeAll(Arrays.asList(englishPrepositions)); + name1Tokens.removeAll(Arrays.asList(commonWordsInEventTitles)); + if (name1Tokens.size() < 1) + return true; + + return false; + } + else + return true; + + } + + private boolean succeededSpecialGroupsSymphoniesRule(String name1, String name2) + { + List<String> name1Tokens = TextProcessor.fastTokenize(name1.toLowerCase(), false); + List<String> name2Tokens = TextProcessor.fastTokenize(name2.toLowerCase(), false); + if (name1Tokens.contains("orchestra") || name2Tokens.contains("symphony") || name2Tokens.contains("orchestra") + || name1Tokens.contains("symphony") || name2Tokens.contains("band") || name1Tokens.contains("band") + || name2Tokens.contains("trio") || name1Tokens.contains("trio") || name1Tokens.contains("soleil") + || name2Tokens.contains("soleil") || name1Tokens.contains("disney") || name2Tokens.contains("disney") + || name1Tokens.contains("lang") || name2Tokens.contains("lang")) // special group 'lang lang' + { // all words should be the + // same + List<String> name1TokensClone = new ArrayList<String>(name1Tokens); + name1Tokens.removeAll(name2Tokens); + name2Tokens.removeAll(name1TokensClone); + name1Tokens.addAll(name2Tokens); + name1Tokens.removeAll(Arrays.asList(this.englishPrepositions)); + // name1Tokens.removeAll(Arrays.asList(this.commonWordsInEventTitles)); + if (name1Tokens.size() < 1) + return true; + + return false; + } + else + return true; + + } + + public int getAttemptedNameMerge(String name1, String name2) + { + name1 = name1.replaceAll("[a-z][A-Z]", "$0&$0").replaceAll(".&.", " "); + ; // suspected word merge if higher case is in the middle of word + name2 = name2.replaceAll("[a-z][A-Z]", "$0&$0").replaceAll(".&.", " "); + + name1 = name1.toLowerCase(); + name2 = name2.toLowerCase(); + if (name1.equals(name2) || name1.startsWith(name2) || name2.startsWith(name1) || name1.endsWith(name2) + || name1.endsWith(name2) || name1.indexOf(name2) > -1 || name1.indexOf(name2) > -1) // ?? 
+ return 2; + String name2r = name2.replace(" ", ""); + if (name1.equals(name2r) || name1.startsWith(name2r) || name1.startsWith(name2r) || name1.endsWith(name2r) + || name1.endsWith(name2r)) + return 1; + String name1r = name1.replace(" ", ""); + if (name1r.equals(name2r) || name1r.startsWith(name2r) || name1r.startsWith(name2) || name1r.endsWith(name2r) + || name1r.endsWith(name2r) || name2r.equals(name1r) || name2r.startsWith(name1r) + || name2r.startsWith(name1) || name2r.endsWith(name1r) || name2r.endsWith(name2) + + ) + return 1; + + if (stringDistanceMeasurer.measureStringDistance(name1, name2) > 0.95) + return 2; + if (stringDistanceMeasurer.measureStringDistance(name1, name2) > 0.70) + return 1; + return 0; + } + + private String normalizeGenderAndOtherAttributes(String name1) + { + name1 = Utils.convertToASCII(name1.replace("/", " ").replace("w/", "with ")).replace('!', ' ').toLowerCase(); + + name1 = name1.replace("woman", "women").replace("womans", "women").replace("womens", "women") + .replace("women's", "women").replace("woman's", "women"); + name1 = name1.replace(" man ", " men ").replace(" mans ", " men ").replace(" men's ", " men ") + .replace(" man's ", " men ").replace(" mens ", " men ").replace("summer camp", "summer_camp") + .replace("gaea theatre festival", "tfestival"); // need regexp for this + return name1; + } + + /* + * Main semantic similarity function which applies boundary cases rule and focus on web mining rule The main + * criteria for a commonality between titles: to form an entity, searchable on the web + */ + public DedupResult areNamesSemanticallyCloseWebMineCommonPart(String name1, String name2, String venue) + { + // normalize gender + name1 = normalizeGenderAndOtherAttributes(name1); + name2 = normalizeGenderAndOtherAttributes(name2); + + Boolean bShortTitlesSimilarInWebSpace = attemptShortTitlesSimilarityInWebSpace(name1, name2); + if (bShortTitlesSimilarInWebSpace) + return new DedupResult("Accepted as short title by web mining", 2, true); + + StringBuffer reason = new StringBuffer(); + List<String> venueToks = removeVenuePart(TextProcessor.fastTokenize(venue.toLowerCase(), false)); + + LOG.info("\nComputing similarity between name = '" + name1 + "' and name = '" + name2 + "'"); + // convert titles into token lists + List<String> name1Tokens = removeVenuePart(TextProcessor.fastTokenize(name1.toLowerCase(), true)); + List<String> name2Tokens = removeVenuePart(TextProcessor.fastTokenize(name2.toLowerCase(), true)); + // applySubPhraseExtractionRule() + Boolean bSameAttrib = verifyEventAttributesPost(name1Tokens, name2Tokens) + && verifyEventAttributesPre(name1Tokens, name2Tokens); + if (!bSameAttrib) + { + LOG.info("similar events but different attributes"); + return new DedupResult("similar events but different attributes", 0, false); + } + + boolean bothSodesSuccess = applyBothSidesRuleEvent(name1, name2); + if (!bothSodesSuccess) + { + return new DedupResult("Failed common words test for sports", 0, false); + } + + float dist = (float) LevensteinDistanceFinder.levensteinDistance(name1, name2, 1, 10, 1, 10); + if (dist < 5.1) + { + LOG.info("Found low LevensteinDistance for name1 and name2"); + return new DedupResult("Found low LevensteinDistance", 2, true); + } + + int nameMergeScore = getAttemptedNameMerge(name1, name2); + if (nameMergeScore > 0) + { + LOG.info("Found low NameMerge Distance for name1 and name2"); + return new DedupResult("Found low NameMerge Distance", 2, true); + } + + // todo take into account order + // form common sub-list of 
tokens + name1Tokens.retainAll(name2Tokens); + name1Tokens.removeAll(venueToks); + + name1Tokens.removeAll(commonWordsInEventTitles); + name1Tokens.removeAll(Arrays.asList(englishPrepositions)); + name1Tokens = removeDollarWordAndNonAlphaFromList(name1Tokens); + // todo : to use full string measure + // boundary case: too many words => just do counts + float commonPortion = (float) name1Tokens.size() / (float) name2Tokens.size(); + if (commonPortion > 0.8 || name1Tokens.size() >= 4) + { // after typical + // title words + // are revomed 4 + // looks OK + LOG.info("Accepted since substantial common part"); + return new DedupResult("Accepted since substantial common part", Math.max((int) (commonPortion * 5.0), 2), + true); + } + // boundary case: no overlap + if (name1Tokens.size() < 1) + { + LOG.info("Rejected since nothing in common"); + return new DedupResult("Rejected since nothing in common", 0, false); + } + // get from list of tokens back to words to get search expression + String entityExpression = name1Tokens.toString().replace('[', ' ').replace(']', ' ').replace(',', ' ') + .replace(" ", " ").trim(); + /* + * // now try name merge reduced strings String entityExpression1 = name1TokensC.toString().replace('[', + * ' ').replace(']', ' ').replace(',', ' ') .replace(" ", " ").trim(); String entityExpression2 = + * name2Tokens.toString().replace('[', ' ').replace(']', ' ').replace(',', ' ') .replace(" ", " ").trim(); + * + * nameMergeScore = getAttemptedNameMerge(entityExpression1, entityExpression2); if (nameMergeScore>0){ + * LOG.info("Found low NameMerge Distance for REDUCED name1 and name2"); return new + * DedupResult("Found low NameMerge Distance REDUCED", 2, true); + * + * } + */ + + // Before doing web mining, make sure overlap between titles is NOT a + // set of common english words (use the vocabulary) + // if all words are common, then NOT an entity + if (name1Tokens.size() < 2) + { + boolean bCommonEnglishWord = false; + for (String word : name1Tokens) + { + // if (stopList.isCommonWord(word) /*&& mostFrequent1000Words.isMostFrequent1000Word(word)*/) + // bCommonEnglishWord = true; + } + + if (bCommonEnglishWord) + { + LOG.info("Rejected common entity: common word = " + entityExpression); + return new DedupResult("Rejected since common entity is common English word = " + entityExpression, 0, + false); + } + } + // accept common expression + LOG.info("Formed common entity = " + entityExpression); + reason.append("Formed common entity = " + entityExpression + "\n"); + // now go to the web / bing api with this common expression + List<HitBase> searchResult = webSearch.runSearch(entityExpression); + float entityScore = 0f; + if (searchResult != null) + { + int count = 0; + for (HitBase item : searchResult) + { + String lookup = item.getTitle(); + LOG.info("Bing hit title = '" + lookup + "'"); + reason.append("Bing hit title = '" + lookup + "'\n"); + if (count > 4) + break; + count++; + // if occurrence is not capitalized then rejected, do not take + // into account in score + if (!isCapitalized(lookup)) + { + LOG.info("Rejected hit title since not capitalized"); + reason.append("Rejected hit title since not capitalized\n"); + continue; + } + + /* + * if (lookup.indexOf('-')>0 ){ lookup = lookup.split("-")[0]; } + */ + // now compute overlap between what found on the web for hit's + // title and the common expression between events + List<String> lookupTokens = tokenizeAndStem(lookup); + lookupTokens.retainAll(stemList(name1Tokens)); + if (lookupTokens.size() >= 
name1Tokens.size()) + // increment score if found hit title is acceptable + entityScore += 1.0; + else + { + LOG.info("Found hit title " + lookupTokens + " does not cover comonality expr = " + name1Tokens); + entityScore += 0.25; + + } + + } + } + return new DedupResult(reason.toString(), (int) entityScore, entityScore > 1.0); + } + + public DedupResult areNamesSemanticallyCloseInWebSearchSpace(String name1, String name2, Float thresh, boolean bStem) + { + + if (thresh == null || thresh == 0f) + { + thresh = 0.8f; + } + + // normalize gender + name1 = normalizeGenderAndOtherAttributes(name1); + name2 = normalizeGenderAndOtherAttributes(name2); + + StringBuffer reason = new StringBuffer(); + + boolean bSportsOrOrchestra = !succeededMenWomenSportsRule(name1, name2); + if (bSportsOrOrchestra) + return new DedupResult("Sports rule: different teams or teams of different venues", 0, false); + + bSportsOrOrchestra = !succeededSpecialGroupsSymphoniesRule(name1, name2); + if (bSportsOrOrchestra) + return new DedupResult("SpecialGroupsSymphoniesRule: different circus/band", 0, false); + + LOG.info("\nComputing similarity between name = '" + name1 + "' and name = '" + name2 + "'"); + + List<String> name1Tokens = TextProcessor.fastTokenize(name1.toLowerCase(), true); + List<String> name2Tokens = TextProcessor.fastTokenize(name2.toLowerCase(), true); + Boolean bSameAttrib = verifyEventAttributesPost(name1Tokens, name2Tokens) + && verifyEventAttributesPre(name1Tokens, name2Tokens); + if (!bSameAttrib) + { + LOG.info("similar events but different attributes"); + return new DedupResult("similar events but different attributes", 0, false); + } + + List<HitBase> searchResult1 = webSearch.runSearch(name1); + List<HitBase> searchResult2 = webSearch.runSearch(name2); + int score = 0; + if (searchResult1 != null && searchResult2 != null) + { + for (HitBase item1 : searchResult1) + { + if (item1.getUrl().indexOf("myspace") > -1 || item1.getUrl().indexOf("wiki") > -1) + continue; + for (HitBase item2 : searchResult2) + { + String lookup1 = item1.getTitle().replace("Facebook", "").replace("LinkedIn", "") + .replace("MySpace", ""); + String lookup2 = item2.getTitle().replace("Facebook", "").replace("LinkedIn", "") + .replace("MySpace", ""); + double d = 0; + if (bStem) + d = stringDistanceMeasurer.measureStringDistance(lookup1, lookup2); + else + d = stringDistanceMeasurer.measureStringDistanceNoStemming(lookup1, lookup2); + if (d > thresh) // 0.8) + { + + reason.append("Found common search result title for group names '" + lookup1 + " < > " + + lookup2 + " sim = " + d + "\n"); + LOG.info(("Found common search result title for group names '" + lookup1 + " < > " + lookup2 + + " sim = " + d)); + score++; + } + + } + } + } + + Boolean bothSidesSuccess = applyBothSidesRule(name1, name2); + if (!bothSidesSuccess) + { + score = 1; + reason.append("Failed common words test for sports"); + } + if (score > 0) + { + Boolean bDifferentGroup = bDifferentGroupOneSubnameOfAnother(name1, name2); + if (bDifferentGroup) + { + score = 1; + reason.append("Failed common words test for sports"); + } + } + return new DedupResult(reason.toString(), score, score > 1); + } +}
http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier.zip ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier.zip b/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier.zip new file mode 100644 index 0000000..2227504 Binary files /dev/null and b/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier.zip differ http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/ClassifierTrainingSetIndexer.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/ClassifierTrainingSetIndexer.java b/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/ClassifierTrainingSetIndexer.java new file mode 100644 index 0000000..ad851e3 --- /dev/null +++ b/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/ClassifierTrainingSetIndexer.java @@ -0,0 +1,266 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package opennlp.tools.doc_classifier; + +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import opennlp.tools.jsmlearning.ProfileReaderWriter; + +import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.StringField; +import org.apache.lucene.document.TextField; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.FSDirectory; +import org.apache.lucene.util.Version; +import org.apache.tika.Tika; + +public class ClassifierTrainingSetIndexer { + public static String resourceDir = new File(".").getAbsolutePath().replace("/.", "") + "/src/main/resources"; + public static String INDEX_PATH = "/classif", + CLASSIF_TRAINING_CORPUS_PATH = "/training_corpus"; + protected ArrayList<File> queue = new ArrayList<File>(); + Tika tika = new Tika(); + + IndexWriter indexWriter = null; + protected static String[] domains = new String[] { "legal", "health", + "computing", "engineering", "business" }; + private String absolutePathTrainingSet=null; + + public ClassifierTrainingSetIndexer() { + + try { + initIndexWriter(resourceDir); + } catch (Exception e) { + e.printStackTrace(); + } + } + + public ClassifierTrainingSetIndexer(String absolutePathTrainingSet) { + this.absolutePathTrainingSet = absolutePathTrainingSet; + try { + initIndexWriter(resourceDir); + } catch (Exception e) { + e.printStackTrace(); + } + } + + public void indexTrainingSet() { + + try { + if (absolutePathTrainingSet==null) + indexFileOrDirectory(resourceDir + + CLASSIF_TRAINING_CORPUS_PATH); + else + indexFileOrDirectory( + this.absolutePathTrainingSet); + + } catch (IOException e1) { + e1.printStackTrace(); + } + try { + indexWriter.commit(); + } catch (IOException e) { + e.printStackTrace(); + } + } +/* + private void indexTrainingSample(String text, String flag, int id) + throws IOException { + + Document doc = new Document(); + doc.add(new StringField("id", new Integer(id).toString(), + Field.Store.YES)); + doc.add(new TextField("text", text.toLowerCase(), Field.Store.YES)); + doc.add(new StringField("class", flag.toLowerCase(), Field.Store.YES)); + indexWriter.addDocument(doc); + + } +*/ + private void addFiles(File file) { + + if (!file.exists()) { + System.out.println(file + " does not exist."); + } + if (file.isDirectory()) { + for (File f : file.listFiles()) { + if (f.getName().startsWith(".")) + continue; + addFiles(f); + System.out.println(f.getName()); + } + } else { + queue.add(file); + + } + } + + // index last folder name, before filename itself + + public void indexFileOrDirectory(String fileName) throws IOException { + addFiles(new File(fileName)); + + List<File> files = new ArrayList<File>(queue); + for (File f : files) { + if (!f.getName().endsWith(".xml")) { + + try { + Document doc = new Document(); + + String name = f.getPath(); + String className = null; + for (String d : domains) { + if (name.indexOf(d) > -1) { + className = d; + break; + } + } + + try { + doc.add(new TextField("text", tika.parse(f))); + } catch (Exception e1) { + e1.printStackTrace(); + } + + doc.add(new StringField("path", f.getPath(), + Field.Store.YES)); + doc.add(new StringField("class", className, Field.Store.YES)); + try { + + indexWriter.addDocument(doc); + + } catch (Exception e) { + 
e.printStackTrace(); + System.out.println("Could not add: " + f); + } + } catch (Exception ee) { + ee.printStackTrace(); + } + } else { // for xml files + try { + Document doc = new Document(); + + String name = new String(f.getPath()); + String[] nparts = name.split("/"); + int len = nparts.length; + name = nparts[len - 2]; + + FileReader fr = new FileReader(f); + doc.add(new TextField("text", fr)); + + doc.add(new StringField("path", f.getPath(), + Field.Store.YES)); + doc.add(new StringField("class", name, Field.Store.YES)); + try { + + indexWriter.addDocument(doc); + + } catch (Exception e) { + e.printStackTrace(); + System.out.println("Could not add: " + f); + } finally { + fr.close(); + } + } catch (Exception ee) { + ee.printStackTrace(); + } + } + + queue.clear(); + } + } + + public static String getIndexDir() { + try { + return new File(".").getCanonicalPath() + INDEX_PATH; + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + return null; + } + } + + private void initIndexWriter(String dir) throws Exception { + + Directory indexDir = null; + + try { + indexDir = FSDirectory.open(new File(dir + INDEX_PATH)); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + + Version luceneVersion = Version.LUCENE_46; + IndexWriterConfig luceneConfig = new IndexWriterConfig(luceneVersion, + new StandardAnalyzer(luceneVersion)); + luceneConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE); + + indexWriter = new IndexWriter(indexDir, luceneConfig); + + } + + void close() { + try { + indexWriter.commit(); + indexWriter.close(); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + + public static String getCategoryFromFilePath(String path){ + String className = null; + for (String d : domains) { + if (path.indexOf("/"+d+"/") > -1) { + className = d; + break; + } + } + return className; + } + + public static void main(String[] args) { + ClassifierTrainingSetIndexer indexer = null; + if (args!=null && args.length==1){ + String relativeDirWithTrainingCorpus = args[0]; + // expect corpus relative to 'resource' directory, such as 'training_corpus' + if (!relativeDirWithTrainingCorpus.startsWith("/")) + relativeDirWithTrainingCorpus = "/"+relativeDirWithTrainingCorpus; + indexer = new ClassifierTrainingSetIndexer(relativeDirWithTrainingCorpus); + } else { + // expect corpus in the default location, "/training_corpus" in the resource directory + indexer = new ClassifierTrainingSetIndexer(); + } + try { + indexer.indexTrainingSet(); + } catch (Exception e) { + e.printStackTrace(); + } + indexer.close(); + } + +} http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/DocClassifier.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/DocClassifier.java b/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/DocClassifier.java new file mode 100644 index 0000000..b4abbb9 --- /dev/null +++ b/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/DocClassifier.java @@ -0,0 +1,306 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package opennlp.tools.doc_classifier; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Scanner; + +import opennlp.tools.similarity.apps.utils.CountItemsList; +import opennlp.tools.similarity.apps.utils.ValueSortMap; +import opennlp.tools.textsimilarity.TextProcessor; + +import org.apache.commons.lang.StringUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.queryparser.classic.ParseException; +import org.apache.lucene.queryparser.classic.QueryParser; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.ScoreDoc; +import org.apache.lucene.search.TopDocs; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.FSDirectory; +import org.apache.lucene.util.Version; +import org.json.JSONObject; + +public class DocClassifier { + public static final String DOC_CLASSIFIER_KEY = "doc_class"; + public static String resourceDir = null; + public static final Log logger = LogFactory.getLog(DocClassifier.class); + private Map<String, Float> scoredClasses = new HashMap<String, Float>(); + + + public static Float MIN_TOTAL_SCORE_FOR_CATEGORY = 0.3f; //3.0f; + protected static IndexReader indexReader = null; + protected static IndexSearcher indexSearcher = null; + // resource directory plus the index folder + private static final String INDEX_PATH = resourceDir + + ClassifierTrainingSetIndexer.INDEX_PATH; + + // http://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm + private static final int MAX_DOCS_TO_USE_FOR_CLASSIFY = 10, // 10 similar + // docs for + // nearest + // neighbor + // settings + + MAX_CATEG_RESULTS = 2; + private static final float BEST_TO_NEX_BEST_RATIO = 2.0f; + // to accumulate classif results + private CountItemsList<String> localCats = new CountItemsList<String>(); + private int MAX_TOKENS_TO_FORM = 30; + private String CAT_COMPUTING = "computing"; + public static final String DOC_CLASSIFIER_MAP = "doc_classifier_map"; + private static final int MIN_SENTENCE_LENGTH_TO_CATEGORIZE = 60; // if + // sentence + // is + // shorter, + // should + // not + // be + // used + // for + // classification + private static final int MIN_CHARS_IN_QUERY = 30; // if combination of + // keywords is shorter, + // should not be used + // for classification + + // these are categories from the index + public static final String[] categories = new String[] { "legal", "health", + "finance", "computing", "engineering", "business" }; + + static { + synchronized (DocClassifier.class) { + Directory 
indexDirectory = null; + + try { + indexDirectory = FSDirectory.open(new File(INDEX_PATH)); + } catch (IOException e2) { + logger.error("problem opening index " + e2); + } + try { + indexReader = DirectoryReader.open(indexDirectory); + indexSearcher = new IndexSearcher(indexReader); + } catch (IOException e2) { + logger.error("problem reading index \n" + e2); + } + } + } + + public DocClassifier(String inputFilename, JSONObject inputJSON) { + scoredClasses = new HashMap<String, Float>(); + } + + /* returns the class name for a sentence */ + private List<String> classifySentence(String queryStr) { + + List<String> results = new ArrayList<String>(); + // too short of a query + if (queryStr.length() < MIN_CHARS_IN_QUERY) { + return results; + } + + Analyzer std = new StandardAnalyzer(Version.LUCENE_46); + QueryParser parser = new QueryParser(Version.LUCENE_46, "text", std); + parser.setDefaultOperator(QueryParser.Operator.OR); + Query query = null; + try { + query = parser.parse(queryStr); + + } catch (ParseException e2) { + + return results; + } + TopDocs hits = null; // TopDocs search(Query query, int n) + // Finds the top n hits for query. + try { + hits = indexSearcher + .search(query, MAX_DOCS_TO_USE_FOR_CLASSIFY + 2); + } catch (IOException e1) { + logger.error("problem searching index \n" + e1); + } + logger.debug("Found " + hits.totalHits + " hits for " + queryStr); + int count = 0; + + + for (ScoreDoc scoreDoc : hits.scoreDocs) { + Document doc = null; + try { + doc = indexSearcher.doc(scoreDoc.doc); + } catch (IOException e) { + logger.error("Problem searching training set for classif \n" + + e); + continue; + } + String flag = doc.get("class"); + + Float scoreForClass = scoredClasses.get(flag); + if (scoreForClass == null) + scoredClasses.put(flag, scoreDoc.score); + else + scoredClasses.put(flag, scoreForClass + scoreDoc.score); + + logger.debug(" <<categorized as>> " + flag + " | score=" + + scoreDoc.score + " \n text =" + doc.get("text") + "\n"); + + if (count > MAX_DOCS_TO_USE_FOR_CLASSIFY) { + break; + } + count++; + } + try { + scoredClasses = ValueSortMap.sortMapByValue(scoredClasses, false); + List<String> resultsAll = new ArrayList<String>( + scoredClasses.keySet()), resultsAboveThresh = new ArrayList<String>(); + for (String key : resultsAll) { + if (scoredClasses.get(key) > MIN_TOTAL_SCORE_FOR_CATEGORY) + resultsAboveThresh.add(key); + else + logger.debug("Too low score of " + scoredClasses.get(key) + + " for category = " + key); + } + + int len = resultsAboveThresh.size(); + if (len > MAX_CATEG_RESULTS) + results = resultsAboveThresh.subList(0, MAX_CATEG_RESULTS); // get + // maxRes + // elements + else + results = resultsAboveThresh; + } catch (Exception e) { + logger.error("Problem aggregating search results\n" + e); + } + if (results.size() < 2) + return results; + + // if two categories, one is very high and another is relatively low + if (scoredClasses.get(results.get(0)) + / scoredClasses.get(results.get(1)) > BEST_TO_NEX_BEST_RATIO) // second + // best + // is + // much + // worse + return results.subList(0, 1); + else + return results; + + } + + + + + public static String formClassifQuery(String pageContentReader, int maxRes) { + + // We want to control which delimiters we substitute. 
For example '_' & + // \n we retain + pageContentReader = pageContentReader.replaceAll("[^A-Za-z0-9 _\\n]", + ""); + + Scanner in = new Scanner(pageContentReader); + in.useDelimiter("\\s+"); + Map<String, Integer> words = new HashMap<String, Integer>(); + + while (in.hasNext()) { + String word = in.next(); + if (!StringUtils.isAlpha(word) || word.length() < 4) + continue; + + if (!words.containsKey(word)) { + words.put(word, 1); + } else { + words.put(word, words.get(word) + 1); + } + } + in.close(); + words = ValueSortMap.sortMapByValue(words, false); + List<String> resultsAll = new ArrayList<String>(words.keySet()), results = null; + + int len = resultsAll.size(); + if (len > maxRes) + results = resultsAll.subList(len - maxRes, len - 1); // get maxRes + // elements + else + results = resultsAll; + + return results.toString().replaceAll("(\\[|\\]|,)", " ").trim(); + } + + public void close() { + try { + indexReader.close(); + } catch (IOException e) { + logger.error("Problem closing index \n" + e); + } + } + + + /* + * Main entry point for classifying sentences + */ + + public List<String> getEntityOrClassFromText(String content) { + + List<String> sentences = TextProcessor.splitToSentences(content); + List<String> classifResults; + + try { + for (String sentence : sentences) { + // If sentence is too short, there is a chance it is not form a + // main text area, + // but from somewhere else, so it is safer not to use this + // portion of text for classification + + if (sentence.length() < MIN_SENTENCE_LENGTH_TO_CATEGORIZE) + continue; + String query = formClassifQuery(sentence, MAX_TOKENS_TO_FORM); + classifResults = classifySentence(query); + if (classifResults != null && classifResults.size() > 0) { + for (String c : classifResults) { + localCats.add(c); + } + logger.debug(sentence + " => " + classifResults); + } + } + + } catch (Exception e) { + logger.error("Problem classifying sentence\n " + e); + } + + List<String> aggrResults = new ArrayList<String>(); + try { + + aggrResults = localCats.getFrequentTags(); + + logger.debug(localCats.getFrequentTags()); + } catch (Exception e) { + logger.error("Problem aggregating search results\n" + e); + } + return aggrResults; + } + +} http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/DocClassifierTrainingSetMultilingualExtender.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/DocClassifierTrainingSetMultilingualExtender.java b/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/DocClassifierTrainingSetMultilingualExtender.java new file mode 100644 index 0000000..73b0d43 --- /dev/null +++ b/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/DocClassifierTrainingSetMultilingualExtender.java @@ -0,0 +1,284 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package opennlp.tools.doc_classifier; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.net.MalformedURLException; +import java.net.URL; +import java.nio.channels.Channels; +import java.nio.channels.ReadableByteChannel; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; + +import opennlp.tools.similarity.apps.BingQueryRunner; +import opennlp.tools.similarity.apps.utils.PageFetcher; + +import org.apache.commons.io.FileUtils; +import org.apache.commons.lang.StringUtils; +import org.apache.tika.Tika; +import org.apache.tika.exception.TikaException; +import org.json.JSONObject; + +/* + * This utility gets 'training_corpus' as input and creates a new version of training_corpus with verified files. + * Verified => classified by existing training set as only belonging to its target category, no other categories, not empty. + */ +public class DocClassifierTrainingSetMultilingualExtender { + private static final String LANG_TEMPL = "l_a_n_g"; + private String wikiUrlsTemplate = "https://"+LANG_TEMPL+".wikipedia.org/wiki/"; + + public static String projectHome = new File(".").getAbsolutePath().replace("contentinspection/.", ""); + public static String resourceDir = new File(".").getAbsolutePath().replace("/.", "") + "/src/main/resources"; + DocClassifier classifier = null; + private String sourceDir = null, destinationDir = null; + //interwiki-fr"><a href="http://fr.wikipedia.org/wiki/Niveau_d%27%C3%A9nergie" title="Niveau d'énergie â French" lang="fr" + private static String[][] multilingualTokens = new String[][]{ + {"interwiki-fr\"><a href=\"", "lang=\"fr\""}, + {"interwiki-es\"><a href=\"", "lang=\"es\""}, + {"interwiki-de\"><a href=\"", "lang=\"de\""}, + }; + + private static String[] langs = new String[]{ "fr", "es", "de"}; + + protected ArrayList<File> queue = new ArrayList<File>(); + + protected Tika tika = new Tika(); + public DocClassifierTrainingSetMultilingualExtender(String resource) { + + classifier = new DocClassifier("", new JSONObject()); + + } + private int FRAGMENT_LENGTH = 500; + + + protected void addFiles(File file) { + + try { + if (!file.exists()) { + System.out.println(file + " does not exist."); + } + if (file.isDirectory()) { + for (File f : file.listFiles()) { + try { + addFiles(f); + } catch (Exception e) { + } + } + } else { + queue.add(file); + } + } catch (Exception e) { + + } + } + + public List<String> extractEntriesFromSpecial_Export(String filename){ + List<String> filteredEntries = new ArrayList<String>(); + String content=null; + try { + content = FileUtils.readFileToString(new File(filename)); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + String[] entries = StringUtils.substringsBetween(content, "[[", "]]"); + for(String e: entries){ + if (e.startsWith("Kategorie") || e.startsWith("Category") || e.startsWith("d:") || e.startsWith("User") + ||e.startsWith("Portal") ) + continue; + if (e.indexOf(':')>-1) + continue; + + if 
(e.indexOf(":")>-1) + continue; + int endofEntry = e.indexOf('|'); + if (endofEntry>-1) e = e.substring(0, endofEntry); + filteredEntries.add(e); + } + + filteredEntries = new ArrayList<String> (new HashSet<String>(filteredEntries)); + return filteredEntries; + } + + public void processDirectory(String fileName) throws IOException { + List<String[]> report = new ArrayList<String[]>(); + report.add(new String[] { "filename", "category", + "confirmed?" , + }); + + addFiles(new File(fileName)); + // FileUtils.deleteDirectory(new File(destinationDir)); + // FileUtils.forceMkdir(new File(destinationDir)); + + + for (File f : queue) { + String content = null; + try {// should be wiki page + //if (f.getName().toString().toLowerCase().indexOf(" wiki")<0 && + + // if ( f.getAbsolutePath().indexOf("wiki-new")<0) + // continue; + // should not be a page already derived by a link + if (f.getName().toString().toLowerCase().indexOf(".html_")>-1) + continue; + + System.out.println("processing "+f.getName()); + content = FileUtils.readFileToString(f, "utf-8"); + int langIndex =0; + for(String[] begEnd: multilingualTokens){ + String urlDirty = StringUtils.substringBetween(content, begEnd[0], begEnd[1]); + String url = StringUtils.substringBefore(urlDirty, "\""); + + if (url!=null){ + if (!url.startsWith("http:")) + url = "https:"+url; + + String[] parts = url.split("/"); + String multilingualName = parts[parts.length-1]; + String destFileName = f.getAbsolutePath().replace(sourceDir, destinationDir).replace(" - Wikipedia, the free encyclopedia.html", "-wiki")+"."+langs[langIndex]+"." + +"_"+multilingualName+".html"; + if (!new File(destFileName).exists()){ + saveDocFromTheWeb(url, destFileName); + System.out.println(f.getName()+ " => "+destFileName); + } + } else { + System.out.println("Unable to extract multilingual urls for'" +langs[langIndex] +"' from file "+ f.getCanonicalPath()); + } + langIndex++; + } + } catch (Exception ee) { + ee.printStackTrace(); + } + } + + + queue.clear(); + } + + private void copyURLToFile(URL url, File file) { + ReadableByteChannel rbc=null; + try { + rbc = Channels.newChannel(url.openStream()); + } catch (IOException e1) { + // TODO Auto-generated catch block + e1.printStackTrace(); + } + FileOutputStream fos=null; + try { + fos = new FileOutputStream(file.getAbsolutePath()); + } catch (FileNotFoundException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + try { + fos.getChannel().transferFrom(rbc, 0, Long.MAX_VALUE); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + + } + + public void crawlWikiOnTopic( String filename, String lang, String destinationDir){ + List<String> entries = extractEntriesFromSpecial_Export(filename); + for(String e: entries){ + String url = wikiUrlsTemplate.replace(LANG_TEMPL, lang) + e; + saveDocFromTheWeb(url, destinationDir+e.replace(' ', '_')+".html"); + } + } + + public static void saveDocFromTheWeb(String docUrl, String destinationFile) { + try { + URL url = new URL(docUrl); + InputStream is = url.openStream(); + if (!new File(destinationFile).exists()) { + new File(destinationFile).createNewFile(); + } + + OutputStream os = new FileOutputStream(destinationFile); + + + byte[] b = new byte[2048]; + int length; + + while ((length = is.read(b)) != -1) { + os.write(b, 0, length); + } + + is.close(); + os.close(); + } catch (MalformedURLException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } catch (FileNotFoundException e) { + // TODO Auto-generated catch block 
+ e.printStackTrace(); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + + + public static void main(String[] args) { + if (args.length < 2) { + System.err + .println("Verifier accepts two arguments: [0] - input 'training_corpus' folder, " + + "[1] - output 'training_corpus' folder . " + + "All paths should include category name as a part of full path string, such as '/computing/' " ); + System.exit(0); + } + + DocClassifierTrainingSetMultilingualExtender runner = new DocClassifierTrainingSetMultilingualExtender(null); + + if (args.length==2) { + runner.sourceDir = args[0]; runner.destinationDir = args[1]; + runner.sourceDir = + "/Users/borisgalitsky/Documents/svm_tk_july2015/milkyway/training_corpus_multilingual_verif"; + runner.destinationDir = + "/Users/borisgalitsky/Documents/new_corpus/milkyway/training_corpus_new_multilingual_refined"; + + try { + runner.processDirectory( runner.sourceDir); + } catch (IOException e) { + e.printStackTrace(); + } + } else { + runner.crawlWikiOnTopic("/Users/borisgalitsky/Downloads/Wikipedia-20150730124756.xml", + //Wikipedia-20150730053619.xml", + ////Wikipedia-20150730044602.xml", + //Wikipedia-20150729103933.xml", + //Wikipedia-20150729103933.xml", + // "Wikipedia-20150728193126.xml", + //Wikipedia-20150728183128.xml", + "en", + "/Users/borisgalitsky/Documents/merged_svm_tk/milkyway/training_corpus_new_multilingual/business/wiki/wiki-new/"); + } + + + } +} + +/* +/Users/borisgalitsky/Documents/workspace/deepContentInspection/src/main/resources/docs/netflix + */ \ No newline at end of file http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/DocClassifierTrainingSetVerifier.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/DocClassifierTrainingSetVerifier.java b/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/DocClassifierTrainingSetVerifier.java new file mode 100644 index 0000000..98debe6 --- /dev/null +++ b/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/DocClassifierTrainingSetVerifier.java @@ -0,0 +1,163 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package opennlp.tools.doc_classifier; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import opennlp.tools.jsmlearning.ProfileReaderWriter; + +import org.apache.commons.io.FileUtils; +import org.apache.tika.Tika; +import org.apache.tika.exception.TikaException; +import org.json.JSONObject; + +/* + * This utility gets 'training_corpus' as input and creates a new version of training_corpus with verified files. 
http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/DocClassifierTrainingSetVerifier.java
----------------------------------------------------------------------
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/DocClassifierTrainingSetVerifier.java b/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/DocClassifierTrainingSetVerifier.java
new file mode 100644
index 0000000..98debe6
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/DocClassifierTrainingSetVerifier.java
@@ -0,0 +1,163 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.doc_classifier;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import opennlp.tools.jsmlearning.ProfileReaderWriter;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.tika.Tika;
+import org.apache.tika.exception.TikaException;
+import org.json.JSONObject;
+
+/*
+ * This utility gets a 'training_corpus' folder as input and creates a new version of the training corpus
+ * containing only verified files. Verified => classified by the existing training set as belonging to its
+ * target category only (no other categories) and not empty.
+ */
+public class DocClassifierTrainingSetVerifier {
+    public static String projectHome = new File(".").getAbsolutePath();
+    public static String resourceDir = new File(".").getAbsolutePath().replace("/.", "") + "/src/main/resources";
+
+    private static final int FRAGMENT_LENGTH = 500;
+
+    DocClassifier classifier = null;
+    private String sourceDir = null, destinationDir = null;
+
+    protected ArrayList<File> queue = new ArrayList<File>();
+    protected Tika tika = new Tika();
+
+    public DocClassifierTrainingSetVerifier(String resource) {
+        // the resource argument is currently unused
+        classifier = new DocClassifier("", new JSONObject());
+    }
+
+    protected void addFiles(File file) {
+        try {
+            if (!file.exists()) {
+                System.out.println(file + " does not exist.");
+                return;
+            }
+            if (file.isDirectory()) {
+                for (File f : file.listFiles()) {
+                    try {
+                        addFiles(f);
+                    } catch (Exception e) {
+                        // skip unreadable directory entries
+                    }
+                }
+            } else {
+                queue.add(file);
+            }
+        } catch (Exception e) {
+            // skip files we cannot stat
+        }
+    }
+
+    public void processDirectory(String fileName) throws IOException {
+        List<String[]> report = new ArrayList<String[]>();
+        report.add(new String[] { "filename", "classified as", "expected category", "confirmed?", "fragment" });
+
+        addFiles(new File(fileName));
+        // FileUtils.deleteDirectory(new File(destinationDir));
+        // FileUtils.forceMkdir(new File(destinationDir));
+
+        for (File f : queue) {
+            String content = null;
+            try {
+                System.out.println("processing "+f.getName());
+
+                // if (f.getName().indexOf(".html")<0)
+                //     continue;
+
+                // re-create the classifier for each file to reset its state
+                classifier = new DocClassifier("", new JSONObject());
+
+                content = tika.parseToString(f);
+
+                // classifier.runExpressionsOnContent(content);
+                List<String> resultsClassif = classifier.getEntityOrClassFromText(content);
+                String expectedCategory = ClassifierTrainingSetIndexer.getCategoryFromFilePath(f.getAbsolutePath());
+                boolean bConfirmed = false;
+                if (resultsClassif.size()==1 && resultsClassif.get(0).equals(expectedCategory)){
+                    String destFileName = f.getAbsolutePath().replace(sourceDir, destinationDir);
+                    FileUtils.copyFile(f, new File(destFileName));
+                    bConfirmed = true;
+                } else {
+                    System.out.println("File "+ f.getAbsolutePath() + "\n classified as "+
+                            resultsClassif.toString() + " but should be " + expectedCategory);
+                }
+                String fragment = content;
+                if (content.length() > FRAGMENT_LENGTH)
+                    fragment = content.substring(0, FRAGMENT_LENGTH);
+                fragment = fragment.replaceAll("\n", " ").trim();
+                report.add(new String[] { f.getName(), resultsClassif.toString(), expectedCategory,
+                        Boolean.toString(bConfirmed), fragment });
+                // re-written each iteration so partial results survive an aborted run
+                ProfileReaderWriter.writeReport(report, "DocClassifierMultiLingRpt.csv");
+            } catch (TikaException e) {
+                System.out.println("Tika problem with file " + f.getAbsolutePath());
+            } catch (Exception ee) {
+                ee.printStackTrace();
+            }
+        }
+
+        queue.clear();
+    }
+
+    public static void main(String[] args) {
+        if (args.length < 2) {
+            System.err.println("Verifier accepts two arguments: [0] - input 'training_corpus' folder, "
+                    + "[1] - output 'training_corpus' folder. "
+                    + "All paths should include the category name as a part of the full path string, such as '/computing/'");
+            System.exit(1);
+        }
+
+        DocClassifierTrainingSetVerifier runner = new DocClassifierTrainingSetVerifier(null);
+        runner.sourceDir = args[0];
+        runner.destinationDir = args[1];
+        // hard-coded developer defaults, kept for reference; they used to override the arguments above:
+        // runner.sourceDir = "/Users/borisgalitsky/Documents/merged_svm_tk/milkyway/training_corpus_new_multilingual";
+        //   (alternative: "/Users/borisgalitsky/Documents/svm_tk_july2015/milkyway/eval_corpus_multiling")
+        // runner.destinationDir = "/Users/borisgalitsky/Documents/svm_tk_july2015/milkyway/training_corpus_multilingual_verif";
+        //   (alternative: "/Users/borisgalitsky/Documents/svm_tk_july2015/milkyway/eval_corpus_multiling_bogus")
+
+        try {
+            runner.processDirectory(runner.sourceDir);
+        } catch (IOException e) {
+            e.printStackTrace();
+        }
+    }
+}
+
+/*
+/Users/borisgalitsky/Documents/workspace/deepContentInspection/src/main/resources/docs/netflix
+ */
\ No newline at end of file
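For orientation, a minimal sketch of the classifier call the verifier is built around; the constructor and getEntityOrClassFromText are used exactly as in the loop above, while the sample sentence is invented and the printed categories depend entirely on the bundled training set.

import java.util.List;

import org.json.JSONObject;

import opennlp.tools.doc_classifier.DocClassifier;

public class ClassifierCallSketch {
    public static void main(String[] args) {
        // same construction as in the verifier loop
        DocClassifier classifier = new DocClassifier("", new JSONObject());
        List<String> categories = classifier.getEntityOrClassFromText(
                "Cloud computing is the on-demand delivery of IT resources over the Internet.");
        // a file counts as confirmed when this list is a singleton equal to its folder-derived category
        System.out.println(categories);
    }
}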
" + + "All paths should include category name as a part of full path string, such as '/computing/' " ); + System.exit(0); + } + + DocClassifierTrainingSetVerifier runner = new DocClassifierTrainingSetVerifier(null); + runner.sourceDir = args[0]; runner.destinationDir = args[1]; + runner.sourceDir = + // "/Users/borisgalitsky/Documents/svm_tk_july2015/milkyway/eval_corpus_multiling"; + "/Users/borisgalitsky/Documents/merged_svm_tk/milkyway/training_corpus_new_multilingual"; + runner.destinationDir = + "/Users/borisgalitsky/Documents/svm_tk_july2015/milkyway/training_corpus_multilingual_verif"; + // "/Users/borisgalitsky/Documents/svm_tk_july2015/milkyway/eval_corpus_multiling_bogus"; + + try { + runner.processDirectory( runner.sourceDir); + } catch (IOException e) { + e.printStackTrace(); + } + + + } +} + +/* +/Users/borisgalitsky/Documents/workspace/deepContentInspection/src/main/resources/docs/netflix + */ \ No newline at end of file http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/enron_email_recognizer/EmailNormalizer.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/enron_email_recognizer/EmailNormalizer.java b/opennlp-similarity/src/main/java/opennlp/tools/enron_email_recognizer/EmailNormalizer.java new file mode 100644 index 0000000..01aaa12 --- /dev/null +++ b/opennlp-similarity/src/main/java/opennlp/tools/enron_email_recognizer/EmailNormalizer.java @@ -0,0 +1,100 @@ +package opennlp.tools.enron_email_recognizer; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; + +import org.apache.commons.io.FileUtils; + +public class EmailNormalizer { + protected ArrayList<File> queue = new ArrayList<File>(); + + protected void addFilesPos(File file) { + + if (!file.exists()) { + System.out.println(file + " does not exist."); + } + if (file.isDirectory()) { + for (File f : file.listFiles()) { + addFilesPos(f); + System.out.println(f.getName()); + } + } else { + queue.add(file); + } + } + + public static final String[] headers = new String[] { + "Message-ID:", + "Date:", + "From:", + "To:", + "Subject:", + "Mime-Version:", + "Content-T", + "X-From:", + "X-To:", + "X-cc:", + "X-bcc:", + "X-Folder:", + "X-Origin:", + "X-FileName", + "cc:", + "----", + }; + + public static final String[] prohibitedStrings = new String[] { + "@", "<", ">" + }; + + private String OrigFolder = "maildir_ENRON_EMAILS", NewFolder = "data"; + + + + public void normalizeAndWriteIntoANewFile(File f){ + String content=""; + try { + content = FileUtils.readFileToString(f); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + String[] lines = content.split("\n"); + StringBuffer buf = new StringBuffer(); + for(String l: lines){ + boolean bAccept = true; + for(String h: headers){ + if (l.startsWith(h)){ + bAccept = false; + } + } + for(String h: prohibitedStrings){ + if (l.indexOf(h)>0){ + bAccept = false; + } + } + if (bAccept) + buf.append(l+"\n"); + } + String directoryNew = f.getAbsolutePath().replace(OrigFolder, NewFolder); + try { + String fullFileNameNew = directoryNew +"txt"; + FileUtils.writeStringToFile(new File(fullFileNameNew), buf.toString()); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + + public void normalizeDirectory(File f){ + addFilesPos(f); + for(File e: queue){ + normalizeAndWriteIntoANewFile(e); + } + } + + public static void 
http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/enron_email_recognizer/EmailTrainingSetFormer.java
----------------------------------------------------------------------
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/enron_email_recognizer/EmailTrainingSetFormer.java b/opennlp-similarity/src/main/java/opennlp/tools/enron_email_recognizer/EmailTrainingSetFormer.java
new file mode 100644
index 0000000..9cb713f
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/enron_email_recognizer/EmailTrainingSetFormer.java
@@ -0,0 +1,67 @@
+package opennlp.tools.enron_email_recognizer;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.List;
+
+import org.apache.commons.io.FileUtils;
+
+public class EmailTrainingSetFormer {
+    static String dataDir = "/Users/bgalitsky/Downloads/",
+            //enron_with_categories/",
+            fileListFile = "cats4_11-17.txt",
+            destinationDir = "/Users/bgalitsky/Documents/ENRON/data11_17/";
+
+    // each line of the file list looks like: enron_with_categories/5/70665.cats:4,10,1
+    public static void createPosTrainingSet(){
+        try {
+            List<String> lines = FileUtils.readLines(new File(dataDir+fileListFile));
+            for(String l: lines){
+                int endOfFname = l.indexOf('.'),
+                    startOfFname = l.lastIndexOf('/');
+                String filenameOld = dataDir + l.substring(0, endOfFname)+".txt";
+
+                String content = normalize(new File(filenameOld));
+
+                String filenameNew = destinationDir + l.substring(startOfFname+1, endOfFname)+".txt";
+                // FileUtils.copyFile(new File(filenameOld), new File(filenameNew));
+                FileUtils.writeStringToFile(new File(filenameNew), content);
+            }
+        } catch (Exception e) {
+            e.printStackTrace();
+        }
+    }
+
+    private String origFolder = "maildir_ENRON_EMAILS", newFolder = "data11_17";
+
+    // same line filter as EmailNormalizer, but returns the normalized text instead of writing it
+    public static String normalize(File f){
+        String content = "";
+        try {
+            content = FileUtils.readFileToString(f);
+        } catch (IOException e) {
+            e.printStackTrace();
+        }
+        String[] lines = content.split("\n");
+        StringBuilder buf = new StringBuilder();
+        for(String l: lines){
+            boolean bAccept = true;
+            for(String h: EmailNormalizer.headers){
+                if (l.startsWith(h)){
+                    bAccept = false;
+                }
+            }
+            for(String h: EmailNormalizer.prohibitedStrings){
+                // note: a match at position 0 is not caught by this check
+                if (l.indexOf(h)>0){
+                    bAccept = false;
+                }
+            }
+            if (bAccept)
+                buf.append(l).append("\n");
+        }
+        return buf.toString();
+    }
+
+    public static void main(String[] args){
+        EmailTrainingSetFormer.createPosTrainingSet();
+    }
+}
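Finally, a small self-contained check of the filename derivation in createPosTrainingSet, using the listing-line format quoted in the comment above; the /data prefixes stand in for dataDir and destinationDir.

public class CatsLineSketch {
    public static void main(String[] args) {
        String l = "enron_with_categories/5/70665.cats:4,10,1";
        int endOfFname = l.indexOf('.');
        int startOfFname = l.lastIndexOf('/');
        // the .cats suffix and category ids are cut off, leaving the path of the message text
        String filenameOld = "/data/" + l.substring(0, endOfFname) + ".txt";
        // only the bare message number is kept for the destination file name
        String filenameNew = "/data/out/" + l.substring(startOfFname + 1, endOfFname) + ".txt";
        System.out.println(filenameOld); // /data/enron_with_categories/5/70665.txt
        System.out.println(filenameNew); // /data/out/70665.txt
    }
}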
