Author: srowen
Date: Sun Oct  3 10:21:43 2010
New Revision: 1003945

URL: http://svn.apache.org/viewvc?rev=1003945&view=rev
Log:
MAHOUT-271

Modified:
    mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaDatasetCreatorMapper.java

Modified: mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaDatasetCreatorMapper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaDatasetCreatorMapper.java?rev=1003945&r1=1003944&r2=1003945&view=diff
==============================================================================
--- mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaDatasetCreatorMapper.java (original)
+++ mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaDatasetCreatorMapper.java Sun Oct  3 10:21:43 2010
@@ -19,7 +19,9 @@ package org.apache.mahout.classifier.bay
 
 import java.io.IOException;
 import java.io.StringReader;
+import java.util.ArrayList;
 import java.util.HashSet;
+import java.util.List;
 import java.util.Locale;
 import java.util.Set;
 import java.util.regex.Pattern;
@@ -48,24 +50,20 @@ public class WikipediaDatasetCreatorMapp
   private static final Logger log = 
LoggerFactory.getLogger(WikipediaDatasetCreatorMapper.class);
 
   private static final Pattern SPACE_NON_ALPHA_PATTERN = 
Pattern.compile("[\\s\\W]");
-
   private static final Pattern OPEN_TEXT_TAG_PATTERN = Pattern.compile("<text 
xml:space=\"preserve\">");
-
   private static final Pattern CLOSE_TEXT_TAG_PATTERN = 
Pattern.compile("</text>");
 
-  private Set<String> inputCategories;
-
+  private List<String> inputCategories;
+  private List<Pattern> inputCategoryPatterns;
   private boolean exactMatchOnly;
-
   private Analyzer analyzer;
 
   @Override
   protected void map(LongWritable key, Text value, Context context) throws 
IOException, InterruptedException {
-    StringBuilder contents = new StringBuilder();
     String document = value.toString();
     String catMatch = findMatchingCategory(document);
-
     if (!"Unknown".equals(catMatch)) {
+      StringBuilder contents = new StringBuilder(1000);
       document = 
StringEscapeUtils.unescapeHtml(WikipediaDatasetCreatorMapper.CLOSE_TEXT_TAG_PATTERN.matcher(
           
WikipediaDatasetCreatorMapper.OPEN_TEXT_TAG_PATTERN.matcher(document).replaceFirst("")).replaceAll(""));
       TokenStream stream = analyzer.tokenStream(catMatch, new 
StringReader(document));
@@ -82,33 +80,39 @@ public class WikipediaDatasetCreatorMapp
   @Override
   protected void setup(Context context) throws IOException, 
InterruptedException {
     super.setup(context);
+
     Configuration conf = context.getConfiguration();
-    try {
-      if (inputCategories == null) {
-        Set<String> newCategories = new HashSet<String>();
 
-        DefaultStringifier<Set<String>> setStringifier = new 
DefaultStringifier<Set<String>>(conf, GenericsUtil
-            .getClass(newCategories));
+    if (inputCategories == null) {
+      Set<String> newCategories = new HashSet<String>();
+      DefaultStringifier<Set<String>> setStringifier =
+          new DefaultStringifier<Set<String>>(conf, 
GenericsUtil.getClass(newCategories));
+      String categoriesStr = conf.get("wikipedia.categories", 
setStringifier.toString(newCategories));
+      Set<String> inputCategoriesSet = 
setStringifier.fromString(categoriesStr);
+      inputCategories = new ArrayList<String>(inputCategoriesSet);
+      inputCategoryPatterns = new ArrayList<Pattern>(inputCategories.size());
+      for (String inputCategory : inputCategories) {
+        inputCategoryPatterns.add(Pattern.compile(".*\\b" + inputCategory + 
"\\b.*"));
+      }
 
-        String categoriesStr = conf.get("wikipedia.categories", 
setStringifier.toString(newCategories));
-        inputCategories = setStringifier.fromString(categoriesStr);
+    }
 
-      }
-      exactMatchOnly = conf.getBoolean("exact.match.only", false);
-      if (analyzer == null) {
+    exactMatchOnly = conf.getBoolean("exact.match.only", false);
+
+    if (analyzer == null) {
+      try {
         String analyzerStr = conf.get("analyzer.class", 
WikipediaAnalyzer.class.getName());
         Class<? extends Analyzer> analyzerClass = 
Class.forName(analyzerStr).asSubclass(Analyzer.class);
         analyzer = analyzerClass.newInstance();
+      } catch (ClassNotFoundException e) {
+        throw new IllegalStateException(e);
+      } catch (IllegalAccessException e) {
+        throw new IllegalStateException(e);
+      } catch (InstantiationException e) {
+        throw new IllegalStateException(e);
       }
-    } catch (IOException ex) {
-      throw new IllegalStateException(ex);
-    } catch (ClassNotFoundException e) {
-      throw new IllegalStateException(e);
-    } catch (IllegalAccessException e) {
-      throw new IllegalStateException(e);
-    } catch (InstantiationException e) {
-      throw new IllegalStateException(e);
     }
+
     log.info("Configure: Input Categories size: {} Exact Match: {} Analyzer: 
{}",
              new Object[] {inputCategories.size(), exactMatchOnly, 
analyzer.getClass().getName()});
   }
@@ -127,8 +131,10 @@ public class WikipediaDatasetCreatorMapp
       if (exactMatchOnly && inputCategories.contains(category)) {
         return category;
       } else if (!exactMatchOnly) {
-        for (String inputCategory : inputCategories) {
-          if (category.contains(inputCategory)) { // we have an inexact match
+        for (int i = 0; i < inputCategories.size(); i++) {
+          String inputCategory = inputCategories.get(i);
+          Pattern inputCategoryPattern = inputCategoryPatterns.get(i);
+          if (inputCategoryPattern.matcher(category).matches()) { // inexact 
match with word boundary. 
             return inputCategory;
           }
         }


Reply via email to