Repository: mahout Updated Branches: refs/heads/master 03a5bb61e -> 6dd0c92dd
MAHOUT-1608: Add option in WikipediaToSequenceFile to remove category labels from documents. Remove redundant call to findMatchingCategory. closes apache/mahout#45 Project: http://git-wip-us.apache.org/repos/asf/mahout/repo Commit: http://git-wip-us.apache.org/repos/asf/mahout/commit/6dd0c92d Tree: http://git-wip-us.apache.org/repos/asf/mahout/tree/6dd0c92d Diff: http://git-wip-us.apache.org/repos/asf/mahout/diff/6dd0c92d Branch: refs/heads/master Commit: 6dd0c92ddb9dad2627e9ca5e28118865d6fba159 Parents: 03a5bb6 Author: Andrew Palumbo <[email protected]> Authored: Thu Aug 28 20:06:13 2014 -0400 Committer: Andrew Palumbo <[email protected]> Committed: Thu Aug 28 20:12:22 2014 -0400 ---------------------------------------------------------------------- CHANGELOG | 2 + .../mahout/text/WikipediaToSequenceFile.java | 23 ++++++++-- .../mahout/text/wikipedia/WikipediaMapper.java | 44 +++++++++++++++++--- 3 files changed, 59 insertions(+), 10 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/mahout/blob/6dd0c92d/CHANGELOG ---------------------------------------------------------------------- diff --git a/CHANGELOG b/CHANGELOG index dfccd95..310b5f0 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -2,6 +2,8 @@ Mahout Change Log Release 1.0 - unreleased + MAHOUT-1608: Add option in WikipediaToSequenceFile to remove category labels from documents (apalumbo) + MAHOUT-1604: Spark version of rowsimilarity driver and associated additions to SimilarityAnalysis.scala (pferrel) MAHOUT-1500: H2O Integration (Anand Avati via apalumbo) http://git-wip-us.apache.org/repos/asf/mahout/blob/6dd0c92d/integration/src/main/java/org/apache/mahout/text/WikipediaToSequenceFile.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/text/WikipediaToSequenceFile.java b/integration/src/main/java/org/apache/mahout/text/WikipediaToSequenceFile.java index 7a1e40e..19f353c 100644 --- a/integration/src/main/java/org/apache/mahout/text/WikipediaToSequenceFile.java +++ b/integration/src/main/java/org/apache/mahout/text/WikipediaToSequenceFile.java @@ -88,12 +88,16 @@ public final class WikipediaToSequenceFile { Option allOpt = obuilder.withLongName("all") .withDescription("If set, Select all files. Default is false").withShortName("all").create(); - + + Option removeLabelOpt = obuilder.withLongName("removeLabels") + .withDescription("If set, remove [[Category:labels]] from document text after extracting label." + + "Default is false").withShortName("rl").create(); + Option helpOpt = DefaultOptionCreator.helpOption(); Group group = gbuilder.withName("Options").withOption(categoriesOpt).withOption(dirInputPathOpt) .withOption(dirOutputPathOpt).withOption(exactMatchOpt).withOption(allOpt).withOption(helpOpt) - .create(); + .withOption(removeLabelOpt).create(); Parser parser = new Parser(); parser.setGroup(group); @@ -117,7 +121,13 @@ public final class WikipediaToSequenceFile { if (cmdLine.hasOption(allOpt)) { all = true; } - runJob(inputPath, outputPath, catFile, cmdLine.hasOption(exactMatchOpt), all); + + boolean removeLabels = false; + if (cmdLine.hasOption(removeLabelOpt)) { + removeLabels = true; + } + + runJob(inputPath, outputPath, catFile, cmdLine.hasOption(exactMatchOpt), all, removeLabels); } catch (OptionException e) { log.error("Exception", e); CommandLineUtil.printHelp(group); @@ -144,17 +154,22 @@ public final class WikipediaToSequenceFile { * category string * @param all * if true select all categories + * @param removeLabels + * if true remove Category labels from document text after extracting. + * */ public static void runJob(String input, String output, String catFile, boolean exactMatchOnly, - boolean all) throws IOException, InterruptedException, ClassNotFoundException { + boolean all, + boolean removeLabels) throws IOException, InterruptedException, ClassNotFoundException { Configuration conf = new Configuration(); conf.set("xmlinput.start", "<page>"); conf.set("xmlinput.end", "</page>"); conf.setBoolean("exact.match.only", exactMatchOnly); conf.setBoolean("all.files", all); + conf.setBoolean("remove.labels", removeLabels); conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization," + "org.apache.hadoop.io.serializer.WritableSerialization"); http://git-wip-us.apache.org/repos/asf/mahout/blob/6dd0c92d/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaMapper.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaMapper.java b/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaMapper.java index 94cb12e..d880760 100644 --- a/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaMapper.java +++ b/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaMapper.java @@ -59,6 +59,8 @@ public class WikipediaMapper extends Mapper<LongWritable, Text, Text, Text> { private boolean all; + private boolean removeLabels; + @Override protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { @@ -76,16 +78,23 @@ public class WikipediaMapper extends Mapper<LongWritable, Text, Text, Text> { return; } + String catMatch = findMatchingCategory(document); if (!all) { - String catMatch = findMatchingCategory(document); if ("Unknown".equals(catMatch)) { return; } } - String catMatch = findMatchingCategory(document); + document = StringEscapeUtils.unescapeHtml4(document); - // write out in Bayes input style: key: /Category/document_name + if (removeLabels) { + document = removeCategoriesFromText(document); + // Reject documents with malformed tags + if (document == null) { + return; + } + } + // write out in Bayes input style: key: /Category/document_name String category = "/" + catMatch.toLowerCase(Locale.ENGLISH) + "/" + SPACE_NON_ALPHA_PATTERN.matcher(title).replaceAll("_"); @@ -104,9 +113,10 @@ public class WikipediaMapper extends Mapper<LongWritable, Text, Text, Text> { String categoriesStr = conf.get("wikipedia.categories"); inputCategories = setStringifier.fromString(categoriesStr); exactMatchOnly = conf.getBoolean("exact.match.only", false); - all = conf.getBoolean("all.files", true); - log.info("Configure: Input Categories size: {} All: {} Exact Match: {}", - inputCategories.size(), all, exactMatchOnly); + all = conf.getBoolean("all.files", false); + removeLabels = conf.getBoolean("remove.labels",false); + log.info("Configure: Input Categories size: {} All: {} Exact Match: {} Remove Labels from Text: {}", + inputCategories.size(), all, exactMatchOnly, removeLabels); } private static String getDocument(String xml) { @@ -144,4 +154,26 @@ public class WikipediaMapper extends Mapper<LongWritable, Text, Text, Text> { } return "Unknown"; } + + private String removeCategoriesFromText(String document) { + int startIndex = 0; + int categoryIndex; + try { + while ((categoryIndex = document.indexOf("[[Category:", startIndex)) != -1) { + int endIndex = document.indexOf("]]", categoryIndex); + if (endIndex >= document.length() || endIndex < 0) { + break; + } + document = document.replace(document.substring(categoryIndex, endIndex + 2), ""); + if (categoryIndex < document.length()) { + startIndex = categoryIndex; + } else { + break; + } + } + } catch(StringIndexOutOfBoundsException e) { + return null; + } + return document; + } }
