http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/text/WikipediaToSequenceFile.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/text/WikipediaToSequenceFile.java b/integration/src/main/java/org/apache/mahout/text/WikipediaToSequenceFile.java deleted file mode 100644 index bed4640..0000000 --- a/integration/src/main/java/org/apache/mahout/text/WikipediaToSequenceFile.java +++ /dev/null @@ -1,210 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.text; - -import java.io.File; -import java.io.IOException; -import java.util.HashSet; -import java.util.Locale; -import java.util.Set; - -import org.apache.commons.cli2.CommandLine; -import org.apache.commons.cli2.Group; -import org.apache.commons.cli2.Option; -import org.apache.commons.cli2.OptionException; -import org.apache.commons.cli2.builder.ArgumentBuilder; -import org.apache.commons.cli2.builder.DefaultOptionBuilder; -import org.apache.commons.cli2.builder.GroupBuilder; -import org.apache.commons.cli2.commandline.Parser; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.DefaultStringifier; -import org.apache.hadoop.io.Stringifier; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.mapreduce.Job; -import org.apache.hadoop.mapreduce.Reducer; -import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; -import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; -import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; -import org.apache.hadoop.util.GenericsUtil; -import org.apache.mahout.common.CommandLineUtil; -import org.apache.mahout.common.HadoopUtil; -import org.apache.mahout.common.commandline.DefaultOptionCreator; -import org.apache.mahout.common.iterator.FileLineIterable; -import org.apache.mahout.text.wikipedia.WikipediaMapper; -import org.apache.mahout.text.wikipedia.XmlInputFormat; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * Create and run the Wikipedia Dataset Creator. 
- */ -public final class WikipediaToSequenceFile { - - private static final Logger log = LoggerFactory.getLogger(WikipediaToSequenceFile.class); - - private WikipediaToSequenceFile() { } - - /** - * Takes in two arguments: - * <ol> - * <li>The input {@link org.apache.hadoop.fs.Path} where the input documents live</li> - * <li>The output {@link org.apache.hadoop.fs.Path} where to write the classifier as a - * {@link org.apache.hadoop.io.SequenceFile}</li> - * </ol> - */ - public static void main(String[] args) throws IOException { - DefaultOptionBuilder obuilder = new DefaultOptionBuilder(); - ArgumentBuilder abuilder = new ArgumentBuilder(); - GroupBuilder gbuilder = new GroupBuilder(); - - Option dirInputPathOpt = DefaultOptionCreator.inputOption().create(); - - Option dirOutputPathOpt = DefaultOptionCreator.outputOption().create(); - - Option categoriesOpt = obuilder.withLongName("categories").withArgument( - abuilder.withName("categories").withMinimum(1).withMaximum(1).create()).withDescription( - "Location of the categories file. One entry per line. " - + "Will be used to make a string match in Wikipedia Category field").withShortName("c").create(); - - Option exactMatchOpt = obuilder.withLongName("exactMatch").withDescription( - "If set, then the category name must exactly match the " - + "entry in the categories file. Default is false").withShortName("e").create(); - - Option allOpt = obuilder.withLongName("all") - .withDescription("If set, Select all files. Default is false").withShortName("all").create(); - - Option removeLabelOpt = obuilder.withLongName("removeLabels") - .withDescription("If set, remove [[Category:labels]] from document text after extracting label." - + "Default is false").withShortName("rl").create(); - - Option helpOpt = DefaultOptionCreator.helpOption(); - - Group group = gbuilder.withName("Options").withOption(categoriesOpt).withOption(dirInputPathOpt) - .withOption(dirOutputPathOpt).withOption(exactMatchOpt).withOption(allOpt).withOption(helpOpt) - .withOption(removeLabelOpt).create(); - - Parser parser = new Parser(); - parser.setGroup(group); - parser.setHelpOption(helpOpt); - try { - CommandLine cmdLine = parser.parse(args); - if (cmdLine.hasOption(helpOpt)) { - CommandLineUtil.printHelp(group); - return; - } - - String inputPath = (String) cmdLine.getValue(dirInputPathOpt); - String outputPath = (String) cmdLine.getValue(dirOutputPathOpt); - - String catFile = ""; - if (cmdLine.hasOption(categoriesOpt)) { - catFile = (String) cmdLine.getValue(categoriesOpt); - } - - boolean all = false; - if (cmdLine.hasOption(allOpt)) { - all = true; - } - - boolean removeLabels = false; - if (cmdLine.hasOption(removeLabelOpt)) { - removeLabels = true; - } - - runJob(inputPath, outputPath, catFile, cmdLine.hasOption(exactMatchOpt), all, removeLabels); - } catch (OptionException | InterruptedException | ClassNotFoundException e) { - log.error("Exception", e); - CommandLineUtil.printHelp(group); - } - } - - /** - * Run the job - * - * @param input - * the input pathname String - * @param output - * the output pathname String - * @param catFile - * the file containing the Wikipedia categories - * @param exactMatchOnly - * if true, then the Wikipedia category must match exactly instead of simply containing the - * category string - * @param all - * if true select all categories - * @param removeLabels - * if true remove Category labels from document text after extracting. 
- * - */ - public static void runJob(String input, - String output, - String catFile, - boolean exactMatchOnly, - boolean all, - boolean removeLabels) throws IOException, InterruptedException, ClassNotFoundException { - Configuration conf = new Configuration(); - conf.set("xmlinput.start", "<page>"); - conf.set("xmlinput.end", "</page>"); - conf.setBoolean("exact.match.only", exactMatchOnly); - conf.setBoolean("all.files", all); - conf.setBoolean("remove.labels", removeLabels); - conf.set("io.serializations", - "org.apache.hadoop.io.serializer.JavaSerialization," - + "org.apache.hadoop.io.serializer.WritableSerialization"); - - Set<String> categories = new HashSet<>(); - if (!catFile.isEmpty()) { - for (String line : new FileLineIterable(new File(catFile))) { - categories.add(line.trim().toLowerCase(Locale.ENGLISH)); - } - } - - Stringifier<Set<String>> setStringifier = - new DefaultStringifier<>(conf, GenericsUtil.getClass(categories)); - - String categoriesStr = setStringifier.toString(categories); - conf.set("wikipedia.categories", categoriesStr); - - Job job = new Job(conf); - log.info("Input: {} Out: {} Categories: {} All Files: {}", input, output, catFile, all); - job.setOutputKeyClass(Text.class); - job.setOutputValueClass(Text.class); - FileInputFormat.setInputPaths(job, new Path(input)); - Path outPath = new Path(output); - FileOutputFormat.setOutputPath(job, outPath); - job.setMapperClass(WikipediaMapper.class); - job.setInputFormatClass(XmlInputFormat.class); - job.setReducerClass(Reducer.class); - job.setOutputFormatClass(SequenceFileOutputFormat.class); - job.setJarByClass(WikipediaToSequenceFile.class); - - /* - * conf.set("mapred.compress.map.output", "true"); conf.set("mapred.map.output.compression.type", - * "BLOCK"); conf.set("mapred.output.compress", "true"); conf.set("mapred.output.compression.type", - * "BLOCK"); conf.set("mapred.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec"); - */ - HadoopUtil.delete(conf, outPath); - - boolean succeeded = job.waitForCompletion(true); - if (!succeeded) { - throw new IllegalStateException("Job failed!"); - } - - } -}
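For reference, a minimal sketch of how the runJob entry point removed above could have been driven programmatically rather than through main(); it only mirrors the signature shown in the deleted file, and the input/output paths are hypothetical.

import java.io.IOException;

import org.apache.mahout.text.WikipediaToSequenceFile;

// Driver sketch around the deleted runJob(...) signature above.
// Paths are hypothetical; runJob deletes any existing output directory before running
// and splits the dump on the <page>/</page> tags it configures for XmlInputFormat.
public final class WikipediaIngestSketch {
  public static void main(String[] args)
      throws IOException, InterruptedException, ClassNotFoundException {
    WikipediaToSequenceFile.runJob(
        "/tmp/wikipedia/chunks",    // hypothetical input dir of <page>-delimited XML chunks
        "/tmp/wikipedia/seqfiles",  // hypothetical SequenceFile output dir
        "",                         // empty categories file: no category filtering
        false,                      // exactMatchOnly
        true,                       // all: keep every page regardless of category
        true);                      // removeLabels: strip [[Category:...]] markup from the text
  }
}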
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaAnalyzer.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaAnalyzer.java b/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaAnalyzer.java deleted file mode 100644 index d50323d..0000000 --- a/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaAnalyzer.java +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.text.wikipedia; - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.Tokenizer; -import org.apache.lucene.analysis.core.LowerCaseFilter; -import org.apache.lucene.analysis.core.StopAnalyzer; -import org.apache.lucene.analysis.core.StopFilter; -import org.apache.lucene.analysis.standard.StandardFilter; -import org.apache.lucene.analysis.util.CharArraySet; -import org.apache.lucene.analysis.util.StopwordAnalyzerBase; -import org.apache.lucene.analysis.wikipedia.WikipediaTokenizer; - - -public class WikipediaAnalyzer extends StopwordAnalyzerBase { - - public WikipediaAnalyzer() { - super(StopAnalyzer.ENGLISH_STOP_WORDS_SET); - } - - public WikipediaAnalyzer(CharArraySet stopSet) { - super(stopSet); - } - - @Override - protected TokenStreamComponents createComponents(String fieldName) { - Tokenizer tokenizer = new WikipediaTokenizer(); - TokenStream result = new StandardFilter(tokenizer); - result = new LowerCaseFilter(result); - result = new StopFilter(result, getStopwordSet()); - return new TokenStreamComponents(tokenizer, result); - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaDatasetCreatorDriver.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaDatasetCreatorDriver.java b/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaDatasetCreatorDriver.java deleted file mode 100644 index 8214407..0000000 --- a/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaDatasetCreatorDriver.java +++ /dev/null @@ -1,190 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.text.wikipedia; - -import java.io.File; -import java.io.IOException; -import java.util.HashSet; -import java.util.Locale; -import java.util.Set; - -import org.apache.commons.cli2.CommandLine; -import org.apache.commons.cli2.Group; -import org.apache.commons.cli2.Option; -import org.apache.commons.cli2.OptionException; -import org.apache.commons.cli2.builder.ArgumentBuilder; -import org.apache.commons.cli2.builder.DefaultOptionBuilder; -import org.apache.commons.cli2.builder.GroupBuilder; -import org.apache.commons.cli2.commandline.Parser; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.DefaultStringifier; -import org.apache.hadoop.io.Stringifier; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.mapreduce.Job; -import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; -import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; -import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; -import org.apache.hadoop.util.GenericsUtil; -import org.apache.lucene.analysis.Analyzer; -import org.apache.mahout.common.ClassUtils; -import org.apache.mahout.common.CommandLineUtil; -import org.apache.mahout.common.HadoopUtil; -import org.apache.mahout.common.commandline.DefaultOptionCreator; -import org.apache.mahout.common.iterator.FileLineIterable; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * Create and run the Wikipedia Dataset Creator. - */ -public final class WikipediaDatasetCreatorDriver { - private static final Logger log = LoggerFactory.getLogger(WikipediaDatasetCreatorDriver.class); - - private WikipediaDatasetCreatorDriver() { } - - /** - * Takes in two arguments: - * <ol> - * <li>The input {@link org.apache.hadoop.fs.Path} where the input documents live</li> - * <li>The output {@link org.apache.hadoop.fs.Path} where to write the classifier as a - * {@link org.apache.hadoop.io.SequenceFile}</li> - * </ol> - */ - public static void main(String[] args) throws IOException, InterruptedException { - DefaultOptionBuilder obuilder = new DefaultOptionBuilder(); - ArgumentBuilder abuilder = new ArgumentBuilder(); - GroupBuilder gbuilder = new GroupBuilder(); - - Option dirInputPathOpt = DefaultOptionCreator.inputOption().create(); - - Option dirOutputPathOpt = DefaultOptionCreator.outputOption().create(); - - Option categoriesOpt = obuilder.withLongName("categories").withRequired(true).withArgument( - abuilder.withName("categories").withMinimum(1).withMaximum(1).create()).withDescription( - "Location of the categories file. One entry per line. " - + "Will be used to make a string match in Wikipedia Category field").withShortName("c").create(); - - Option exactMatchOpt = obuilder.withLongName("exactMatch").withDescription( - "If set, then the category name must exactly match the " - + "entry in the categories file. 
Default is false").withShortName("e").create(); - Option analyzerOpt = obuilder.withLongName("analyzer").withRequired(false).withArgument( - abuilder.withName("analyzer").withMinimum(1).withMaximum(1).create()).withDescription( - "The analyzer to use, must have a no argument constructor").withShortName("a").create(); - Option helpOpt = DefaultOptionCreator.helpOption(); - - Group group = gbuilder.withName("Options").withOption(categoriesOpt).withOption(dirInputPathOpt) - .withOption(dirOutputPathOpt).withOption(exactMatchOpt).withOption(analyzerOpt).withOption(helpOpt) - .create(); - - Parser parser = new Parser(); - parser.setGroup(group); - try { - CommandLine cmdLine = parser.parse(args); - if (cmdLine.hasOption(helpOpt)) { - CommandLineUtil.printHelp(group); - return; - } - - String inputPath = (String) cmdLine.getValue(dirInputPathOpt); - String outputPath = (String) cmdLine.getValue(dirOutputPathOpt); - String catFile = (String) cmdLine.getValue(categoriesOpt); - Class<? extends Analyzer> analyzerClass = WikipediaAnalyzer.class; - if (cmdLine.hasOption(analyzerOpt)) { - String className = cmdLine.getValue(analyzerOpt).toString(); - analyzerClass = Class.forName(className).asSubclass(Analyzer.class); - // try instantiating it, b/c there isn't any point in setting it if - // you can't instantiate it - ClassUtils.instantiateAs(analyzerClass, Analyzer.class); - } - runJob(inputPath, outputPath, catFile, cmdLine.hasOption(exactMatchOpt), - analyzerClass); - } catch (OptionException e) { - log.error("Exception", e); - CommandLineUtil.printHelp(group); - } catch (ClassNotFoundException e) { - log.error("Exception", e); - CommandLineUtil.printHelp(group); - } - } - - /** - * Run the job - * - * @param input - * the input pathname String - * @param output - * the output pathname String - * @param catFile - * the file containing the Wikipedia categories - * @param exactMatchOnly - * if true, then the Wikipedia category must match exactly instead of simply containing the - * category string - */ - public static void runJob(String input, - String output, - String catFile, - boolean exactMatchOnly, - Class<? extends Analyzer> analyzerClass) - throws IOException, InterruptedException, ClassNotFoundException { - Configuration conf = new Configuration(); - conf.set("key.value.separator.in.input.line", " "); - conf.set("xmlinput.start", "<page>"); - conf.set("xmlinput.end", "</page>"); - conf.setBoolean("exact.match.only", exactMatchOnly); - conf.set("analyzer.class", analyzerClass.getName()); - conf.set("io.serializations", - "org.apache.hadoop.io.serializer.JavaSerialization," - + "org.apache.hadoop.io.serializer.WritableSerialization"); - // Dont ever forget this. 
People should keep track of how hadoop conf - // parameters can make or break a piece of code - - Set<String> categories = new HashSet<>(); - for (String line : new FileLineIterable(new File(catFile))) { - categories.add(line.trim().toLowerCase(Locale.ENGLISH)); - } - - Stringifier<Set<String>> setStringifier = - new DefaultStringifier<>(conf, GenericsUtil.getClass(categories)); - - String categoriesStr = setStringifier.toString(categories); - - conf.set("wikipedia.categories", categoriesStr); - - Job job = new Job(conf); - log.info("Input: {} Out: {} Categories: {}", input, output, catFile); - job.setJarByClass(WikipediaDatasetCreatorDriver.class); - job.setOutputKeyClass(Text.class); - job.setOutputValueClass(Text.class); - job.setMapperClass(WikipediaDatasetCreatorMapper.class); - //TODO: job.setNumMapTasks(100); - job.setInputFormatClass(XmlInputFormat.class); - job.setReducerClass(WikipediaDatasetCreatorReducer.class); - job.setOutputFormatClass(TextOutputFormat.class); - - FileInputFormat.setInputPaths(job, new Path(input)); - Path outPath = new Path(output); - FileOutputFormat.setOutputPath(job, outPath); - HadoopUtil.delete(conf, outPath); - - boolean succeeded = job.waitForCompletion(true); - if (!succeeded) { - throw new IllegalStateException("Job failed!"); - } - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaDatasetCreatorMapper.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaDatasetCreatorMapper.java b/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaDatasetCreatorMapper.java deleted file mode 100644 index 50e5f37..0000000 --- a/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaDatasetCreatorMapper.java +++ /dev/null @@ -1,142 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.text.wikipedia; - -import com.google.common.io.Closeables; -import org.apache.commons.lang3.StringEscapeUtils; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.io.DefaultStringifier; -import org.apache.hadoop.io.LongWritable; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.mapreduce.Mapper; -import org.apache.hadoop.util.GenericsUtil; -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; -import org.apache.mahout.common.ClassUtils; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.io.StringReader; -import java.util.ArrayList; -import java.util.HashSet; -import java.util.List; -import java.util.Locale; -import java.util.Set; -import java.util.regex.Pattern; - -/** - * Maps over Wikipedia xml format and output all document having the category listed in the input category - * file - * - */ -public class WikipediaDatasetCreatorMapper extends Mapper<LongWritable, Text, Text, Text> { - - private static final Logger log = LoggerFactory.getLogger(WikipediaDatasetCreatorMapper.class); - - private static final Pattern SPACE_NON_ALPHA_PATTERN = Pattern.compile("[\\s\\W]"); - private static final Pattern OPEN_TEXT_TAG_PATTERN = Pattern.compile("<text xml:space=\"preserve\">"); - private static final Pattern CLOSE_TEXT_TAG_PATTERN = Pattern.compile("</text>"); - - private List<String> inputCategories; - private List<Pattern> inputCategoryPatterns; - private boolean exactMatchOnly; - private Analyzer analyzer; - - @Override - protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { - String document = value.toString(); - document = StringEscapeUtils.unescapeHtml4(CLOSE_TEXT_TAG_PATTERN.matcher( - OPEN_TEXT_TAG_PATTERN.matcher(document).replaceFirst("")).replaceAll("")); - String catMatch = findMatchingCategory(document); - if (!"Unknown".equals(catMatch)) { - StringBuilder contents = new StringBuilder(1000); - TokenStream stream = analyzer.tokenStream(catMatch, new StringReader(document)); - CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class); - stream.reset(); - while (stream.incrementToken()) { - contents.append(termAtt.buffer(), 0, termAtt.length()).append(' '); - } - context.write( - new Text(SPACE_NON_ALPHA_PATTERN.matcher(catMatch).replaceAll("_")), - new Text(contents.toString())); - stream.end(); - Closeables.close(stream, true); - } - } - - @Override - protected void setup(Context context) throws IOException, InterruptedException { - super.setup(context); - - Configuration conf = context.getConfiguration(); - - if (inputCategories == null) { - Set<String> newCategories = new HashSet<>(); - DefaultStringifier<Set<String>> setStringifier = - new DefaultStringifier<>(conf, GenericsUtil.getClass(newCategories)); - String categoriesStr = conf.get("wikipedia.categories", setStringifier.toString(newCategories)); - Set<String> inputCategoriesSet = setStringifier.fromString(categoriesStr); - inputCategories = new ArrayList<>(inputCategoriesSet); - inputCategoryPatterns = new ArrayList<>(inputCategories.size()); - for (String inputCategory : inputCategories) { - inputCategoryPatterns.add(Pattern.compile(".*\\b" + inputCategory + "\\b.*")); - } - - } - - exactMatchOnly = conf.getBoolean("exact.match.only", false); - - if (analyzer == null) { - String analyzerStr = conf.get("analyzer.class", 
WikipediaAnalyzer.class.getName()); - analyzer = ClassUtils.instantiateAs(analyzerStr, Analyzer.class); - } - - log.info("Configure: Input Categories size: {} Exact Match: {} Analyzer: {}", - inputCategories.size(), exactMatchOnly, analyzer.getClass().getName()); - } - - private String findMatchingCategory(String document) { - int startIndex = 0; - int categoryIndex; - while ((categoryIndex = document.indexOf("[[Category:", startIndex)) != -1) { - categoryIndex += 11; - int endIndex = document.indexOf("]]", categoryIndex); - if (endIndex >= document.length() || endIndex < 0) { - break; - } - String category = document.substring(categoryIndex, endIndex).toLowerCase(Locale.ENGLISH).trim(); - // categories.add(category.toLowerCase()); - if (exactMatchOnly && inputCategories.contains(category)) { - return category; - } - if (!exactMatchOnly) { - for (int i = 0; i < inputCategories.size(); i++) { - String inputCategory = inputCategories.get(i); - Pattern inputCategoryPattern = inputCategoryPatterns.get(i); - if (inputCategoryPattern.matcher(category).matches()) { // inexact match with word boundary. - return inputCategory; - } - } - } - startIndex = endIndex; - } - return "Unknown"; - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaDatasetCreatorReducer.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaDatasetCreatorReducer.java b/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaDatasetCreatorReducer.java deleted file mode 100644 index bf921fc..0000000 --- a/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaDatasetCreatorReducer.java +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.text.wikipedia; - -import java.io.IOException; - -import org.apache.hadoop.io.Text; -import org.apache.hadoop.mapreduce.Reducer; - -/** - * Can also be used as a local Combiner - */ -public class WikipediaDatasetCreatorReducer extends Reducer<Text, Text, Text, Text> { - - @Override - protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException { - // Key is label,word, value is the number of times we've seen this label - // word per local node. 
Output is the same - for (Text value : values) { - context.write(key, value); - } - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaMapper.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaMapper.java b/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaMapper.java deleted file mode 100644 index abd3a04..0000000 --- a/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaMapper.java +++ /dev/null @@ -1,179 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.text.wikipedia; - -import java.io.IOException; -import java.util.HashSet; -import java.util.Locale; -import java.util.Set; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -import org.apache.commons.lang3.StringEscapeUtils; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.io.DefaultStringifier; -import org.apache.hadoop.io.LongWritable; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.mapreduce.Mapper; -import org.apache.hadoop.util.GenericsUtil; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * Maps over Wikipedia xml format and output all document having the category listed in the input category - * file - * - */ -public class WikipediaMapper extends Mapper<LongWritable, Text, Text, Text> { - - private static final Logger log = LoggerFactory.getLogger(WikipediaMapper.class); - - private static final Pattern SPACE_NON_ALPHA_PATTERN = Pattern.compile("[\\s]"); - - private static final String START_DOC = "<text xml:space=\"preserve\">"; - - private static final String END_DOC = "</text>"; - - private static final Pattern TITLE = Pattern.compile("<title>(.*)<\\/title>"); - - private static final String REDIRECT = "<redirect />"; - - private Set<String> inputCategories; - - private boolean exactMatchOnly; - - private boolean all; - - private boolean removeLabels; - - @Override - protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { - - String content = value.toString(); - if (content.contains(REDIRECT)) { - return; - } - String document; - String title; - try { - document = getDocument(content); - title = getTitle(content); - } catch (RuntimeException e) { - // TODO: reporter.getCounter("Wikipedia", "Parse errors").increment(1); - return; - } - - String catMatch = findMatchingCategory(document); - if (!all) { - if ("Unknown".equals(catMatch)) { - return; - } - } - - document = StringEscapeUtils.unescapeHtml4(document); - if (removeLabels) { - document = removeCategoriesFromText(document); - // Reject documents 
with malformed tags - if (document == null) { - return; - } - } - - // write out in Bayes input style: key: /Category/document_name - String category = "/" + catMatch.toLowerCase(Locale.ENGLISH) + "/" + - SPACE_NON_ALPHA_PATTERN.matcher(title).replaceAll("_"); - - context.write(new Text(category), new Text(document)); - } - - @Override - protected void setup(Context context) throws IOException, InterruptedException { - super.setup(context); - Configuration conf = context.getConfiguration(); - - Set<String> newCategories = new HashSet<>(); - DefaultStringifier<Set<String>> setStringifier = - new DefaultStringifier<>(conf, GenericsUtil.getClass(newCategories)); - - String categoriesStr = conf.get("wikipedia.categories"); - inputCategories = setStringifier.fromString(categoriesStr); - exactMatchOnly = conf.getBoolean("exact.match.only", false); - all = conf.getBoolean("all.files", false); - removeLabels = conf.getBoolean("remove.labels",false); - log.info("Configure: Input Categories size: {} All: {} Exact Match: {} Remove Labels from Text: {}", - inputCategories.size(), all, exactMatchOnly, removeLabels); - } - - private static String getDocument(String xml) { - int start = xml.indexOf(START_DOC) + START_DOC.length(); - int end = xml.indexOf(END_DOC, start); - return xml.substring(start, end); - } - - private static String getTitle(CharSequence xml) { - Matcher m = TITLE.matcher(xml); - return m.find() ? m.group(1) : ""; - } - - private String findMatchingCategory(String document) { - int startIndex = 0; - int categoryIndex; - while ((categoryIndex = document.indexOf("[[Category:", startIndex)) != -1) { - categoryIndex += 11; - int endIndex = document.indexOf("]]", categoryIndex); - if (endIndex >= document.length() || endIndex < 0) { - break; - } - String category = document.substring(categoryIndex, endIndex).toLowerCase(Locale.ENGLISH).trim(); - if (exactMatchOnly && inputCategories.contains(category)) { - return category.toLowerCase(Locale.ENGLISH); - } - if (!exactMatchOnly) { - for (String inputCategory : inputCategories) { - if (category.contains(inputCategory)) { // we have an inexact match - return inputCategory.toLowerCase(Locale.ENGLISH); - } - } - } - startIndex = endIndex; - } - return "Unknown"; - } - - private String removeCategoriesFromText(String document) { - int startIndex = 0; - int categoryIndex; - try { - while ((categoryIndex = document.indexOf("[[Category:", startIndex)) != -1) { - int endIndex = document.indexOf("]]", categoryIndex); - if (endIndex >= document.length() || endIndex < 0) { - break; - } - document = document.replace(document.substring(categoryIndex, endIndex + 2), ""); - if (categoryIndex < document.length()) { - startIndex = categoryIndex; - } else { - break; - } - } - } catch(StringIndexOutOfBoundsException e) { - return null; - } - return document; - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaXmlSplitter.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaXmlSplitter.java b/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaXmlSplitter.java deleted file mode 100644 index fc065fe..0000000 --- a/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaXmlSplitter.java +++ /dev/null @@ -1,234 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.text.wikipedia; - -import java.io.BufferedWriter; -import java.io.File; -import java.io.FileInputStream; -import java.io.IOException; -import java.io.OutputStreamWriter; -import java.net.URI; -import java.text.DecimalFormat; -import java.text.NumberFormat; - -import org.apache.commons.cli2.CommandLine; -import org.apache.commons.cli2.Group; -import org.apache.commons.cli2.Option; -import org.apache.commons.cli2.OptionException; -import org.apache.commons.cli2.builder.ArgumentBuilder; -import org.apache.commons.cli2.builder.DefaultOptionBuilder; -import org.apache.commons.cli2.builder.GroupBuilder; -import org.apache.commons.cli2.commandline.Parser; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.compress.BZip2Codec; -import org.apache.hadoop.io.compress.CompressionCodec; -import org.apache.mahout.common.CommandLineUtil; -import org.apache.mahout.common.iterator.FileLineIterator; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * <p>The Bayes example package provides some helper classes for training the Naive Bayes classifier - * on the Twenty Newsgroups data. See {@code PrepareTwentyNewsgroups} - * for details on running the trainer and - * formatting the Twenty Newsgroups data properly for the training.</p> - * - * <p>The easiest way to prepare the data is to use the ant task in core/build.xml:</p> - * - * <p>{@code ant extract-20news-18828}</p> - * - * <p>This runs the arg line:</p> - * - * <p>{@code -p $\{working.dir\}/20news-18828/ -o $\{working.dir\}/20news-18828-collapse -a $\{analyzer\} -c UTF-8}</p> - * - * <p>To Run the Wikipedia examples (assumes you've built the Mahout Job jar):</p> - * - * <ol> - * <li>Download the Wikipedia Dataset. 
Use the Ant target: {@code ant enwiki-files}</li> - * <li>Chunk the data using the WikipediaXmlSplitter (from the Hadoop home): - * {@code bin/hadoop jar $MAHOUT_HOME/target/mahout-examples-0.x - * org.apache.mahout.classifier.bayes.WikipediaXmlSplitter - * -d $MAHOUT_HOME/examples/temp/enwiki-latest-pages-articles.xml - * -o $MAHOUT_HOME/examples/work/wikipedia/chunks/ -c 64}</li> - * </ol> - */ -public final class WikipediaXmlSplitter { - - private static final Logger log = LoggerFactory.getLogger(WikipediaXmlSplitter.class); - - private WikipediaXmlSplitter() { } - - public static void main(String[] args) throws IOException { - DefaultOptionBuilder obuilder = new DefaultOptionBuilder(); - ArgumentBuilder abuilder = new ArgumentBuilder(); - GroupBuilder gbuilder = new GroupBuilder(); - - Option dumpFileOpt = obuilder.withLongName("dumpFile").withRequired(true).withArgument( - abuilder.withName("dumpFile").withMinimum(1).withMaximum(1).create()).withDescription( - "The path to the wikipedia dump file (.bz2 or uncompressed)").withShortName("d").create(); - - Option outputDirOpt = obuilder.withLongName("outputDir").withRequired(true).withArgument( - abuilder.withName("outputDir").withMinimum(1).withMaximum(1).create()).withDescription( - "The output directory to place the splits in:\n" - + "local files:\n\t/var/data/wikipedia-xml-chunks or\n\tfile:///var/data/wikipedia-xml-chunks\n" - + "Hadoop DFS:\n\thdfs://wikipedia-xml-chunks\n" - + "AWS S3 (blocks):\n\ts3://bucket-name/wikipedia-xml-chunks\n" - + "AWS S3 (native files):\n\ts3n://bucket-name/wikipedia-xml-chunks\n") - - .withShortName("o").create(); - - Option s3IdOpt = obuilder.withLongName("s3ID").withRequired(false).withArgument( - abuilder.withName("s3Id").withMinimum(1).withMaximum(1).create()).withDescription("Amazon S3 ID key") - .withShortName("i").create(); - Option s3SecretOpt = obuilder.withLongName("s3Secret").withRequired(false).withArgument( - abuilder.withName("s3Secret").withMinimum(1).withMaximum(1).create()).withDescription( - "Amazon S3 secret key").withShortName("s").create(); - - Option chunkSizeOpt = obuilder.withLongName("chunkSize").withRequired(true).withArgument( - abuilder.withName("chunkSize").withMinimum(1).withMaximum(1).create()).withDescription( - "The Size of the chunk, in megabytes").withShortName("c").create(); - Option numChunksOpt = obuilder - .withLongName("numChunks") - .withRequired(false) - .withArgument(abuilder.withName("numChunks").withMinimum(1).withMaximum(1).create()) - .withDescription( - "The maximum number of chunks to create. 
If specified, program will only create a subset of the chunks") - .withShortName("n").create(); - Group group = gbuilder.withName("Options").withOption(dumpFileOpt).withOption(outputDirOpt).withOption( - chunkSizeOpt).withOption(numChunksOpt).withOption(s3IdOpt).withOption(s3SecretOpt).create(); - - Parser parser = new Parser(); - parser.setGroup(group); - CommandLine cmdLine; - try { - cmdLine = parser.parse(args); - } catch (OptionException e) { - log.error("Error while parsing options", e); - CommandLineUtil.printHelp(group); - return; - } - - Configuration conf = new Configuration(); - String dumpFilePath = (String) cmdLine.getValue(dumpFileOpt); - String outputDirPath = (String) cmdLine.getValue(outputDirOpt); - - if (cmdLine.hasOption(s3IdOpt)) { - String id = (String) cmdLine.getValue(s3IdOpt); - conf.set("fs.s3n.awsAccessKeyId", id); - conf.set("fs.s3.awsAccessKeyId", id); - } - if (cmdLine.hasOption(s3SecretOpt)) { - String secret = (String) cmdLine.getValue(s3SecretOpt); - conf.set("fs.s3n.awsSecretAccessKey", secret); - conf.set("fs.s3.awsSecretAccessKey", secret); - } - // do not compute crc file when using local FS - conf.set("fs.file.impl", "org.apache.hadoop.fs.RawLocalFileSystem"); - FileSystem fs = FileSystem.get(URI.create(outputDirPath), conf); - - int chunkSize = 1024 * 1024 * Integer.parseInt((String) cmdLine.getValue(chunkSizeOpt)); - - int numChunks = Integer.MAX_VALUE; - if (cmdLine.hasOption(numChunksOpt)) { - numChunks = Integer.parseInt((String) cmdLine.getValue(numChunksOpt)); - } - - String header = "<mediawiki xmlns=\"http://www.mediawiki.org/xml/export-0.3/\" " - + "xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" " - + "xsi:schemaLocation=\"http://www.mediawiki.org/xml/export-0.3/ " - + "http://www.mediawiki.org/xml/export-0.3.xsd\" " + "version=\"0.3\" " - + "xml:lang=\"en\">\n" + " <siteinfo>\n" + "<sitename>Wikipedia</sitename>\n" - + " <base>http://en.wikipedia.org/wiki/Main_Page</base>\n" - + " <generator>MediaWiki 1.13alpha</generator>\n" + " <case>first-letter</case>\n" - + " <namespaces>\n" + " <namespace key=\"-2\">Media</namespace>\n" - + " <namespace key=\"-1\">Special</namespace>\n" + " <namespace key=\"0\" />\n" - + " <namespace key=\"1\">Talk</namespace>\n" - + " <namespace key=\"2\">User</namespace>\n" - + " <namespace key=\"3\">User talk</namespace>\n" - + " <namespace key=\"4\">Wikipedia</namespace>\n" - + " <namespace key=\"5\">Wikipedia talk</namespace>\n" - + " <namespace key=\"6\">Image</namespace>\n" - + " <namespace key=\"7\">Image talk</namespace>\n" - + " <namespace key=\"8\">MediaWiki</namespace>\n" - + " <namespace key=\"9\">MediaWiki talk</namespace>\n" - + " <namespace key=\"10\">Template</namespace>\n" - + " <namespace key=\"11\">Template talk</namespace>\n" - + " <namespace key=\"12\">Help</namespace>\n" - + " <namespace key=\"13\">Help talk</namespace>\n" - + " <namespace key=\"14\">Category</namespace>\n" - + " <namespace key=\"15\">Category talk</namespace>\n" - + " <namespace key=\"100\">Portal</namespace>\n" - + " <namespace key=\"101\">Portal talk</namespace>\n" + " </namespaces>\n" - + " </siteinfo>\n"; - - StringBuilder content = new StringBuilder(); - content.append(header); - NumberFormat decimalFormatter = new DecimalFormat("0000"); - File dumpFile = new File(dumpFilePath); - - // If the specified path for the input file is incorrect, return immediately - if (!dumpFile.exists()) { - log.error("Input file path {} doesn't exist", dumpFilePath); - return; - } - - FileLineIterator it; - if 
(dumpFilePath.endsWith(".bz2")) { - // default compression format from http://download.wikimedia.org - CompressionCodec codec = new BZip2Codec(); - it = new FileLineIterator(codec.createInputStream(new FileInputStream(dumpFile))); - } else { - // assume the user has previously de-compressed the dump file - it = new FileLineIterator(dumpFile); - } - int fileNumber = 0; - while (it.hasNext()) { - String thisLine = it.next(); - if (thisLine.trim().startsWith("<page>")) { - boolean end = false; - while (!thisLine.trim().startsWith("</page>")) { - content.append(thisLine).append('\n'); - if (it.hasNext()) { - thisLine = it.next(); - } else { - end = true; - break; - } - } - content.append(thisLine).append('\n'); - - if (content.length() > chunkSize || end) { - content.append("</mediawiki>"); - fileNumber++; - String filename = outputDirPath + "/chunk-" + decimalFormatter.format(fileNumber) + ".xml"; - try (BufferedWriter chunkWriter = - new BufferedWriter(new OutputStreamWriter(fs.create(new Path(filename)), "UTF-8"))) { - chunkWriter.write(content.toString(), 0, content.length()); - } - if (fileNumber >= numChunks) { - break; - } - content = new StringBuilder(); - content.append(header); - } - } - } - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/text/wikipedia/XmlInputFormat.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/text/wikipedia/XmlInputFormat.java b/integration/src/main/java/org/apache/mahout/text/wikipedia/XmlInputFormat.java deleted file mode 100644 index afd350f..0000000 --- a/integration/src/main/java/org/apache/mahout/text/wikipedia/XmlInputFormat.java +++ /dev/null @@ -1,164 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.text.wikipedia; - -import com.google.common.io.Closeables; -import org.apache.commons.io.Charsets; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FSDataInputStream; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.DataOutputBuffer; -import org.apache.hadoop.io.LongWritable; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.mapreduce.InputSplit; -import org.apache.hadoop.mapreduce.RecordReader; -import org.apache.hadoop.mapreduce.TaskAttemptContext; -import org.apache.hadoop.mapreduce.lib.input.FileSplit; -import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; - -/** - * Reads records that are delimited by a specific begin/end tag. 
- */ -public class XmlInputFormat extends TextInputFormat { - - private static final Logger log = LoggerFactory.getLogger(XmlInputFormat.class); - - public static final String START_TAG_KEY = "xmlinput.start"; - public static final String END_TAG_KEY = "xmlinput.end"; - - @Override - public RecordReader<LongWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext context) { - try { - return new XmlRecordReader((FileSplit) split, context.getConfiguration()); - } catch (IOException ioe) { - log.warn("Error while creating XmlRecordReader", ioe); - return null; - } - } - - /** - * XMLRecordReader class to read through a given xml document to output xml blocks as records as specified - * by the start tag and end tag - * - */ - public static class XmlRecordReader extends RecordReader<LongWritable, Text> { - - private final byte[] startTag; - private final byte[] endTag; - private final long start; - private final long end; - private final FSDataInputStream fsin; - private final DataOutputBuffer buffer = new DataOutputBuffer(); - private LongWritable currentKey; - private Text currentValue; - - public XmlRecordReader(FileSplit split, Configuration conf) throws IOException { - startTag = conf.get(START_TAG_KEY).getBytes(Charsets.UTF_8); - endTag = conf.get(END_TAG_KEY).getBytes(Charsets.UTF_8); - - // open the file and seek to the start of the split - start = split.getStart(); - end = start + split.getLength(); - Path file = split.getPath(); - FileSystem fs = file.getFileSystem(conf); - fsin = fs.open(split.getPath()); - fsin.seek(start); - } - - private boolean next(LongWritable key, Text value) throws IOException { - if (fsin.getPos() < end && readUntilMatch(startTag, false)) { - try { - buffer.write(startTag); - if (readUntilMatch(endTag, true)) { - key.set(fsin.getPos()); - value.set(buffer.getData(), 0, buffer.getLength()); - return true; - } - } finally { - buffer.reset(); - } - } - return false; - } - - @Override - public void close() throws IOException { - Closeables.close(fsin, true); - } - - @Override - public float getProgress() throws IOException { - return (fsin.getPos() - start) / (float) (end - start); - } - - private boolean readUntilMatch(byte[] match, boolean withinBlock) throws IOException { - int i = 0; - while (true) { - int b = fsin.read(); - // end of file: - if (b == -1) { - return false; - } - // save to buffer: - if (withinBlock) { - buffer.write(b); - } - - // check if we're matching: - if (b == match[i]) { - i++; - if (i >= match.length) { - return true; - } - } else { - i = 0; - } - // see if we've passed the stop point: - if (!withinBlock && i == 0 && fsin.getPos() >= end) { - return false; - } - } - } - - @Override - public LongWritable getCurrentKey() throws IOException, InterruptedException { - return currentKey; - } - - @Override - public Text getCurrentValue() throws IOException, InterruptedException { - return currentValue; - } - - @Override - public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException { - } - - @Override - public boolean nextKeyValue() throws IOException, InterruptedException { - currentKey = new LongWritable(); - currentValue = new Text(); - return next(currentKey, currentValue); - } - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/utils/Bump125.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/utils/Bump125.java 
b/integration/src/main/java/org/apache/mahout/utils/Bump125.java deleted file mode 100644 index 1c55090..0000000 --- a/integration/src/main/java/org/apache/mahout/utils/Bump125.java +++ /dev/null @@ -1,62 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.utils; - -/** - * Helps with making nice intervals at arbitrary scale. - * - * One use case is where we are producing progress or error messages every time an incoming - * record is received. It is generally bad form to produce a message for <i>every</i> input - * so it would be better to produce a message for each of the first 10 records, then every - * other record up to 20 and then every 5 records up to 50 and then every 10 records up to 100, - * more or less. The pattern can now repeat scaled up by 100. The total number of messages will scale - * with the log of the number of input lines which is much more survivable than direct output - * and because early records all get messages, we get indications early. - */ -public class Bump125 { - private static final int[] BUMPS = {1, 2, 5}; - - static int scale(double value, double base) { - double scale = value / base; - // scan for correct step - int i = 0; - while (i < BUMPS.length - 1 && BUMPS[i + 1] <= scale) { - i++; - } - return BUMPS[i]; - } - - static long base(double value) { - return Math.max(1, (long) Math.pow(10, (int) Math.floor(Math.log10(value)))); - } - - private long counter = 0; - - public long increment() { - long delta; - if (counter >= 10) { - long base = base(counter / 4.0); - int scale = scale(counter / 4.0, base); - delta = base * scale; - } else { - delta = 1; - } - counter += delta; - return counter; - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/utils/MatrixDumper.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/utils/MatrixDumper.java b/integration/src/main/java/org/apache/mahout/utils/MatrixDumper.java deleted file mode 100644 index f63de83..0000000 --- a/integration/src/main/java/org/apache/mahout/utils/MatrixDumper.java +++ /dev/null @@ -1,138 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.utils; - -import java.io.File; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.OutputStream; -import java.io.PrintStream; -import java.util.List; -import java.util.Map; - -import org.apache.commons.io.Charsets; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.util.ToolRunner; -import org.apache.mahout.common.AbstractJob; -import org.apache.mahout.common.iterator.sequencefile.SequenceFileValueIterator; -import org.apache.mahout.math.Matrix; -import org.apache.mahout.math.MatrixWritable; - -/** - * Export a Matrix in various text formats: - * * CSV file - * - * Input format: Hadoop SequenceFile with Text key and MatrixWritable value, 1 pair - * TODO: - * Needs class for key value- should not hard-code to Text. - * Options for row and column headers- stats software can be picky. - * Assumes only one matrix in a file. - */ -public final class MatrixDumper extends AbstractJob { - - private MatrixDumper() { } - - public static void main(String[] args) throws Exception { - ToolRunner.run(new MatrixDumper(), args); - } - - @Override - public int run(String[] args) throws Exception { - - addInputOption(); - addOption("output", "o", "Output path", null); // AbstractJob output feature requires param - Map<String, List<String>> parsedArgs = parseArguments(args); - if (parsedArgs == null) { - return -1; - } - String outputFile = hasOption("output") ? 
getOption("output") : null; - exportCSV(getInputPath(), outputFile, false); - return 0; - } - - private static void exportCSV(Path inputPath, String outputFile, boolean doLabels) throws IOException { - SequenceFileValueIterator<MatrixWritable> it = - new SequenceFileValueIterator<>(inputPath, true, new Configuration()); - Matrix m = it.next().get(); - it.close(); - PrintStream ps = getPrintStream(outputFile); - String[] columnLabels = getLabels(m.numCols(), m.getColumnLabelBindings(), "col"); - String[] rowLabels = getLabels(m.numRows(), m.getRowLabelBindings(), "row"); - if (doLabels) { - ps.print("rowid,"); - ps.print(columnLabels[0]); - for (int c = 1; c < m.numCols(); c++) { - ps.print(',' + columnLabels[c]); - } - ps.println(); - } - for (int r = 0; r < m.numRows(); r++) { - if (doLabels) { - ps.print(rowLabels[0] + ','); - } - ps.print(Double.toString(m.getQuick(r,0))); - for (int c = 1; c < m.numCols(); c++) { - ps.print(","); - ps.print(Double.toString(m.getQuick(r,c))); - } - ps.println(); - } - if (ps != System.out) { - ps.close(); - } - } - - private static PrintStream getPrintStream(String outputPath) throws IOException { - if (outputPath == null) { - return System.out; - } - File outputFile = new File(outputPath); - if (outputFile.exists()) { - outputFile.delete(); - } - outputFile.createNewFile(); - OutputStream os = new FileOutputStream(outputFile); - return new PrintStream(os, false, Charsets.UTF_8.displayName()); - } - - /** - * return the label set, sorted by matrix order - * if there are no labels, fabricate them using the starter string - * @param length - */ - private static String[] getLabels(int length, Map<String,Integer> labels, String start) { - if (labels != null) { - return sortLabels(labels); - } - String[] sorted = new String[length]; - for (int i = 1; i <= length; i++) { - sorted[i] = start + i; - } - return sorted; - } - - private static String[] sortLabels(Map<String,Integer> labels) { - String[] sorted = new String[labels.size()]; - for (Map.Entry<String,Integer> entry : labels.entrySet()) { - sorted[entry.getValue()] = entry.getKey(); - } - return sorted; - } - -} http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/utils/SequenceFileDumper.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/utils/SequenceFileDumper.java b/integration/src/main/java/org/apache/mahout/utils/SequenceFileDumper.java deleted file mode 100644 index e01868a..0000000 --- a/integration/src/main/java/org/apache/mahout/utils/SequenceFileDumper.java +++ /dev/null @@ -1,168 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.utils; - -import java.io.File; -import java.io.OutputStreamWriter; -import java.io.Writer; -import java.util.ArrayList; -import java.util.List; - -import com.google.common.io.Closeables; -import com.google.common.io.Files; -import org.apache.commons.io.Charsets; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.FileUtil; -import org.apache.hadoop.fs.Path; -import org.apache.mahout.common.AbstractJob; -import org.apache.mahout.common.Pair; -import org.apache.mahout.common.iterator.sequencefile.PathFilters; -import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterator; -import org.apache.mahout.math.list.IntArrayList; -import org.apache.mahout.math.map.OpenObjectIntHashMap; - -public final class SequenceFileDumper extends AbstractJob { - - public SequenceFileDumper() { - setConf(new Configuration()); - } - - @Override - public int run(String[] args) throws Exception { - - addInputOption(); - addOutputOption(); - addOption("substring", "b", "The number of chars to print out per value", false); - addOption(buildOption("count", "c", "Report the count only", false, false, null)); - addOption("numItems", "n", "Output at most <n> key value pairs", false); - addOption(buildOption("facets", "fa", "Output the counts per key. Note, if there are a lot of unique keys, " - + "this can take up a fair amount of memory", false, false, null)); - addOption(buildOption("quiet", "q", "Print only file contents.", false, false, null)); - - if (parseArguments(args, false, true) == null) { - return -1; - } - - Path[] pathArr; - Configuration conf = new Configuration(); - Path input = getInputPath(); - FileSystem fs = input.getFileSystem(conf); - if (fs.getFileStatus(input).isDir()) { - pathArr = FileUtil.stat2Paths(fs.listStatus(input, PathFilters.logsCRCFilter())); - } else { - pathArr = new Path[1]; - pathArr[0] = input; - } - - - Writer writer; - boolean shouldClose; - if (hasOption("output")) { - shouldClose = true; - writer = Files.newWriter(new File(getOption("output")), Charsets.UTF_8); - } else { - shouldClose = false; - writer = new OutputStreamWriter(System.out, Charsets.UTF_8); - } - try { - for (Path path : pathArr) { - if (!hasOption("quiet")) { - writer.append("Input Path: ").append(String.valueOf(path)).append('\n'); - } - - int sub = Integer.MAX_VALUE; - if (hasOption("substring")) { - sub = Integer.parseInt(getOption("substring")); - } - boolean countOnly = hasOption("count"); - SequenceFileIterator<?, ?> iterator = new SequenceFileIterator<>(path, true, conf); - if (!hasOption("quiet")) { - writer.append("Key class: ").append(iterator.getKeyClass().toString()); - writer.append(" Value Class: ").append(iterator.getValueClass().toString()).append('\n'); - } - OpenObjectIntHashMap<String> facets = null; - if (hasOption("facets")) { - facets = new OpenObjectIntHashMap<>(); - } - long count = 0; - if (countOnly) { - while (iterator.hasNext()) { - Pair<?, ?> record = iterator.next(); - String key = record.getFirst().toString(); - if (facets != null) { - facets.adjustOrPutValue(key, 1, 1); //either insert or add 1 - } - count++; - } - writer.append("Count: ").append(String.valueOf(count)).append('\n'); - } else { - long numItems = Long.MAX_VALUE; - if (hasOption("numItems")) { - numItems = Long.parseLong(getOption("numItems")); - if (!hasOption("quiet")) { - writer.append("Max Items to dump: ").append(String.valueOf(numItems)).append("\n"); - } - } - while (iterator.hasNext() && count < 
numItems) { - Pair<?, ?> record = iterator.next(); - String key = record.getFirst().toString(); - writer.append("Key: ").append(key); - String str = record.getSecond().toString(); - writer.append(": Value: ").append(str.length() > sub - ? str.substring(0, sub) : str); - writer.write('\n'); - if (facets != null) { - facets.adjustOrPutValue(key, 1, 1); //either insert or add 1 - } - count++; - } - if (!hasOption("quiet")) { - writer.append("Count: ").append(String.valueOf(count)).append('\n'); - } - } - if (facets != null) { - List<String> keyList = new ArrayList<>(facets.size()); - - IntArrayList valueList = new IntArrayList(facets.size()); - facets.pairsSortedByKey(keyList, valueList); - writer.append("-----Facets---\n"); - writer.append("Key\t\tCount\n"); - int i = 0; - for (String key : keyList) { - writer.append(key).append("\t\t").append(String.valueOf(valueList.get(i++))).append('\n'); - } - } - } - writer.flush(); - - } finally { - if (shouldClose) { - Closeables.close(writer, false); - } - } - - - return 0; - } - - public static void main(String[] args) throws Exception { - new SequenceFileDumper().run(args); - } - -}
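As a usage note, the dumper deleted above can be invoked directly with the same option strings its run() method parses; a small sketch follows, assuming the usual --longName form of Mahout's AbstractJob options and a hypothetical SequenceFile path.

// Sketch of calling the dumper above with the options its run() method defines.
// --substring truncates each printed value, --numItems caps how many pairs are dumped.
public final class SeqDumpSketch {
  public static void main(String[] args) throws Exception {
    new org.apache.mahout.utils.SequenceFileDumper().run(new String[] {
        "--input", "/tmp/wikipedia/seqfiles/part-r-00000",  // hypothetical SequenceFile
        "--substring", "100",                               // print at most 100 chars per value
        "--numItems", "10"                                  // dump at most 10 key/value pairs
    });
  }
}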

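Relatedly, the Bump125 helper deleted earlier in this patch is easiest to understand from a caller's point of view: each increment() call returns the next record count at which a progress message should be emitted, following the 1-2-5 spacing its javadoc describes. A small sketch, with a hypothetical loop bound:

import org.apache.mahout.utils.Bump125;

// Sketch of throttled progress reporting with the deleted Bump125 helper:
// log only when the record count reaches the next checkpoint it hands back.
public final class ProgressSketch {
  public static void main(String[] args) {
    Bump125 bump = new Bump125();
    long nextReport = bump.increment();   // first checkpoint is 1
    for (long record = 1; record <= 1000; record++) {
      if (record == nextReport) {
        System.out.printf("processed %d records%n", record);
        nextReport = bump.increment();    // deltas grow through 1, 2, 5 steps per decade
      }
    }
  }
}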