Author: gsingers
Date: Sat Nov 5 17:11:57 2011
New Revision: 1197992
URL: http://svn.apache.org/viewvc?rev=1197992&view=rev
Log:
MAHOUT-403: add in some regex transformation capabilities for converting raw
content
Added:
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/AnalyzerTransformer.java
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/ChainTransformer.java
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/FPGFormatter.java
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/IdentityFormatter.java
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/IdentityTransformer.java
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/RegexConverterDriver.java
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/RegexFormatter.java
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/RegexMapper.java
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/RegexTransformer.java
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/RegexUtils.java
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/URLDecodeTransformer.java
mahout/trunk/integration/src/test/java/org/apache/mahout/utils/regex/
mahout/trunk/integration/src/test/java/org/apache/mahout/utils/regex/RegexMapperTest.java
mahout/trunk/integration/src/test/java/org/apache/mahout/utils/regex/RegexUtilsTest.java
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/common/AbstractJob.java
mahout/trunk/core/src/main/java/org/apache/mahout/common/commandline/DefaultOptionCreator.java
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/EncodedVectorsFromSequenceFiles.java
mahout/trunk/src/conf/driver.classes.props
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/common/AbstractJob.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/common/AbstractJob.java?rev=1197992&r1=1197991&r2=1197992&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/common/AbstractJob.java
(original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/common/AbstractJob.java
Sat Nov 5 17:11:57 2011
@@ -47,7 +47,9 @@ import org.apache.hadoop.mapreduce.lib.i
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.Tool;
+import org.apache.lucene.analysis.Analyzer;
import org.apache.mahout.common.commandline.DefaultOptionCreator;
+import org.apache.mahout.vectorizer.DefaultAnalyzer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -448,4 +450,15 @@ public abstract class AbstractJob extend
FileInputFormat.setInputPaths(job, inputPathOne.makeQualified(fs),
inputPathTwo.makeQualified(fs));
}
+ /**
+  * Resolves the Lucene {@link Analyzer} class selected on the command line.
+  *
+  * @return the class named by the {@link DefaultOptionCreator#ANALYZER_NAME_OPTION} option, or
+  *         {@link DefaultAnalyzer} when that option was not given
+  * @throws ClassNotFoundException if the named class cannot be loaded
+  */
+ protected Class<? extends Analyzer> getAnalyzerClassFromOption() throws ClassNotFoundException {
+   if (!hasOption(DefaultOptionCreator.ANALYZER_NAME_OPTION)) {
+     return DefaultAnalyzer.class;
+   }
+   String requestedName = getOption(DefaultOptionCreator.ANALYZER_NAME_OPTION).toString();
+   Class<? extends Analyzer> requested = Class.forName(requestedName).asSubclass(Analyzer.class);
+   // Instantiate it once here: there is no point in selecting an analyzer
+   // class that cannot be instantiated.
+   ClassUtils.instantiateAs(requested, Analyzer.class);
+   return requested;
+ }
}
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/common/commandline/DefaultOptionCreator.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/common/commandline/DefaultOptionCreator.java?rev=1197992&r1=1197991&r2=1197992&view=diff
==============================================================================
---
mahout/trunk/core/src/main/java/org/apache/mahout/common/commandline/DefaultOptionCreator.java
(original)
+++
mahout/trunk/core/src/main/java/org/apache/mahout/common/commandline/DefaultOptionCreator.java
Sat Nov 5 17:11:57 2011
@@ -23,6 +23,7 @@ import org.apache.commons.cli2.builder.D
import org.apache.mahout.clustering.meanshift.MeanShiftCanopyDriver;
import org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure;
import org.apache.mahout.common.kernel.TriangularKernelProfile;
+import org.apache.mahout.vectorizer.DefaultAnalyzer;
public final class DefaultOptionCreator {
@@ -67,6 +68,8 @@ public final class DefaultOptionCreator
public static final String MAPREDUCE_METHOD = "mapreduce";
public static final String KERNEL_PROFILE_OPTION = "kernelProfile";
+
+ public static final String ANALYZER_NAME_OPTION = "analyzerName";
private DefaultOptionCreator() {}
@@ -321,6 +324,24 @@ public static DefaultOptionBuilder clust
"If present, run clustering after the iterations have taken place")
.withShortName("cl");
}
+
+ /**
+ * Returns a default command line option for specifying a Lucene analyzer class.
+ * The option is not required; its long name is {@link #ANALYZER_NAME_OPTION}, its short
+ * name is "an", and it takes exactly one argument that defaults to the class name of
+ * {@link DefaultAnalyzer}.
+ *
+ * @return {@link DefaultOptionBuilder}
+ */
+ public static DefaultOptionBuilder analyzerOption() {
+ return new DefaultOptionBuilder()
+ .withLongName(ANALYZER_NAME_OPTION)
+ .withRequired(false)
+ .withDescription(
+ "If present, the name of a Lucene analyzer class to use")
+ .withArgument(
+ new
ArgumentBuilder().withName(ANALYZER_NAME_OPTION).withDefault(DefaultAnalyzer.class.getName())
+ .withMinimum(1).withMaximum(1).create()
+ )
+ .withShortName("an");
+ }
+
/**
* Returns a default command line option for specifying the emitMostLikely
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/EncodedVectorsFromSequenceFiles.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/EncodedVectorsFromSequenceFiles.java?rev=1197992&r1=1197991&r2=1197992&view=diff
==============================================================================
---
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/EncodedVectorsFromSequenceFiles.java
(original)
+++
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/EncodedVectorsFromSequenceFiles.java
Sat Nov 5 17:11:57 2011
@@ -45,7 +45,7 @@ public final class EncodedVectorsFromSeq
public int run(String[] args) throws Exception {
addInputOption();
addOutputOption();
- addOption("analyzerName", "an", "The class name of the analyzer",
DefaultAnalyzer.class.getName());
+ addOption(DefaultOptionCreator.analyzerOption().create());
addOption(buildOption("sequentialAccessVector", "seq", "(Optional) Whether
output vectors should be SequentialAccessVectors. If set true else false",
false, false, null));
addOption(buildOption("namedVector", "nv", "Create named vectors using the
key. False by default", false, false, null));
addOption("cardinality", "c", "The cardinality to use for creating the
vectors. Default is 5000", String.valueOf(5000));
@@ -63,14 +63,7 @@ public final class EncodedVectorsFromSeq
HadoopUtil.delete(getConf(), output);
}
- Class<? extends Analyzer> analyzerClass = DefaultAnalyzer.class;
- if (hasOption("analyzerName")) {
- String className = getOption("analyzerName").toString();
- analyzerClass = Class.forName(className).asSubclass(Analyzer.class);
- // try instantiating it, b/c there isn't any point in setting it if
- // you can't instantiate it
- ClassUtils.instantiateAs(analyzerClass, Analyzer.class);
- }
+ Class<? extends Analyzer> analyzerClass = getAnalyzerClassFromOption();
Configuration conf = getConf();
Added:
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/AnalyzerTransformer.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/AnalyzerTransformer.java?rev=1197992&view=auto
==============================================================================
---
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/AnalyzerTransformer.java
(added)
+++
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/AnalyzerTransformer.java
Sat Nov 5 17:11:57 2011
@@ -0,0 +1,58 @@
+package org.apache.mahout.utils.regex;
+
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.util.Version;
+import org.apache.mahout.common.lucene.TokenStreamIterator;
+
+import java.io.IOException;
+import java.io.StringReader;
+
+/**
+ * A {@link RegexTransformer} that runs each match through a Lucene {@link Analyzer} and
+ * returns the produced tokens, each followed by a single space.
+ **/
+public class AnalyzerTransformer implements RegexTransformer {
+  protected Analyzer analyzer;
+  protected String fieldName = "text";
+
+  /** Uses a {@link StandardAnalyzer} built for {@link Version#LUCENE_34} and the field name "text". */
+  public AnalyzerTransformer() {
+    this(new StandardAnalyzer(Version.LUCENE_34), "text");
+  }
+
+  /**
+   * @param analyzer the analyzer used to tokenize matches; the field name is "text"
+   */
+  public AnalyzerTransformer(Analyzer analyzer) {
+    this(analyzer, "text");
+  }
+
+  /**
+   * @param analyzer  the analyzer used to tokenize matches
+   * @param fieldName the field name handed to the analyzer when a token stream is requested
+   */
+  public AnalyzerTransformer(Analyzer analyzer, String fieldName) {
+    this.analyzer = analyzer;
+    this.fieldName = fieldName;
+  }
+
+  /**
+   * Tokenizes the match with the configured analyzer.
+   *
+   * @param match the matched text
+   * @return the tokens in stream order, each followed by one space (so a non-empty result
+   *         ends with a space); empty if the analyzer yields no tokens
+   * @throws RuntimeException wrapping any {@link IOException} raised while obtaining the stream
+   */
+  @Override
+  public String transformMatch(String match) {
+    StringBuilder tokens = new StringBuilder();
+    try {
+      TokenStream stream = analyzer.reusableTokenStream(fieldName, new StringReader(match));
+      stream.addAttribute(CharTermAttribute.class);
+      // NOTE(review): the stream is never reset/ended/closed here; presumably
+      // TokenStreamIterator or the analyzer's stream reuse covers that - confirm.
+      for (TokenStreamIterator it = new TokenStreamIterator(stream); it.hasNext();) {
+        tokens.append(it.next()).append(" ");
+      }
+    } catch (IOException e) {
+      throw new RuntimeException(e);
+    }
+    return tokens.toString();
+  }
+
+  /** @return the analyzer currently used to tokenize matches */
+  public Analyzer getAnalyzer() {
+    return analyzer;
+  }
+
+  /** @param analyzer the analyzer to use for subsequent calls to {@link #transformMatch(String)} */
+  public void setAnalyzer(Analyzer analyzer) {
+    this.analyzer = analyzer;
+  }
+}
Added:
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/ChainTransformer.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/ChainTransformer.java?rev=1197992&view=auto
==============================================================================
---
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/ChainTransformer.java
(added)
+++
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/ChainTransformer.java
Sat Nov 5 17:11:57 2011
@@ -0,0 +1,39 @@
+package org.apache.mahout.utils.regex;
+
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Chains together several {@link org.apache.mahout.utils.regex.RegexTransformer}s and applies
+ * them to the match in succession: the output of each one is the input of the next.
+ **/
+public class ChainTransformer implements RegexTransformer {
+
+  private List<RegexTransformer> chain = new ArrayList<RegexTransformer>();
+
+  /** Creates a transformer with an empty chain, which returns every match unchanged. */
+  public ChainTransformer() {
+  }
+
+  /**
+   * @param chain the transformers to apply, in list order; the list is stored as given, not copied
+   */
+  public ChainTransformer(List<RegexTransformer> chain) {
+    this.chain = chain;
+  }
+
+  /**
+   * Feeds the match through every transformer of the chain in order.
+   *
+   * @param match the matched text
+   * @return the output of the last transformer, or {@code match} itself if the chain is empty
+   */
+  @Override
+  public String transformMatch(String match) {
+    String current = match;
+    for (RegexTransformer step : chain) {
+      current = step.transformMatch(current);
+    }
+    return current;
+  }
+
+  /** @return the list of transformers backing this chain (not a copy) */
+  public List<RegexTransformer> getChain() {
+    return chain;
+  }
+
+  /** @param chain the list of transformers to apply from now on; stored as given */
+  public void setChain(List<RegexTransformer> chain) {
+    this.chain = chain;
+  }
+}
Added:
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/FPGFormatter.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/FPGFormatter.java?rev=1197992&view=auto
==============================================================================
---
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/FPGFormatter.java
(added)
+++
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/FPGFormatter.java
Sat Nov 5 17:11:57 2011
@@ -0,0 +1,32 @@
+package org.apache.mahout.utils.regex;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+import java.util.regex.Pattern;
+
+/**
+ * Prefixes the input with a tab and replaces every run of non-word characters
+ * (regex {@code \W+}, i.e. anything other than letters, digits and underscore) with a
+ * single {@code |}. For example {@code "a b, c"} becomes {@code "\ta|b|c"}.
+ **/
+public class FPGFormatter implements RegexFormatter {
+  // Matches one or more consecutive non-word characters, not just whitespace.
+  private static final Pattern NON_WORD_RUN = Pattern.compile("\\W+");
+
+  /**
+   * @param toFormat the text to format
+   * @return a tab followed by {@code toFormat} with each run of non-word characters replaced by "|"
+   */
+  @Override
+  public String format(String toFormat) {
+    return "\t" + NON_WORD_RUN.matcher(toFormat).replaceAll("|");
+  }
+}
Added:
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/IdentityFormatter.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/IdentityFormatter.java?rev=1197992&view=auto
==============================================================================
---
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/IdentityFormatter.java
(added)
+++
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/IdentityFormatter.java
Sat Nov 5 17:11:57 2011
@@ -0,0 +1,14 @@
+package org.apache.mahout.utils.regex;
+
+
+/**
+ * A {@link RegexFormatter} that performs no formatting.
+ **/
+public class IdentityFormatter implements RegexFormatter {
+
+  /**
+   * @param toFormat the text to format
+   * @return {@code toFormat}, unchanged
+   */
+  @Override
+  public String format(String toFormat) {
+    return toFormat;
+  }
+}
Added:
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/IdentityTransformer.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/IdentityTransformer.java?rev=1197992&view=auto
==============================================================================
---
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/IdentityTransformer.java
(added)
+++
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/IdentityTransformer.java
Sat Nov 5 17:11:57 2011
@@ -0,0 +1,28 @@
+package org.apache.mahout.utils.regex;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+/**
+ * No-op {@link RegexTransformer}: hands every match back untouched.
+ */
+public class IdentityTransformer implements RegexTransformer {
+
+  /**
+   * @param match the matched text
+   * @return {@code match}, unchanged
+   */
+  @Override
+  public String transformMatch(String match) {
+    return match;
+  }
+}
Added:
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/RegexConverterDriver.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/RegexConverterDriver.java?rev=1197992&view=auto
==============================================================================
---
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/RegexConverterDriver.java
(added)
+++
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/RegexConverterDriver.java
Sat Nov 5 17:11:57 2011
@@ -0,0 +1,103 @@
+package org.apache.mahout.utils.regex;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.mahout.common.AbstractJob;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.common.commandline.DefaultOptionCreator;
+import org.apache.mahout.vectorizer.DefaultAnalyzer;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+/**
+ * Experimental
+ */
+public class RegexConverterDriver extends AbstractJob {
+
+ @Override
+ public int run(String[] args) throws Exception {
+ addInputOption();
+ addOutputOption();
+ addOption(DefaultOptionCreator.overwriteOption().create());
+ addOption("regex", "regex",
+ "The regular expression to use", true);
+ addOption("groupsToKeep", "g",
+ "The number of the capturing groups to keep", false);
+ addOption("transformerClass", "t",
+ "The optional class specifying the Regex Transformer", false);
+ addOption("formatterClass", "t",
+ "The optional class specifying the Regex Formatter", false);
+ addOption(DefaultOptionCreator.analyzerOption().create());
+
+ if (parseArguments(args) == null) {
+ return -1;
+ }
+
+ Configuration conf = getConf();
+ //TODO: How to deal with command line escaping?
+ conf.set(RegexMapper.REGEX, getOption("regex"));//
+ String gtk = getOption("groupsToKeep");
+ if (gtk != null) {
+ conf.set(RegexMapper.GROUP_MATCHERS, gtk);
+ }
+ String trans = getOption("transformerClass");
+ if (trans != null) {
+ if (trans.equalsIgnoreCase("url")) {
+ trans = URLDecodeTransformer.class.getName();
+ }
+ conf.set(RegexMapper.TRANSFORMER_CLASS, trans);
+ }
+ String formatter = getOption("formatterClass");
+ if (formatter != null) {
+ if (formatter.equalsIgnoreCase("fpg")) {
+ formatter = FPGFormatter.class.getName();
+ }
+ conf.set(RegexMapper.FORMATTER_CLASS, formatter);
+ }
+ Path input = getInputPath();
+ Path output = getOutputPath();
+ if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
+ HadoopUtil.delete(getConf(), output);
+ }
+ Class<? extends Analyzer> analyzerClass = getAnalyzerClassFromOption();
+ if (analyzerClass != null) {
+ conf.set(RegexMapper.ANALYZER_NAME, analyzerClass.getName());
+ }
+ Job job = prepareJob(input, output,
+ TextInputFormat.class,
+ RegexMapper.class,
+ LongWritable.class,
+ Text.class,
+ TextOutputFormat.class);
+ job.waitForCompletion(true);
+
+ return 0;
+ }
+
+ public static void main(String[] args) throws Exception {
+ ToolRunner.run(new RegexConverterDriver(), args);
+ }
+
+}
Added:
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/RegexFormatter.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/RegexFormatter.java?rev=1197992&view=auto
==============================================================================
---
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/RegexFormatter.java
(added)
+++
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/RegexFormatter.java
Sat Nov 5 17:11:57 2011
@@ -0,0 +1,26 @@
+package org.apache.mahout.utils.regex;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+/**
+ * Formats a piece of text, returning the formatted result.
+ **/
+public interface RegexFormatter {
+
+  /**
+   * @param toFormat the text to format
+   * @return the formatted text
+   */
+  String format(String toFormat);
+}
Added:
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/RegexMapper.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/RegexMapper.java?rev=1197992&view=auto
==============================================================================
---
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/RegexMapper.java
(added)
+++
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/RegexMapper.java
Sat Nov 5 17:11:57 2011
@@ -0,0 +1,82 @@
+package org.apache.mahout.utils.regex;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.mahout.common.ClassUtils;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Pattern;
+
+
+/**
+ * Mapper that extracts the regular-expression matches from each input value, transforms them,
+ * formats the joined result and writes it under the input key. Inputs whose extraction result
+ * is empty produce no output.
+ **/
+public class RegexMapper extends Mapper<LongWritable, Text, LongWritable, Text> {
+  public static final String REGEX = "regex";
+  public static final String GROUP_MATCHERS = "regex.groups";
+  public static final String TRANSFORMER_CLASS = "transformer.class";
+  public static final String FORMATTER_CLASS = "formatter.class";
+  public static final String ANALYZER_NAME = "analyzerName";
+
+  private Pattern regex;
+  private List<Integer> groupsToKeep;
+  private RegexTransformer transformer = RegexUtils.IDENTITY_TRANSFORMER;
+  private RegexFormatter formatter = RegexUtils.IDENTITY_FORMATTER;
+
+  /**
+   * Reads the pattern, the capturing groups to keep, and the transformer, analyzer and
+   * formatter class names from the job configuration.
+   */
+  @Override
+  protected void setup(Context context) throws IOException, InterruptedException {
+    groupsToKeep = new ArrayList<Integer>();
+    Configuration config = context.getConfiguration();
+    regex = Pattern.compile(config.get(REGEX));
+    String[] groups = config.getStrings(GROUP_MATCHERS);
+    if (groups != null) {
+      for (String group : groups) {
+        groupsToKeep.add(Integer.parseInt(group));
+      }
+    }
+
+    String transformerName = config.get(TRANSFORMER_CLASS, IdentityTransformer.class.getName());
+    transformer = ClassUtils.instantiateAs(transformerName, RegexTransformer.class);
+    // The analyzer setting is only applied when the transformer is an AnalyzerTransformer.
+    String analyzerName = config.get(ANALYZER_NAME);
+    if (analyzerName != null && transformer instanceof AnalyzerTransformer) {
+      Analyzer analyzer = ClassUtils.instantiateAs(analyzerName, Analyzer.class);
+      ((AnalyzerTransformer) transformer).setAnalyzer(analyzer);
+    }
+
+    String formatterName = config.get(FORMATTER_CLASS, IdentityFormatter.class.getName());
+    formatter = ClassUtils.instantiateAs(formatterName, RegexFormatter.class);
+  }
+
+  /**
+   * Extracts the matches from {@code text}, joined by a single space, and writes the formatted
+   * result under {@code key}; writes nothing when the extraction is null or empty.
+   */
+  @Override
+  protected void map(LongWritable key, Text text, Context context) throws IOException, InterruptedException {
+    String extracted = RegexUtils.extract(text.toString(), regex, groupsToKeep, " ", transformer);
+    if (extracted == null || extracted.length() == 0) {
+      return;
+    }
+    context.write(key, new Text(formatter.format(extracted)));
+  }
+}
Added:
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/RegexTransformer.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/RegexTransformer.java?rev=1197992&view=auto
==============================================================================
---
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/RegexTransformer.java
(added)
+++
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/RegexTransformer.java
Sat Nov 5 17:11:57 2011
@@ -0,0 +1,26 @@
+package org.apache.mahout.utils.regex;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+/**
+ * Transforms the match of a regular expression.
+ */
+public interface RegexTransformer {
+
+  /**
+   * @param match the text matched by the regular expression
+   * @return the transformed text
+   */
+  String transformMatch(String match);
+
+}
Added:
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/RegexUtils.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/RegexUtils.java?rev=1197992&view=auto
==============================================================================
---
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/RegexUtils.java
(added)
+++
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/RegexUtils.java
Sat Nov 5 17:11:57 2011
@@ -0,0 +1,69 @@
+package org.apache.mahout.utils.regex;
+
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+/**
+ *
+ *
+ **/
+public class RegexUtils {
+ public static final RegexTransformer IDENTITY_TRANSFORMER = new
IdentityTransformer();
+ public static final RegexFormatter IDENTITY_FORMATTER = new
IdentityFormatter();
+
+ public static String extract(String line, Pattern pattern, List<Integer>
groupsToKeep,
+ String separator, RegexTransformer transformer)
{
+ StringBuilder bldr = new StringBuilder();
+ extract(line, bldr, pattern, groupsToKeep, separator, transformer);
+ return bldr.toString();
+ }
+
+ public static void extract(String line, StringBuilder outputBuffer,
+ Pattern pattern, List<Integer> groupsToKeep,
String separator,
+ RegexTransformer transformer) {
+ if (transformer == null) {
+ transformer = IDENTITY_TRANSFORMER;
+ }
+ Matcher matcher = pattern.matcher(line);
+ String match;
+ if (groupsToKeep.isEmpty() == false) {
+ while (matcher.find() == true) {
+ for (Integer groupNum : groupsToKeep) {
+ match = matcher.group(groupNum);
+ if (match != null) {
+
outputBuffer.append(transformer.transformMatch(match)).append(separator);
+ }
+ }
+ }
+ } else {
+ while (matcher.find() == true) {
+ match = matcher.group();
+ if (match != null) {
+
outputBuffer.append(transformer.transformMatch(match)).append(separator);
+ }
+ }
+ }
+ //trim off the last separator, which is always there
+ if (outputBuffer.length() > 0) {
+ outputBuffer.setLength(outputBuffer.length() - separator.length());
+ }
+ }
+}
Added:
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/URLDecodeTransformer.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/URLDecodeTransformer.java?rev=1197992&view=auto
==============================================================================
---
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/URLDecodeTransformer.java
(added)
+++
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/regex/URLDecodeTransformer.java
Sat Nov 5 17:11:57 2011
@@ -0,0 +1,46 @@
+package org.apache.mahout.utils.regex;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.UnsupportedEncodingException;
+import java.net.URLDecoder;
+
+
+/**
+ *
+ *
+ **/
+public class URLDecodeTransformer implements RegexTransformer {
+ private String enc;
+
+ public URLDecodeTransformer() {
+ enc = "UTF-8";
+ }
+
+ public URLDecodeTransformer(String encoding) {
+ this.enc = encoding;
+ }
+
+ @Override
+ public String transformMatch(String match) {
+ try {
+ return URLDecoder.decode(match, enc);
+ } catch (UnsupportedEncodingException e) {
+ throw new RuntimeException(e);
+ }
+ }
+}
Added:
mahout/trunk/integration/src/test/java/org/apache/mahout/utils/regex/RegexMapperTest.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/integration/src/test/java/org/apache/mahout/utils/regex/RegexMapperTest.java?rev=1197992&view=auto
==============================================================================
---
mahout/trunk/integration/src/test/java/org/apache/mahout/utils/regex/RegexMapperTest.java
(added)
+++
mahout/trunk/integration/src/test/java/org/apache/mahout/utils/regex/RegexMapperTest.java
Sat Nov 5 17:11:57 2011
@@ -0,0 +1,114 @@
+package org.apache.mahout.utils.regex;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.mahout.common.DummyRecordWriter;
+import org.apache.mahout.common.MahoutTestCase;
+import org.junit.Test;
+
+import java.util.List;
+
+
+/**
+ * Drives {@link RegexMapper} over the sample lines of {@link RegexUtilsTest} through a
+ * {@link DummyRecordWriter}-backed context and checks the values written for each input key.
+ **/
+
+public class RegexMapperTest extends MahoutTestCase {
+
+ // Extracts the q= value with URL decoding; inputs whose expected value is empty are not asserted.
+ @Test
+ public void testRegex() throws Exception {
+ RegexMapper mapper = new RegexMapper();
+ Configuration conf = new Configuration();
+ conf.set(RegexMapper.REGEX, "(?<=(\\?|&)q=).*?(?=&|$)");
+ conf.set(RegexMapper.TRANSFORMER_CLASS,
URLDecodeTransformer.class.getName());
+ // mapWriter records what the mapper writes so it can be read back per key below
+ DummyRecordWriter<LongWritable, Text> mapWriter = new
DummyRecordWriter<LongWritable, Text>();
+ Mapper<LongWritable, Text, LongWritable, Text>.Context mapContext =
DummyRecordWriter
+ .build(mapper, conf, mapWriter);
+
+ mapper.setup(mapContext);
+ for (int i = 0; i < RegexUtilsTest.TEST_STRS.length; i++) {
+ String testStr = RegexUtilsTest.TEST_STRS[i];
+
+ LongWritable key = new LongWritable(i); // the line index doubles as the map key
+ mapper.map(key, new Text(testStr), mapContext);
+ List<Text> value = mapWriter.getValue(key);
+ if (RegexUtilsTest.GOLD[i].equals("") == false) { // skip samples with an empty expected query
+ assertEquals(1, value.size());
+ assertEquals(RegexUtilsTest.GOLD[i], value.get(0).toString());
+ }
+ }
+ }
+ // Configures GROUP_MATCHERS with groups 1 and 3 of a dotted-number pattern.
+ @Test
+ public void testGroups() throws Exception {
+ RegexMapper mapper = new RegexMapper();
+ Configuration conf = new Configuration();
+ conf.set(RegexMapper.REGEX, "(\\d+)\\.(\\d+)\\.(\\d+)");
+ conf.set(RegexMapper.TRANSFORMER_CLASS,
URLDecodeTransformer.class.getName());
+ conf.setStrings(RegexMapper.GROUP_MATCHERS, "1", "3"); // group numbers are passed as strings
+ // mapWriter records what the mapper writes so it can be read back per key below
+ DummyRecordWriter<LongWritable, Text> mapWriter = new
DummyRecordWriter<LongWritable, Text>();
+ Mapper<LongWritable, Text, LongWritable, Text>.Context mapContext =
DummyRecordWriter
+ .build(mapper, conf, mapWriter);
+
+ mapper.setup(mapContext);
+ for (int i = 0; i < RegexUtilsTest.TEST_STRS.length; i++) {
+ String testStr = RegexUtilsTest.TEST_STRS[i];
+
+ LongWritable key = new LongWritable(i);
+ mapper.map(key, new Text(testStr), mapContext);
+ List<Text> value = mapWriter.getValue(key);
+ String gold = "127 0"; // groups 1 and 3 of the 127.0.0 prefix that starts every sample line
+ assertEquals(1, value.size());
+ assertEquals(gold, value.get(0).toString());
+ }
+ }
+ // Same extraction as testRegex, with FPGFormatter configured as the formatter class.
+ @Test
+ public void testFPGFormatter() throws Exception {
+ RegexMapper mapper = new RegexMapper();
+ Configuration conf = new Configuration();
+ conf.set(RegexMapper.REGEX, "(?<=(\\?|&)q=).*?(?=&|$)");
+ conf.set(RegexMapper.TRANSFORMER_CLASS,
URLDecodeTransformer.class.getName());
+ conf.set(RegexMapper.FORMATTER_CLASS, FPGFormatter.class.getName());
+ // mapWriter records what the mapper writes so it can be read back per key below
+ DummyRecordWriter<LongWritable, Text> mapWriter = new
DummyRecordWriter<LongWritable, Text>();
+ Mapper<LongWritable, Text, LongWritable, Text>.Context mapContext =
DummyRecordWriter
+ .build(mapper, conf, mapWriter);
+
+ mapper.setup(mapContext);
+ FPGFormatter formatter = new FPGFormatter(); // separate instance, used only to build the expected text
+ for (int i = 0; i < RegexUtilsTest.TEST_STRS.length; i++) {
+ String testStr = RegexUtilsTest.TEST_STRS[i];
+
+ LongWritable key = new LongWritable(i);
+ mapper.map(key, new Text(testStr), mapContext);
+ List<Text> value = mapWriter.getValue(key);
+ if (RegexUtilsTest.GOLD[i].equals("") == false) { // skip samples with an empty expected query
+ assertEquals(1, value.size());
+ assertEquals(formatter.format(RegexUtilsTest.GOLD[i]),
value.get(0).toString());
+ }
+ }
+ }
+}
Added:
mahout/trunk/integration/src/test/java/org/apache/mahout/utils/regex/RegexUtilsTest.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/integration/src/test/java/org/apache/mahout/utils/regex/RegexUtilsTest.java?rev=1197992&view=auto
==============================================================================
---
mahout/trunk/integration/src/test/java/org/apache/mahout/utils/regex/RegexUtilsTest.java
(added)
+++
mahout/trunk/integration/src/test/java/org/apache/mahout/utils/regex/RegexUtilsTest.java
Sat Nov 5 17:11:57 2011
@@ -0,0 +1,67 @@
+package org.apache.mahout.utils.regex;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+import org.apache.mahout.common.MahoutTestCase;
+import org.junit.Test;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.regex.Pattern;
+
+
+/**
+ * Tests RegexUtils.extract on sample request-log lines; TEST_STRS and GOLD are also read by RegexMapperTest.
+ * GOLD[i] is the URL-decoded q parameter expected from TEST_STRS[i] (empty where q= has no value).
+ **/
+public class RegexUtilsTest extends MahoutTestCase {
+ public static final String[] TEST_STRS = new String[]{ // the last entry has an empty q= value
+ "127.0.0.1 - - [01/10/2011:00:01:51 +0000] \"GET
/solr/collection1/browse?q=foo&rows=10&wt=json&hl=true&hl.fl=body&hl.fl=content",
+ "127.0.0.1 - - [01/10/2011:00:20:58 +0000] \"GET
/solr/collection1/browse?q=Using+Solr+Search+RDBMS&fq=%7B%21tag%3Dsource%7D%28%28source%3Alucid+AND+lucid_facet%3A%28site%29%29%29&rows=10",
+ "127.0.0.1 - - [01/10/2011:00:21:21 +0000] \"GET
/solr/collection1/browse?q=language+detection&start=560&rows=10 HTTP/1.1\" 200
45071",
+ "127.0.0.1 - - [01/10/2011:00:21:21 +0000] \"GET
/solr/collection1/browse?q=&start=560&rows=10 HTTP/1.1\" 200 45071"
+ };
+ public static final String[] GOLD = new String[]{"foo", "Using Solr Search
RDBMS", "language detection", ""};
+
+ @Test
+ public void testExtract() throws Exception {
+ String line = "127.0.0.1 - - [24/05/2010:01:19:22 +0000] \"GET
/solr/select?q=import statement&start=1 HTTP/1.1\" 200 37571";
+ String res;
+ Pattern pattern;
+ pattern = Pattern.compile("(?<=(\\?|&)q=).*?(?=&|$)"); // text after ?q= or &q=, up to the next & or end of input
+ res = RegexUtils.extract(line, pattern, Collections.<Integer>emptyList(),
" ", RegexUtils.IDENTITY_TRANSFORMER);
+ assertTrue(res, res.equals("import statement"));
+
+ for (int i = 0; i < TEST_STRS.length; i++) { // with URL decoding, each sample must yield GOLD[i]
+ String testStr = TEST_STRS[i];
+ res = RegexUtils.extract(testStr, pattern,
Collections.<Integer>emptyList(), " ", new URLDecodeTransformer());
+ assertEquals(GOLD[i], res);
+ }
+
+ pattern = // alternation: either the q value or the digits following start=
Pattern.compile("((?<=(\\?|&)q=)(.*?)(?=(&|$))|(?<=((\\?|&)start=))(\\d+))");
+ res = RegexUtils.extract(line, pattern, Collections.<Integer>emptyList(),
" ", RegexUtils.IDENTITY_TRANSFORMER);
+ assertTrue(res, res.equals("import statement 1")); // both matches, separated by a single space
+
+ pattern = Pattern.compile("(start=1) HTTP");
+ List<Integer> groupsToKeep = new ArrayList<Integer>();
+ groupsToKeep.add(1); // keep only capture group 1, i.e. drop the trailing HTTP part of the match
+ res = RegexUtils.extract(line, pattern, groupsToKeep, " ",
RegexUtils.IDENTITY_TRANSFORMER);
+ assertTrue(res, res.equals("start=1"));
+ }
+}
Modified: mahout/trunk/src/conf/driver.classes.props
URL:
http://svn.apache.org/viewvc/mahout/trunk/src/conf/driver.classes.props?rev=1197992&r1=1197991&r2=1197992&view=diff
==============================================================================
--- mahout/trunk/src/conf/driver.classes.props (original)
+++ mahout/trunk/src/conf/driver.classes.props Sat Nov 5 17:11:57 2011
@@ -7,6 +7,7 @@ org.apache.mahout.utils.vectors.arff.Dri
org.apache.mahout.utils.vectors.RowIdJob = rowid : Map
SequenceFile<Text,VectorWritable> to {SequenceFile<IntWritable,VectorWritable>,
SequenceFile<IntWritable,Text>}
org.apache.mahout.utils.SplitInput = split : Split Input data into test and
train sets
org.apache.mahout.utils.MatrixDumper = matrixdump : Dump matrix in CSV format
+org.apache.mahout.utils.regex.RegexConverterDriver = regexconverter : Convert
text files on a per line basis based on regular expressions
org.apache.mahout.text.SequenceFilesFromDirectory = seqdirectory : Generate
sequence files (of Text) from a directory
org.apache.mahout.vectorizer.SparseVectorsFromSequenceFiles = seq2sparse:
Sparse Vector generation from Text sequence files
org.apache.mahout.vectorizer.EncodedVectorsFromSequenceFiles = seq2encoded:
Encoded Sparse Vector generation from Text sequence files
@@ -34,25 +35,30 @@ org.apache.mahout.clustering.spectral.km
#Freq. Itemset Mining
org.apache.mahout.fpm.pfpgrowth.FPGrowthDriver = fpg : Frequent Pattern Growth
#Classification
+#old bayes
+org.apache.mahout.classifier.bayes.PrepareTwentyNewsgroups =
prepare20newsgroups : Reformat 20 newsgroups data
+org.apache.mahout.classifier.bayes.WikipediaXmlSplitter = wikipediaXMLSplitter
: Reads wikipedia data and creates ch
+org.apache.mahout.classifier.bayes.WikipediaDatasetCreatorDriver =
wikipediaDataSetCreator : Splits data set of wikipedia wrt feature like country
org.apache.mahout.classifier.bayes.TestClassifier = testclassifier : Test the
text based Bayes Classifier
org.apache.mahout.classifier.bayes.TrainClassifier = trainclassifier : Train
the text based Bayes Classifier
-org.apache.mahout.classifier.bayes.PrepareTwentyNewsgroups =
prepare20newsgroups : Reformat 20 newsgroups data
+#new bayes
+org.apache.mahout.classifier.naivebayes.training.TrainNaiveBayesJob = trainnb
: Train the Vector-based Bayes classifier
+org.apache.mahout.classifier.naivebayes.test.TestNaiveBayesDriver = testnb :
Test the Vector-based Bayes classifier
+#SGD
org.apache.mahout.classifier.sgd.TrainLogistic = trainlogistic : Train a
logistic regression using stochastic gradient descent
org.apache.mahout.classifier.sgd.RunLogistic = runlogistic : Run a logistic
regression model against CSV data
org.apache.mahout.classifier.sgd.PrintResourceOrFile = cat : Print a file or
resource as the logistic regression models would see it
org.apache.mahout.classifier.sgd.TrainAdaptiveLogistic = trainAdaptiveLogistic
: Train an AdaptivelogisticRegression model
org.apache.mahout.classifier.sgd.ValidateAdaptiveLogistic =
validateAdaptiveLogistic : Validate an AdaptivelogisticRegression model against
hold-out data set
org.apache.mahout.classifier.sgd.RunAdaptiveLogistic = runAdaptiveLogistic :
Score new production data using a probably trained and validated
AdaptivelogisticRegression model
-org.apache.mahout.classifier.bayes.WikipediaXmlSplitter = wikipediaXMLSplitter
: Reads wikipedia data and creates ch
-org.apache.mahout.classifier.bayes.WikipediaDatasetCreatorDriver =
wikipediaDataSetCreator : Splits data set of wikipedia wrt feature like country
+#HMM
org.apache.mahout.classifier.sequencelearning.hmm.BaumWelchTrainer = baumwelch
: Baum-Welch algorithm for unsupervised HMM training
org.apache.mahout.classifier.sequencelearning.hmm.ViterbiEvaluator = viterbi :
Viterbi decoding of hidden states from given output states sequence
org.apache.mahout.classifier.sequencelearning.hmm.RandomSequenceGenerator =
hmmpredict : Generate random sequence of observations by given HMM
-org.apache.mahout.classifier.naivebayes.training.TrainNaiveBayesJob = trainnb
: Train the Vector-based Bayes classifier
-org.apache.mahout.classifier.naivebayes.test.TestNaiveBayesDriver = testnb :
Test the Vector-based Bayes classifier
+#Classifier Utils
org.apache.mahout.classifier.ConfusionMatrixDumper = cmdump : Dump confusion
matrix in HTML or text formats
-
+#Recommenders
org.apache.mahout.cf.taste.hadoop.als.DatasetSplitter = splitDataset : split a
rating dataset into training and probe parts
org.apache.mahout.cf.taste.hadoop.als.FactorizationEvaluator =
evaluateFactorization : compute RMSE and MAE of a rating matrix factorization
against probes
org.apache.mahout.cf.taste.hadoop.similarity.item.ItemSimilarityJob =
itemsimilarity : Compute the item-item-similarities for item-based
collaborative filtering