http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/test/java/org/apache/mahout/utils/SplitInputTest.java ---------------------------------------------------------------------- diff --git a/integration/src/test/java/org/apache/mahout/utils/SplitInputTest.java b/integration/src/test/java/org/apache/mahout/utils/SplitInputTest.java deleted file mode 100644 index 7ffa690..0000000 --- a/integration/src/test/java/org/apache/mahout/utils/SplitInputTest.java +++ /dev/null @@ -1,418 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.utils; - -import java.io.BufferedWriter; -import java.io.IOException; -import java.io.OutputStreamWriter; -import java.io.Writer; -import java.nio.charset.Charset; - -import com.google.common.io.Closeables; -import org.apache.commons.io.Charsets; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.IntWritable; -import org.apache.hadoop.io.SequenceFile; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.util.ToolRunner; -import org.apache.mahout.classifier.ClassifierData; -import org.apache.mahout.common.MahoutTestCase; -import org.apache.mahout.common.Pair; -import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable; -import org.apache.mahout.common.iterator.sequencefile.SequenceFileValueIterable; -import org.apache.mahout.math.SequentialAccessSparseVector; -import org.apache.mahout.math.Vector; -import org.apache.mahout.math.VectorWritable; -import org.apache.mahout.math.map.OpenObjectIntHashMap; -import org.junit.Before; -import org.junit.Test; - -public final class SplitInputTest extends MahoutTestCase { - - private OpenObjectIntHashMap<String> countMap; - private Charset charset; - private FileSystem fs; - private Path tempInputFile; - private Path tempTrainingDirectory; - private Path tempTestDirectory; - private Path tempMapRedOutputDirectory; - private Path tempInputDirectory; - private Path tempSequenceDirectory; - private SplitInput si; - - @Override - @Before - public void setUp() throws Exception { - Configuration conf = getConfiguration(); - fs = FileSystem.get(conf); - - super.setUp(); - - countMap = new OpenObjectIntHashMap<>(); - - charset = Charsets.UTF_8; - tempSequenceDirectory = getTestTempFilePath("tmpsequence"); - tempInputFile = getTestTempFilePath("bayesinputfile"); - tempTrainingDirectory = getTestTempDirPath("bayestrain"); - tempTestDirectory = getTestTempDirPath("bayestest"); - 
tempMapRedOutputDirectory = new Path(getTestTempDirPath(), "mapRedOutput"); - tempInputDirectory = getTestTempDirPath("bayesinputdir"); - - si = new SplitInput(); - si.setTrainingOutputDirectory(tempTrainingDirectory); - si.setTestOutputDirectory(tempTestDirectory); - si.setInputDirectory(tempInputDirectory); - } - - private void writeMultipleInputFiles() throws IOException { - Writer writer = null; - String currentLabel = null; - try { - for (String[] entry : ClassifierData.DATA) { - if (!entry[0].equals(currentLabel)) { - currentLabel = entry[0]; - Closeables.close(writer, false); - - writer = new BufferedWriter(new OutputStreamWriter(fs.create(new Path(tempInputDirectory, currentLabel)), - Charsets.UTF_8)); - } - countMap.adjustOrPutValue(currentLabel, 1, 1); - writer.write(currentLabel + '\t' + entry[1] + '\n'); - } - }finally { - Closeables.close(writer, false); - } - } - - private void writeSingleInputFile() throws IOException { - Writer writer = new BufferedWriter(new OutputStreamWriter(fs.create(tempInputFile), Charsets.UTF_8)); - try { - for (String[] entry : ClassifierData.DATA) { - writer.write(entry[0] + '\t' + entry[1] + '\n'); - } - } finally { - Closeables.close(writer, true); - } - } - - @Test - public void testSplitDirectory() throws Exception { - - writeMultipleInputFiles(); - - final int testSplitSize = 1; - si.setTestSplitSize(testSplitSize); - si.setCallback(new SplitInput.SplitCallback() { - @Override - public void splitComplete(Path inputFile, int lineCount, int trainCount, int testCount, int testSplitStart) { - int trainingLines = countMap.get(inputFile.getName()) - testSplitSize; - assertSplit(fs, inputFile, charset, testSplitSize, trainingLines, tempTrainingDirectory, tempTestDirectory); - } - }); - - si.splitDirectory(tempInputDirectory); - } - - @Test - public void testSplitFile() throws Exception { - writeSingleInputFile(); - si.setTestSplitSize(2); - si.setCallback(new TestCallback(2, 10)); - si.splitFile(tempInputFile); - } - - @Test 
- public void testSplitFileLocation() throws Exception { - writeSingleInputFile(); - si.setTestSplitSize(2); - si.setSplitLocation(50); - si.setCallback(new TestCallback(2, 10)); - si.splitFile(tempInputFile); - } - - @Test - public void testSplitFilePct() throws Exception { - writeSingleInputFile(); - si.setTestSplitPct(25); - - si.setCallback(new TestCallback(3, 9)); - si.splitFile(tempInputFile); - } - - @Test - public void testSplitFilePctLocation() throws Exception { - writeSingleInputFile(); - si.setTestSplitPct(25); - si.setSplitLocation(50); - si.setCallback(new TestCallback(3, 9)); - si.splitFile(tempInputFile); - } - - @Test - public void testSplitFileRandomSelectionSize() throws Exception { - writeSingleInputFile(); - si.setTestRandomSelectionSize(5); - - si.setCallback(new TestCallback(5, 7)); - si.splitFile(tempInputFile); - } - - @Test - public void testSplitFileRandomSelectionPct() throws Exception { - writeSingleInputFile(); - si.setTestRandomSelectionPct(25); - - si.setCallback(new TestCallback(3, 9)); - si.splitFile(tempInputFile); - } - - /** - * Create a Sequencefile for testing consisting of IntWritable - * keys and VectorWritable values - * @param path path for test SequenceFile - * @param testPoints number of records in test SequenceFile - */ - private void writeVectorSequenceFile(Path path, int testPoints) throws IOException { - Path tempSequenceFile = new Path(path, "part-00000"); - Configuration conf = getConfiguration(); - IntWritable key = new IntWritable(); - VectorWritable value = new VectorWritable(); - try (SequenceFile.Writer writer = - SequenceFile.createWriter(fs, conf, tempSequenceFile, IntWritable.class, VectorWritable.class)) { - for (int i = 0; i < testPoints; i++) { - key.set(i); - Vector v = new SequentialAccessSparseVector(4); - v.assign(i); - value.set(v); - writer.append(key, value); - } - } - } - - /** - * Create a Sequencefile for testing consisting of IntWritable keys and Text values - * @param path path for test 
SequenceFile - * @param testPoints number of records in test SequenceFile - */ - private void writeTextSequenceFile(Path path, int testPoints) throws IOException { - Path tempSequenceFile = new Path(path, "part-00000"); - Configuration conf = getConfiguration(); - Text key = new Text(); - Text value = new Text(); - try (SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, tempSequenceFile, Text.class, Text.class)){ - for (int i = 0; i < testPoints; i++) { - key.set(Integer.toString(i)); - value.set("Line " + i); - writer.append(key, value); - } - } - } - - /** - * Display contents of a SequenceFile - * @param sequenceFilePath path to SequenceFile - */ - private void displaySequenceFile(Path sequenceFilePath) throws IOException { - for (Pair<?,?> record : new SequenceFileIterable<>(sequenceFilePath, true, getConfiguration())) { - System.out.println(record.getFirst() + "\t" + record.getSecond()); - } - } - - /** - * Determine number of records in a SequenceFile - * @param sequenceFilePath path to SequenceFile - * @return number of records - */ - private int getNumberRecords(Path sequenceFilePath) throws IOException { - int numberRecords = 0; - for (Object value : new SequenceFileValueIterable<>(sequenceFilePath, true, getConfiguration())) { - numberRecords++; - } - return numberRecords; - } - - /** - * Test map reduce version of split input with Text, Text key value - * pairs in input - */ - @Test - public void testSplitInputMapReduceText() throws Exception { - writeTextSequenceFile(tempSequenceDirectory, 1000); - testSplitInputMapReduce(1000); - } - - /** Test map reduce version of split input with Text, Text key value pairs in input called from command line */ - @Test - public void testSplitInputMapReduceTextCli() throws Exception { - writeTextSequenceFile(tempSequenceDirectory, 1000); - testSplitInputMapReduceCli(1000); - } - - /** - * Test map reduce version of split input with IntWritable, Vector key value - * pairs in input - */ - @Test - public 
void testSplitInputMapReduceVector() throws Exception { - writeVectorSequenceFile(tempSequenceDirectory, 1000); - testSplitInputMapReduce(1000); - } - - /** - * Test map reduce version of split input with IntWritable, Vector key value - * pairs in input called from command line - */ - @Test - public void testSplitInputMapReduceVectorCli() throws Exception { - writeVectorSequenceFile(tempSequenceDirectory, 1000); - testSplitInputMapReduceCli(1000); - } - - /** - * Test map reduce version of split input through CLI - */ - private void testSplitInputMapReduceCli(int numPoints) throws Exception { - int randomSelectionPct = 25; - int keepPct = 10; - String[] args = - { "--method", "mapreduce", "--input", tempSequenceDirectory.toString(), - "--mapRedOutputDir", tempMapRedOutputDirectory.toString(), - "--randomSelectionPct", Integer.toString(randomSelectionPct), - "--keepPct", Integer.toString(keepPct), "-ow" }; - ToolRunner.run(getConfiguration(), new SplitInput(), args); - validateSplitInputMapReduce(numPoints, randomSelectionPct, keepPct); - } - - /** - * Test map reduce version of split input through method call - */ - private void testSplitInputMapReduce(int numPoints) throws Exception { - int randomSelectionPct = 25; - si.setTestRandomSelectionPct(randomSelectionPct); - int keepPct = 10; - si.setKeepPct(keepPct); - si.setMapRedOutputDirectory(tempMapRedOutputDirectory); - si.setUseMapRed(true); - si.splitDirectory(getConfiguration(), tempSequenceDirectory); - - validateSplitInputMapReduce(numPoints, randomSelectionPct, keepPct); - } - - /** - * Validate that number of test records and number of training records - * are consistant with keepPct and randomSelectionPct - */ - private void validateSplitInputMapReduce(int numPoints, int randomSelectionPct, int keepPct) throws IOException { - Path testPath = new Path(tempMapRedOutputDirectory, "test-r-00000"); - Path trainingPath = new Path(tempMapRedOutputDirectory, "training-r-00000"); - int numberTestRecords = 
getNumberRecords(testPath); - int numberTrainingRecords = getNumberRecords(trainingPath); - System.out.printf("Test data: %d records\n", numberTestRecords); - displaySequenceFile(testPath); - System.out.printf("Training data: %d records\n", numberTrainingRecords); - displaySequenceFile(trainingPath); - assertEquals((randomSelectionPct / 100.0) * (keepPct / 100.0) * numPoints, - numberTestRecords, 2); - assertEquals( - (1 - randomSelectionPct / 100.0) * (keepPct / 100.0) * numPoints, - numberTrainingRecords, 2); - } - - @Test - public void testValidate() throws Exception { - SplitInput st = new SplitInput(); - assertValidateException(st); - - st.setTestSplitSize(100); - assertValidateException(st); - - st.setTestOutputDirectory(tempTestDirectory); - assertValidateException(st); - - st.setTrainingOutputDirectory(tempTrainingDirectory); - st.validate(); - - st.setTestSplitPct(50); - assertValidateException(st); - - st = new SplitInput(); - st.setTestRandomSelectionPct(50); - st.setTestOutputDirectory(tempTestDirectory); - st.setTrainingOutputDirectory(tempTrainingDirectory); - st.validate(); - - st.setTestSplitPct(50); - assertValidateException(st); - - st = new SplitInput(); - st.setTestRandomSelectionPct(50); - st.setTestOutputDirectory(tempTestDirectory); - st.setTrainingOutputDirectory(tempTrainingDirectory); - st.validate(); - - st.setTestSplitSize(100); - assertValidateException(st); - } - - private class TestCallback implements SplitInput.SplitCallback { - private final int testSplitSize; - private final int trainingLines; - - private TestCallback(int testSplitSize, int trainingLines) { - this.testSplitSize = testSplitSize; - this.trainingLines = trainingLines; - } - - @Override - public void splitComplete(Path inputFile, int lineCount, int trainCount, int testCount, int testSplitStart) { - assertSplit(fs, tempInputFile, charset, testSplitSize, trainingLines, tempTrainingDirectory, tempTestDirectory); - } - } - - private static void 
assertValidateException(SplitInput st) throws IOException { - try { - st.validate(); - fail("Expected IllegalArgumentException"); - } catch (IllegalArgumentException iae) { - // good - } - } - - private static void assertSplit(FileSystem fs, - Path tempInputFile, - Charset charset, - int testSplitSize, - int trainingLines, - Path tempTrainingDirectory, - Path tempTestDirectory) { - - try { - Path testFile = new Path(tempTestDirectory, tempInputFile.getName()); - //assertTrue("test file exists", testFile.isFile()); - assertEquals("test line count", testSplitSize, SplitInput.countLines(fs, testFile, charset)); - - Path trainingFile = new Path(tempTrainingDirectory, tempInputFile.getName()); - //assertTrue("training file exists", trainingFile.isFile()); - assertEquals("training line count", trainingLines, SplitInput.countLines(fs, trainingFile, charset)); - } catch (IOException ioe) { - fail(ioe.toString()); - } - } -}
http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/test/java/org/apache/mahout/utils/email/MailProcessorTest.java ---------------------------------------------------------------------- diff --git a/integration/src/test/java/org/apache/mahout/utils/email/MailProcessorTest.java b/integration/src/test/java/org/apache/mahout/utils/email/MailProcessorTest.java deleted file mode 100644 index c519f85..0000000 --- a/integration/src/test/java/org/apache/mahout/utils/email/MailProcessorTest.java +++ /dev/null @@ -1,72 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.utils.email; - -import java.io.File; -import java.io.StringWriter; -import java.net.URL; -import java.util.regex.Pattern; - -import org.apache.commons.io.Charsets; -import org.apache.mahout.common.MahoutTestCase; -import org.junit.Test; - -public final class MailProcessorTest extends MahoutTestCase { - - @Test - public void testLabel() throws Exception { - StringWriter writer = new StringWriter(); - MailOptions options = new MailOptions(); - options.setSeparator(":::"); - options.setCharset(Charsets.UTF_8); - options.setPatternsToMatch(new Pattern[]{ - MailProcessor.FROM_PREFIX, MailProcessor.SUBJECT_PREFIX, MailProcessor.TO_PREFIX}); - options.setInput(new File(System.getProperty("user.dir"))); - MailProcessor proc = new MailProcessor(options, "", writer); - URL url = MailProcessorTest.class.getClassLoader().getResource("test.mbox"); - File file = new File(url.toURI()); - long count = proc.parseMboxLineByLine(file); - assertEquals(7, count); - } - - @Test - public void testStripQuoted() throws Exception { - StringWriter writer = new StringWriter(); - MailOptions options = new MailOptions(); - options.setSeparator(":::"); - options.setCharset(Charsets.UTF_8); - options.setPatternsToMatch(new Pattern[]{ - MailProcessor.SUBJECT_PREFIX}); - options.setInput(new File(System.getProperty("user.dir"))); - options.setIncludeBody(true); - MailProcessor proc = new MailProcessor(options, "", writer); - URL url = MailProcessorTest.class.getClassLoader().getResource("test.mbox"); - File file = new File(url.toURI()); - long count = proc.parseMboxLineByLine(file); - assertEquals(7, count); - assertTrue(writer.getBuffer().toString().contains("> Cocoon Cron Block Configurable Clustering")); - writer = new StringWriter(); - proc = new MailProcessor(options, "", writer); - options.setStripQuotedText(true); - count = proc.parseMboxLineByLine(file); - assertEquals(7, count); - assertFalse(writer.getBuffer().toString().contains("> Cocoon Cron Block 
Configurable Clustering")); - - } - -} http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilterTest.java ---------------------------------------------------------------------- diff --git a/integration/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilterTest.java b/integration/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilterTest.java deleted file mode 100644 index 4fdbbbc..0000000 --- a/integration/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilterTest.java +++ /dev/null @@ -1,154 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -package org.apache.mahout.utils.nlp.collocations.llr; - -import java.io.IOException; -import java.io.Reader; -import java.io.StringReader; -import java.nio.ByteBuffer; -import java.nio.CharBuffer; -import java.nio.charset.CharsetEncoder; - -import org.apache.commons.io.Charsets; -import org.apache.hadoop.util.bloom.BloomFilter; -import org.apache.hadoop.util.bloom.Filter; -import org.apache.hadoop.util.bloom.Key; -import org.apache.hadoop.util.hash.Hash; -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.core.WhitespaceAnalyzer; -import org.apache.lucene.analysis.shingle.ShingleFilter; -import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; -import org.apache.mahout.common.MahoutTestCase; -import org.junit.Test; - -public final class BloomTokenFilterTest extends MahoutTestCase { - - private static final CharsetEncoder encoder = Charsets.UTF_8.newEncoder(); - - private static final String input = "The best of times the worst of times"; - private static final String[] allTokens = { - "The", "best", "of", "times", "the", "worst", "of", "times" - }; - private static final String[] expectedNonKeepTokens = { "best", "times", "the", "worst", "times" }; - private static final String[] expectedKeepTokens = { "The", "of", "of" }; - private static final String[] filterTokens = { "The", "of" }; - private static final String[] notFilterTokens = { "best", "worst", "the", "times"}; - private static final String[] shingleKeepTokens = { - "The best", "best of times", "the worst", "worst of times", "of times" - }; - private static final String[] expectedShingleTokens = { - "The best", "best of times", "of times", "the worst", "worst of times", "of times" - }; - - /** test standalone filter without tokenfilter wrapping */ - @Test - public void testFilter() throws IOException { - Filter filter = getFilter(filterTokens); - Key k = new Key(); - for (String s: filterTokens) { - setKey(k,s); 
- assertTrue("Key for string " + s + " should be filter member", filter.membershipTest(k)); - } - - for (String s: notFilterTokens) { - setKey(k,s); - assertFalse("Key for string " + s + " should not be filter member", filter.membershipTest(k)); - } - } - - /** normal case, unfiltered analyzer */ - @Test - public void testAnalyzer() throws IOException { - Reader reader = new StringReader(input); - Analyzer analyzer = new WhitespaceAnalyzer(); - TokenStream ts = analyzer.tokenStream(null, reader); - ts.reset(); - validateTokens(allTokens, ts); - ts.end(); - ts.close(); - } - - /** filtered analyzer */ - @Test - public void testNonKeepdAnalyzer() throws IOException { - Reader reader = new StringReader(input); - Analyzer analyzer = new WhitespaceAnalyzer(); - TokenStream ts = analyzer.tokenStream(null, reader); - ts.reset(); - TokenStream f = new BloomTokenFilter(getFilter(filterTokens), false /* toss matching tokens */, ts); - validateTokens(expectedNonKeepTokens, f); - ts.end(); - ts.close(); - } - - /** keep analyzer */ - @Test - public void testKeepAnalyzer() throws IOException { - Reader reader = new StringReader(input); - Analyzer analyzer = new WhitespaceAnalyzer(); - TokenStream ts = analyzer.tokenStream(null, reader); - ts.reset(); - TokenStream f = new BloomTokenFilter(getFilter(filterTokens), true /* keep matching tokens */, ts); - validateTokens(expectedKeepTokens, f); - ts.end(); - ts.close(); - } - - /** shingles, keep those matching whitelist */ - @Test - public void testShingleFilteredAnalyzer() throws IOException { - Reader reader = new StringReader(input); - Analyzer analyzer = new WhitespaceAnalyzer(); - TokenStream ts = analyzer.tokenStream(null, reader); - ts.reset(); - ShingleFilter sf = new ShingleFilter(ts, 3); - TokenStream f = new BloomTokenFilter(getFilter(shingleKeepTokens), true, sf); - validateTokens(expectedShingleTokens, f); - ts.end(); - ts.close(); - } - - private static void setKey(Key k, String s) throws IOException { - ByteBuffer 
buffer = encoder.encode(CharBuffer.wrap(s.toCharArray())); - k.set(buffer.array(), 1.0); - } - - private static void validateTokens(String[] expected, TokenStream ts) throws IOException { - int pos = 0; - while (ts.incrementToken()) { - assertTrue("Analyzer produced too many tokens", pos <= expected.length); - CharTermAttribute termAttr = ts.getAttribute(CharTermAttribute.class); - assertEquals("Unexpected term", expected[pos++], termAttr.toString()); - } - assertEquals("Analyzer produced too few terms", expected.length, pos); - } - - private static Filter getFilter(String[] tokens) throws IOException { - Filter filter = new BloomFilter(100,50, Hash.JENKINS_HASH); - Key k = new Key(); - for (String s: tokens) { - setKey(k,s); - filter.add(k); - } - return filter; - } - -} http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/test/java/org/apache/mahout/utils/regex/RegexMapperTest.java ---------------------------------------------------------------------- diff --git a/integration/src/test/java/org/apache/mahout/utils/regex/RegexMapperTest.java b/integration/src/test/java/org/apache/mahout/utils/regex/RegexMapperTest.java deleted file mode 100644 index 8ab643b..0000000 --- a/integration/src/test/java/org/apache/mahout/utils/regex/RegexMapperTest.java +++ /dev/null @@ -1,104 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.utils.regex; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.io.LongWritable; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.mapreduce.Mapper; -import org.apache.mahout.common.DummyRecordWriter; -import org.apache.mahout.common.MahoutTestCase; -import org.junit.Test; - -import java.util.List; - -public final class RegexMapperTest extends MahoutTestCase { - - @Test - public void testRegex() throws Exception { - RegexMapper mapper = new RegexMapper(); - Configuration conf = getConfiguration(); - conf.set(RegexMapper.REGEX, "(?<=(\\?|&)q=).*?(?=&|$)"); - conf.set(RegexMapper.TRANSFORMER_CLASS, URLDecodeTransformer.class.getName()); - DummyRecordWriter<LongWritable, Text> mapWriter = new DummyRecordWriter<>(); - Mapper<LongWritable, Text, LongWritable, Text>.Context mapContext = DummyRecordWriter - .build(mapper, conf, mapWriter); - - mapper.setup(mapContext); - for (int i = 0; i < RegexUtilsTest.TEST_STRS.length; i++) { - String testStr = RegexUtilsTest.TEST_STRS[i]; - - LongWritable key = new LongWritable(i); - mapper.map(key, new Text(testStr), mapContext); - List<Text> value = mapWriter.getValue(key); - if (!RegexUtilsTest.GOLD[i].isEmpty()) { - assertEquals(1, value.size()); - assertEquals(RegexUtilsTest.GOLD[i], value.get(0).toString()); - } - } - } - - @Test - public void testGroups() throws Exception { - RegexMapper mapper = new RegexMapper(); - Configuration conf = getConfiguration(); - conf.set(RegexMapper.REGEX, "(\\d+)\\.(\\d+)\\.(\\d+)"); - 
conf.set(RegexMapper.TRANSFORMER_CLASS, URLDecodeTransformer.class.getName()); - conf.setStrings(RegexMapper.GROUP_MATCHERS, "1", "3"); - DummyRecordWriter<LongWritable, Text> mapWriter = new DummyRecordWriter<>(); - Mapper<LongWritable, Text, LongWritable, Text>.Context mapContext = DummyRecordWriter - .build(mapper, conf, mapWriter); - - mapper.setup(mapContext); - for (int i = 0; i < RegexUtilsTest.TEST_STRS.length; i++) { - String testStr = RegexUtilsTest.TEST_STRS[i]; - - LongWritable key = new LongWritable(i); - mapper.map(key, new Text(testStr), mapContext); - List<Text> value = mapWriter.getValue(key); - assertEquals(1, value.size()); - assertEquals("127 0", value.get(0).toString()); - } - } - - @Test - public void testFPGFormatter() throws Exception { - RegexMapper mapper = new RegexMapper(); - Configuration conf = getConfiguration(); - conf.set(RegexMapper.REGEX, "(?<=(\\?|&)q=).*?(?=&|$)"); - conf.set(RegexMapper.TRANSFORMER_CLASS, URLDecodeTransformer.class.getName()); - conf.set(RegexMapper.FORMATTER_CLASS, FPGFormatter.class.getName()); - DummyRecordWriter<LongWritable, Text> mapWriter = new DummyRecordWriter<>(); - Mapper<LongWritable, Text, LongWritable, Text>.Context mapContext = DummyRecordWriter - .build(mapper, conf, mapWriter); - - mapper.setup(mapContext); - RegexFormatter formatter = new FPGFormatter(); - for (int i = 0; i < RegexUtilsTest.TEST_STRS.length; i++) { - String testStr = RegexUtilsTest.TEST_STRS[i]; - - LongWritable key = new LongWritable(i); - mapper.map(key, new Text(testStr), mapContext); - List<Text> value = mapWriter.getValue(key); - if (!RegexUtilsTest.GOLD[i].isEmpty()) { - assertEquals(1, value.size()); - assertEquals(formatter.format(RegexUtilsTest.GOLD[i]), value.get(0).toString()); - } - } - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/test/java/org/apache/mahout/utils/regex/RegexUtilsTest.java ---------------------------------------------------------------------- diff --git 
a/integration/src/test/java/org/apache/mahout/utils/regex/RegexUtilsTest.java b/integration/src/test/java/org/apache/mahout/utils/regex/RegexUtilsTest.java deleted file mode 100644 index 8ae10a5..0000000 --- a/integration/src/test/java/org/apache/mahout/utils/regex/RegexUtilsTest.java +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.utils.regex; - -import java.util.ArrayList; -import java.util.Collection; -import java.util.Collections; -import java.util.regex.Pattern; - -import org.apache.mahout.common.MahoutTestCase; -import org.junit.Test; - -public final class RegexUtilsTest extends MahoutTestCase { - - static final String[] TEST_STRS = { - "127.0.0.1 - - [01/10/2011:00:01:51 +0000] \"GET /solr/collection1/browse?q=foo&rows=10&wt=json&hl=true&hl.fl=body&hl.fl=content", - "127.0.0.1 - - [01/10/2011:00:20:58 +0000] \"GET /solr/collection1/browse?q=Using+Solr+Search+RDBMS&fq=%7B%21tag%3Dsource%7D%28%28source%3Alucid+AND+lucid_facet%3A%28site%29%29%29&rows=10", - "127.0.0.1 - - [01/10/2011:00:21:21 +0000] \"GET /solr/collection1/browse?q=language+detection&start=560&rows=10 HTTP/1.1\" 200 45071", - "127.0.0.1 - - [01/10/2011:00:21:21 +0000] \"GET /solr/collection1/browse?q=&start=560&rows=10 HTTP/1.1\" 200 45071" - }; - static final String[] GOLD = {"foo", "Using Solr Search RDBMS", "language detection", ""}; - - @Test - public void testExtract() throws Exception { - Pattern pattern = Pattern.compile("(?<=(\\?|&)q=).*?(?=&|$)"); - String line = "127.0.0.1 - - [24/05/2010:01:19:22 +0000] \"GET /solr/select?q=import statement&start=1 HTTP/1.1\" 200 37571"; - String res = RegexUtils.extract(line, pattern, Collections.<Integer>emptyList(), " ", RegexUtils.IDENTITY_TRANSFORMER); - assertEquals(res, "import statement", res); - - for (int i = 0; i < TEST_STRS.length; i++) { - String testStr = TEST_STRS[i]; - res = RegexUtils.extract(testStr, pattern, Collections.<Integer>emptyList(), " ", new URLDecodeTransformer()); - assertEquals(GOLD[i], res); - } - - pattern = Pattern.compile("((?<=(\\?|&)q=)(.*?)(?=(&|$))|(?<=((\\?|&)start=))(\\d+))"); - res = RegexUtils.extract(line, pattern, Collections.<Integer>emptyList(), " ", RegexUtils.IDENTITY_TRANSFORMER); - assertEquals(res, "import statement 1", res); - - pattern = Pattern.compile("(start=1) HTTP"); - 
Collection<Integer> groupsToKeep = new ArrayList<>(); - groupsToKeep.add(1); - res = RegexUtils.extract(line, pattern, groupsToKeep, " ", RegexUtils.IDENTITY_TRANSFORMER); - assertEquals(res, "start=1", res); - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/test/java/org/apache/mahout/utils/vectors/RandomVectorIterable.java ---------------------------------------------------------------------- diff --git a/integration/src/test/java/org/apache/mahout/utils/vectors/RandomVectorIterable.java b/integration/src/test/java/org/apache/mahout/utils/vectors/RandomVectorIterable.java deleted file mode 100644 index 2ddce14..0000000 --- a/integration/src/test/java/org/apache/mahout/utils/vectors/RandomVectorIterable.java +++ /dev/null @@ -1,73 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */

package org.apache.mahout.utils.vectors;

import java.util.Iterator;
import java.util.Random;

import com.google.common.base.Function;
import com.google.common.collect.Iterators;
import org.apache.mahout.common.RandomUtils;
import org.apache.mahout.common.iterator.CountingIterator;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.function.DoubleFunction;

/**
 * Test helper that produces vectors filled from a random number generator.
 * Iteration is driven by a {@link CountingIterator} built with {@code numItems};
 * every produced vector is created with size {@code numItems} as well.
 */
public final class RandomVectorIterable implements Iterable<Vector> {

  /** Backing implementation of the generated vectors. */
  public enum VectorType {DENSE, SPARSE}

  private final int numItems;
  private final VectorType type;

  /** Equivalent to {@code RandomVectorIterable(100, VectorType.SPARSE)}. */
  public RandomVectorIterable() {
    this(100, VectorType.SPARSE);
  }

  /** Equivalent to {@code RandomVectorIterable(numItems, VectorType.SPARSE)}. */
  public RandomVectorIterable(int numItems) {
    this(numItems, VectorType.SPARSE);
  }

  /**
   * @param numItems value handed to the counting iterator and used as the size of each vector
   * @param type {@link VectorType#SPARSE} yields {@link RandomAccessSparseVector};
   *             anything else yields {@link DenseVector}
   */
  public RandomVectorIterable(int numItems, VectorType type) {
    this.numItems = numItems;
    this.type = type;
  }

  @Override
  public Iterator<Vector> iterator() {
    Iterator<Integer> counter = new CountingIterator(numItems);
    // A fresh generator function (and so a fresh Random) is created per iterator.
    return Iterators.transform(counter, new RandomVectorFunction());
  }

  /** Maps each counter value (ignored) to a newly created, randomly assigned vector. */
  private final class RandomVectorFunction implements Function<Integer, Vector> {

    private final Random random = RandomUtils.getRandom();

    /** Ignores its argument and returns the next double from {@link #random}. */
    private final DoubleFunction randomFill = new DoubleFunction() {
      @Override
      public double apply(double ignored) {
        return random.nextDouble();
      }
    };

    @Override
    public Vector apply(Integer dummy) {
      Vector result;
      if (type == VectorType.SPARSE) {
        result = new RandomAccessSparseVector(numItems);
      } else {
        result = new DenseVector(numItems);
      }
      result.assign(randomFill);
      return result;
    }
  }

}

http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/test/java/org/apache/mahout/utils/vectors/VectorHelperTest.java ---------------------------------------------------------------------- diff --git a/integration/src/test/java/org/apache/mahout/utils/vectors/VectorHelperTest.java b/integration/src/test/java/org/apache/mahout/utils/vectors/VectorHelperTest.java deleted file mode 100644 index c55fd8d..0000000 --- a/integration/src/test/java/org/apache/mahout/utils/vectors/VectorHelperTest.java +++ /dev/null @@ -1,140 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License.
- */

package org.apache.mahout.utils.vectors;

import java.util.Random;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.mahout.common.MahoutTestCase;
import org.apache.mahout.common.RandomUtils;
import org.apache.mahout.math.SequentialAccessSparseVector;
import org.apache.mahout.math.Vector;
import org.junit.Before;
import org.junit.Test;

/**
 * Tests for {@code VectorHelper}: JSON rendering, top-entry selection and
 * term-dictionary loading.
 */
public final class VectorHelperTest extends MahoutTestCase {

  private static final int NUM_DOCS = 100;

  private Path inputPathOne;
  private Path inputPathTwo;

  private Configuration conf;

  /** Writes the two Text/IntWritable sequence files read by {@link #testLoadTermDictionary()}. */
  @Override
  @Before
  public void setUp() throws Exception {
    super.setUp();
    conf = getConfiguration();

    // Make all indices higher than dictionary size
    inputPathOne = getTestTempFilePath("documents/docs-one.file");
    writeDictionaryFile(inputPathOne, NUM_DOCS);

    // Keep indices within number of documents
    inputPathTwo = getTestTempFilePath("documents/docs-two.file");
    writeDictionaryFile(inputPathTwo, 0);
  }

  /**
   * Appends {@link #NUM_DOCS} entries to a new sequence file at {@code target}; each value is
   * {@code indexOffset} plus a random int in {@code [0, NUM_DOCS)}.
   */
  private void writeDictionaryFile(Path target, int indexOffset) throws Exception {
    FileSystem fileSystem = FileSystem.get(target.toUri(), conf);
    try (SequenceFile.Writer out =
             new SequenceFile.Writer(fileSystem, conf, target, Text.class, IntWritable.class)) {
      Random random = RandomUtils.getRandom();
      for (int doc = 0; doc < NUM_DOCS; doc++) {
        out.append(new Text("Document::ID::" + doc), new IntWritable(indexOffset + random.nextInt(NUM_DOCS)));
      }
    }
  }

  /** Builds a size-10 sequential-access sparse vector, applying the sets in array order. */
  private static Vector sparseVector(int[] indices, double[] values) {
    Vector vector = new SequentialAccessSparseVector(10);
    for (int k = 0; k < indices.length; k++) {
      vector.set(indices[k], values[k]);
    }
    return vector;
  }

  @Test
  public void testJsonFormatting() throws Exception {
    Vector vector = sparseVector(
        new int[] {2, 4, 6, 7, 9},
        new double[] {3.1, 1.0, 8.1, -100, 12.2});
    String unused = "UNUSED";
    String[] dictionary = {
        unused, unused, "two", unused, "four", unused, "six", "seven", unused, "nine"
    };

    String sortedMessage = "sorted json form incorrect: ";
    String unsortedMessage = "unsorted form incorrect: ";

    assertEquals(sortedMessage, "{nine:12.2,six:8.1,two:3.1}",
        VectorHelper.vectorToJson(vector, dictionary, 3, true));
    assertEquals(unsortedMessage, "{two:3.1,four:1.0}",
        VectorHelper.vectorToJson(vector, dictionary, 2, false));
    assertEquals(sortedMessage, "{nine:12.2,six:8.1,two:3.1,four:1.0}",
        VectorHelper.vectorToJson(vector, dictionary, 4, true));
    assertEquals(sortedMessage, "{nine:12.2,six:8.1,two:3.1,four:1.0,seven:-100.0}",
        VectorHelper.vectorToJson(vector, dictionary, 5, true));
    assertEquals(sortedMessage, "{nine:12.2,six:8.1}",
        VectorHelper.vectorToJson(vector, dictionary, 2, true));
    assertEquals(unsortedMessage, "{two:3.1,four:1.0}",
        VectorHelper.vectorToJson(vector, dictionary, 2, false));
  }

  @Test
  public void testTopEntries() throws Exception {
    Vector vector = sparseVector(
        new int[] {2, 4, 6, 7, 9, 1, 3, 8},
        new double[] {3.1, 1.0, 8.1, -100, 12.2, 0.0, 0.0, 2.7});
    // number of non-zero elements == maxEntries
    assertEquals(6, VectorHelper.topEntries(vector, 6).size());
    // number of non-zero elements < maxEntries
    assertTrue(VectorHelper.topEntries(vector, 9).size() < 9);
    // number of non-zero elements > maxEntries
    assertTrue(VectorHelper.topEntries(vector, 5).size() < vector.getNumNonZeroElements());
  }

  @Test
  public void testTopEntriesWhenAllZeros() throws Exception {
    // Same index order as testTopEntries, every value explicitly set to zero.
    Vector vector = sparseVector(new int[] {2, 4, 6, 7, 9, 1, 3, 8}, new double[8]);
    assertEquals(0, VectorHelper.topEntries(vector, 6).size());
  }

  /** Only checks that loading completes without throwing; nothing is asserted on the result. */
  @Test
  public void testLoadTermDictionary() throws Exception {
    // With indices higher than dictionary size
    VectorHelper.loadTermDictionary(conf, inputPathOne.toString());
    // With dictionary size higher than indices
    VectorHelper.loadTermDictionary(conf, inputPathTwo.toString());
  }
}

http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFTypeTest.java ---------------------------------------------------------------------- diff --git a/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFTypeTest.java b/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFTypeTest.java deleted file mode 100644 index 2ea8b89..0000000 --- a/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFTypeTest.java +++ /dev/null @@ -1,35 +0,0 @@ -/* - * Copyright 2013 The Apache Software Foundation. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License.
- */

package org.apache.mahout.utils.vectors.arff;

import org.apache.mahout.common.MahoutTestCase;
import org.junit.Test;

/**
 * Checks {@code ARFFType.removeQuotes} on null, empty, quoted and padded input.
 */
public final class ARFFTypeTest extends MahoutTestCase {

  @Test
  public void removeQuotes() {
    assertNull(ARFFType.removeQuotes(null));

    // Each row is {input, expected result}; rows are checked in order.
    String[][] cases = {
        {"\"\"", ""},
        {"''", ""},
        {"", ""},
        {" ", ""},
        {"'single'", "single"},
        {"\"double\"", "double"},
        {" trim ", "trim"},
    };
    for (String[] inputAndExpected : cases) {
      assertEquals(inputAndExpected[1], ARFFType.removeQuotes(inputAndExpected[0]));
    }
  }
}

http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterableTest.java ---------------------------------------------------------------------- diff --git a/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterableTest.java b/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterableTest.java deleted file mode 100644 index 4c7f17a..0000000 --- a/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterableTest.java +++ /dev/null @@ -1,289 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and - * limitations under the License. - */

package org.apache.mahout.utils.vectors.arff;

import java.io.IOException;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Iterator;
import java.util.Locale;
import java.util.Map;

import com.google.common.io.Resources;
import org.apache.commons.io.Charsets;
import org.apache.mahout.common.MahoutTestCase;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.Vector;
import org.junit.Test;

/**
 * Tests {@code ARFFVectorIterable} against ARFF documents loaded from classpath resources
 * and, in {@link #testNumerics()}, from an inline string.
 */
public final class ARFFVectorIterableTest extends MahoutTestCase {

  @Test
  public void testValues() throws Exception {
    ARFFVectorIterable iterable = readModelFromResource("sample.arff");

    assertEquals("Mahout", iterable.getModel().getRelation());
    Map<String, Integer> bindings = iterable.getModel().getLabelBindings();
    assertNotNull(bindings);
    assertEquals(5, bindings.size());

    // The test expects two dense rows followed by one sparse row, then nothing more.
    Iterator<Vector> iter = iterable.iterator();
    assertNextVector(iter, DenseVector.class, 1.0, 2.0);
    assertNextVector(iter, DenseVector.class, 2.0, 3.0);
    assertNextVector(iter, RandomAccessSparseVector.class, 5.0, 23.0);

    assertFalse(iter.hasNext());
  }

  @Test
  public void testDense() throws Exception {
    Iterable<Vector> iterable = readModelFromResource("sample-dense.arff");
    Vector firstVector = iterable.iterator().next();
    assertEquals(1.0, firstVector.get(0), 0);
    assertEquals(65.0, firstVector.get(1), 0);
    assertEquals(1.0, firstVector.get(3), 0);
    assertEquals(1.0, firstVector.get(4), 0);

    int count = 0;
    for (Vector vector : iterable) {
      assertTrue("Vector is not dense", vector instanceof DenseVector);
      count++;
    }
    assertEquals(5, count);
  }

  @Test
  public void testSparse() throws Exception {
    Iterable<Vector> iterable = readModelFromResource("sample-sparse.arff");

    Vector firstVector = iterable.iterator().next();
    assertEquals(23.1, firstVector.get(1), 0);
    assertEquals(3.23, firstVector.get(2), 0);
    assertEquals(1.2, firstVector.get(3), 0);

    assertEquals(9, countSparseVectors(iterable));
  }

  @Test
  public void testNonNumeric() throws Exception {
    MapBackedARFFModel model = new MapBackedARFFModel();
    ARFFVectorIterable iterable = getVectors("non-numeric-1.arff", model);
    int count = countSparseVectors(iterable);

    // Parse the same resource a second time into the same model before checking values.
    iterable = getVectors("non-numeric-1.arff", model);
    Iterator<Vector> iter = iterable.iterator();
    Vector firstVector = iter.next();

    assertEquals(1.0, firstVector.get(2), 0);

    assertEquals(10, count);
    assertNonNumericOneModel(iterable.getModel().getNominalMap(), model);
  }

  @Test
  public void testDate() throws Exception {
    ARFFVectorIterable iterable = readModelFromResource("date.arff");
    Iterator<Vector> iter = iterable.iterator();
    Vector firstVector = iter.next();

    // Each column is compared with the epoch millis obtained by parsing the literal below
    // with the given SimpleDateFormat pattern.
    DateFormat format = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss", Locale.ENGLISH);
    Date date = format.parse("2001-07-04T12:08:56");
    long result = date.getTime();
    assertEquals(result, firstVector.get(1), 0);

    format = new SimpleDateFormat("yyyy.MM.dd G 'at' HH:mm:ss z", Locale.ENGLISH);
    date = format.parse("2001.07.04 AD at 12:08:56 PDT");
    result = date.getTime();
    assertEquals(result, firstVector.get(2), 0);

    format = new SimpleDateFormat("EEE, MMM d, ''yy", Locale.ENGLISH);
    date = format.parse("Wed, Jul 4, '01,4 0:08 PM, PDT");
    result = date.getTime();
    assertEquals(result, firstVector.get(3), 0);

    format = new SimpleDateFormat("K:mm a, z", Locale.ENGLISH);
    date = format.parse("0:08 PM, PDT");
    result = date.getTime();
    assertEquals(result, firstVector.get(4), 0);

    format = new SimpleDateFormat("yyyyy.MMMMM.dd GGG hh:mm aaa", Locale.ENGLISH);
    date = format.parse("02001.July.04 AD 12:08 PM");
    result = date.getTime();
    assertEquals(result, firstVector.get(5), 0);

    format = new SimpleDateFormat("EEE, d MMM yyyy HH:mm:ss Z", Locale.ENGLISH);
    date = format.parse("Wed, 4 Jul 2001 12:08:56 -0700");
    result = date.getTime();
    assertEquals(result, firstVector.get(6), 0);

  }

  @Test
  public void testMultipleNoms() throws Exception {
    MapBackedARFFModel model = new MapBackedARFFModel();
    ARFFVectorIterable iterable = getVectors("non-numeric-1.arff", model);
    assertEquals(10, countSparseVectors(iterable));
    assertNonNumericOneModel(iterable.getModel().getNominalMap(), model);

    // Feed a second document into the same model: its nominals are added to the first one's.
    iterable = getVectors("non-numeric-2.arff", model);
    countSparseVectors(iterable);
    Map<String, Map<String, Integer>> nominalMap = model.getNominalMap();
    assertNotNull(nominalMap);
    assertEquals(2, nominalMap.size());
    Map<String, Integer> noms = nominalMap.get("test");
    assertNotNull("nominals for test are null", noms);
    assertEquals(2, noms.size());
  }

  @Test
  public void testNumerics() throws Exception {
    String arff = "@RELATION numerics\n"
      + "@ATTRIBUTE theNumeric NUMERIC\n"
      + "@ATTRIBUTE theInteger INTEGER\n"
      + "@ATTRIBUTE theReal REAL\n"
      + "@DATA\n"
      + "1.0,2,3.0";
    ARFFModel model = new MapBackedARFFModel();
    ARFFVectorIterable iterable = new ARFFVectorIterable(arff, model);
    model = iterable.getModel();
    assertNotNull(model);
    assertEquals(3, model.getLabelSize());
    assertEquals(ARFFType.NUMERIC, model.getARFFType(0));
    assertEquals(ARFFType.INTEGER, model.getARFFType(1));
    assertEquals(ARFFType.REAL, model.getARFFType(2));
    Iterator<Vector> it = iterable.iterator();
    Vector vector = it.next();
    assertEquals(1.0, vector.get(0), EPSILON);
    assertEquals(2.0, vector.get(1), EPSILON);
    assertEquals(3.0, vector.get(2), EPSILON);
  }

  @Test
  public void testQuotes() throws Exception {
    // ARFF allows quotes on identifiers
    ARFFModel model = new MapBackedARFFModel();
    ARFFVectorIterable iterable = getVectors("quoted-id.arff", model);
    model = iterable.getModel();
    assertNotNull(model);
    assertEquals("quotes", model.getRelation());

    // check attribute labels
    assertEquals(4, model.getLabelSize());
    assertEquals(ARFFType.NUMERIC, model.getARFFType(0));
    assertEquals(ARFFType.INTEGER, model.getARFFType(1));
    assertEquals(ARFFType.REAL, model.getARFFType(2));
    assertEquals(ARFFType.NOMINAL, model.getARFFType(3));

    Map<String, Integer> labelBindings = model.getLabelBindings();
    assertTrue(labelBindings.containsKey("thenumeric"));
    assertTrue(labelBindings.containsKey("theinteger"));
    assertTrue(labelBindings.containsKey("thereal"));
    assertTrue(labelBindings.containsKey("thenominal"));

    // check nominal values
    Map<String, Integer> nominalMap = model.getNominalMap().get("thenominal");
    assertNotNull(nominalMap);
    assertEquals(3, nominalMap.size());
    assertTrue(nominalMap.containsKey("double-quote"));
    assertTrue(nominalMap.containsKey("single-quote"));
    assertTrue(nominalMap.containsKey("no-quote"));

    // check data values
    Iterator<Vector> it = iterable.iterator();
    Vector vector = it.next();
    assertEquals(nominalMap.get("no-quote"), vector.get(3), EPSILON);
    assertEquals(nominalMap.get("single-quote"), it.next().get(3), EPSILON);
    assertEquals(nominalMap.get("double-quote"), it.next().get(3), EPSILON);
  }

  /**
   * Asserts that {@code iter} has another non-null vector of type {@code expectedType} whose
   * first two components equal {@code first} and {@code second} within {@code EPSILON}.
   */
  private void assertNextVector(Iterator<Vector> iter,
                                Class<? extends Vector> expectedType,
                                double first,
                                double second) {
    assertTrue(iter.hasNext());
    Vector next = iter.next();
    assertNotNull(next);
    assertTrue("Wrong instanceof", expectedType.isInstance(next));
    assertEquals(first, next.get(0), EPSILON);
    assertEquals(second, next.get(1), EPSILON);
  }

  /**
   * Iterates {@code vectors} once, asserting every element is a
   * {@link RandomAccessSparseVector}, and returns how many elements were seen.
   */
  private int countSparseVectors(Iterable<Vector> vectors) {
    int count = 0;
    for (Vector vector : vectors) {
      assertTrue("Vector is not sparse", vector instanceof RandomAccessSparseVector);
      count++;
    }
    return count;
  }

  /**
   * Shared expectations after parsing {@code non-numeric-1.arff}: one nominal attribute
   * ("bar") with 5 values, 5 typed attributes, 10 words and 1 date format.
   */
  private void assertNonNumericOneModel(Map<String, Map<String, Integer>> nominalMap,
                                        MapBackedARFFModel model) {
    assertNotNull(nominalMap);
    assertEquals(1, nominalMap.size());
    Map<String, Integer> noms = nominalMap.get("bar");
    assertNotNull("nominals for bar are null", noms);
    assertEquals(5, noms.size());
    Map<Integer, ARFFType> integerARFFTypeMap = model.getTypeMap();
    assertNotNull("Type map null", integerARFFTypeMap);
    assertEquals(5, integerARFFTypeMap.size());
    Map<String, Long> words = model.getWords();
    assertNotNull("words null", words);
    assertEquals(10, words.size());
    Map<Integer, DateFormat> integerDateFormatMap = model.getDateMap();
    assertNotNull("date format null", integerDateFormatMap);
    assertEquals(1, integerDateFormatMap.size());
  }

  /**
   * Reads the classpath resource {@code resourceName} as UTF-8 text and wraps it in an
   * {@code ARFFVectorIterable} bound to {@code model}.
   *
   * @throws IOException if the resource cannot be read
   */
  static ARFFVectorIterable getVectors(String resourceName, ARFFModel model) throws IOException {
    String sample = Resources.toString(Resources.getResource(resourceName), Charsets.UTF_8);
    return new ARFFVectorIterable(sample, model);
  }

  /** Same as {@link #getVectors(String, ARFFModel)} with a fresh {@code MapBackedARFFModel}. */
  private static ARFFVectorIterable readModelFromResource(String resourceName) throws IOException {
    ARFFModel model = new MapBackedARFFModel();
    return getVectors(resourceName, model);
  }

}

http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/test/java/org/apache/mahout/utils/vectors/arff/DriverTest.java ---------------------------------------------------------------------- diff --git a/integration/src/test/java/org/apache/mahout/utils/vectors/arff/DriverTest.java b/integration/src/test/java/org/apache/mahout/utils/vectors/arff/DriverTest.java deleted file mode 100644 index 7e7623e..0000000 --- a/integration/src/test/java/org/apache/mahout/utils/vectors/arff/DriverTest.java +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright 2013 The Apache Software Foundation. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License.
- */
package org.apache.mahout.utils.vectors.arff;

import java.io.IOException;
import java.io.StringWriter;

import com.google.common.io.Resources;
import org.apache.commons.io.Charsets;
import org.apache.mahout.common.MahoutTestCase;
import org.junit.Test;

/**
 * Test case for {@link Driver}
 */
public class DriverTest extends MahoutTestCase {

  /** Delimited label bindings must equal one of the two expected CSV resources. */
  @Test
  public void dictionary() throws IOException {
    ARFFModel model = parseSampleDense();
    StringWriter out = new StringWriter();
    Driver.writeLabelBindings(out, model, ",");

    assertMatchesEither(out,
        readResource("expected-arff-dictionary.csv"),
        readResource("expected-arff-dictionary-2.csv"));
  }

  /** JSON label bindings must equal one of the two expected JSON resources. */
  @Test
  public void dictionaryJSON() throws IOException {
    ARFFModel model = parseSampleDense();
    StringWriter out = new StringWriter();
    Driver.writeLabelBindingsJSON(out, model);

    assertMatchesEither(out,
        readResource("expected-arff-schema.json"),
        readResource("expected-arff-schema-2.json"));
  }

  /** Parses {@code sample-dense.arff} into a fresh model; the returned iterable is not needed. */
  private static ARFFModel parseSampleDense() throws IOException {
    ARFFModel model = new MapBackedARFFModel();
    ARFFVectorIterableTest.getVectors("sample-dense.arff", model);
    return model;
  }

  /** Loads a classpath resource as a UTF-8 string. */
  private static String readResource(String name) throws IOException {
    return Resources.toString(Resources.getResource(name), Charsets.UTF_8);
  }

  /** Passes when the written text equals either accepted rendering. */
  private void assertMatchesEither(StringWriter actual, String firstAccepted, String secondAccepted) {
    boolean matches = firstAccepted.equals(actual.toString());
    if (!matches) {
      matches = secondAccepted.equals(actual.toString());
    }
    assertTrue(matches);
  }
}

http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/test/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModelTest.java ---------------------------------------------------------------------- diff --git a/integration/src/test/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModelTest.java b/integration/src/test/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModelTest.java deleted file mode 100644 index 2867640..0000000
--- a/integration/src/test/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModelTest.java +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright 2013 The Apache Software Foundation. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */

package org.apache.mahout.utils.vectors.arff;

import org.apache.mahout.common.MahoutTestCase;
import org.junit.Test;

import java.util.Map;

/**
 * Tests nominal registration and numeric value parsing of {@code MapBackedARFFModel}.
 */
public class MapBackedARFFModelTest extends MahoutTestCase {

  /** Three values registered under one label end up in a single nominal-map entry. */
  @Test
  public void processNominal() {
    String label = "windy";
    String firstNominal = "breezy";

    ARFFModel model = new MapBackedARFFModel();
    model.addNominal(label, firstNominal, 77);
    model.addNominal(label, "strong", 23);
    model.addNominal(label, "nuking", 55);

    Map<String, Map<String, Integer>> byLabel = model.getNominalMap();
    assertEquals(1, byLabel.size());

    Map<String, Integer> valuesForLabel = byLabel.get(label);
    assertEquals(77, valuesForLabel.get(firstNominal).intValue());
  }

  /** A non-numeric token in a REAL column is expected to come back as NaN. */
  @Test
  public void processBadNumeric() {
    String garbage = "b1shkt70694difsmmmdv0ikmoh";

    ARFFModel model = new MapBackedARFFModel();
    model.addLabel(garbage, 77);
    model.addType(77, ARFFType.REAL);

    double parsed = model.getValue(garbage, 77);
    assertTrue(Double.isNaN(parsed));
  }

  /** Well-formed INTEGER and REAL tokens are expected to parse to exactly their numeric value. */
  @Test
  public void processGoodNumeric() {
    ARFFModel model = new MapBackedARFFModel();

    model.addLabel("1234", 77);
    model.addType(77, ARFFType.INTEGER);
    double integerValue = model.getValue("1234", 77);
    assertTrue(1234 == integerValue);

    model.addLabel("131.34", 78);
    model.addType(78, ARFFType.REAL);
    double realValue = model.getValue("131.34", 78);
    assertTrue(131.34 == realValue);
  }
}

http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/test/java/org/apache/mahout/utils/vectors/csv/CSVVectorIteratorTest.java ---------------------------------------------------------------------- diff --git a/integration/src/test/java/org/apache/mahout/utils/vectors/csv/CSVVectorIteratorTest.java b/integration/src/test/java/org/apache/mahout/utils/vectors/csv/CSVVectorIteratorTest.java deleted file mode 100644 index e76cf70..0000000 --- a/integration/src/test/java/org/apache/mahout/utils/vectors/csv/CSVVectorIteratorTest.java +++ /dev/null @@ -1,57 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * <p/> - * http://www.apache.org/licenses/LICENSE-2.0 - * <p/> - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License.
- */

package org.apache.mahout.utils.vectors.csv;

import java.io.IOException;
import java.io.StringReader;
import java.io.StringWriter;
import java.util.Iterator;

import org.apache.mahout.common.MahoutTestCase;
import org.apache.mahout.math.Vector;
import org.apache.mahout.utils.vectors.RandomVectorIterable;
import org.apache.mahout.utils.vectors.VectorHelper;
import org.apache.mahout.utils.vectors.io.TextualVectorWriter;
import org.junit.Test;

/**
 * Writes random vectors as CSV text and checks that {@code CSVVectorIterator} reads back
 * the same number of vectors.
 */
public class CSVVectorIteratorTest extends MahoutTestCase {

  /** {@link TextualVectorWriter} variant that emits each vector via {@code vectorToCSVString}. */
  private static final class CsvLineWriter extends TextualVectorWriter {

    CsvLineWriter(StringWriter sink) {
      super(sink);
    }

    @Override
    public void write(Vector vector) throws IOException {
      getWriter().write(VectorHelper.vectorToCSVString(vector, false));
    }
  }

  @Test
  public void testCount() throws Exception {
    StringWriter csvText = new StringWriter();
    try (TextualVectorWriter csvWriter = new CsvLineWriter(csvText)) {
      Iterable<Vector> randomVectors = new RandomVectorIterable(50);
      csvWriter.write(randomVectors);
    }

    Iterator<Vector> parsed = new CSVVectorIterator(new StringReader(csvText.getBuffer().toString()));
    int seen = 0;
    for (; parsed.hasNext(); seen++) {
      parsed.next();
    }
    assertEquals(50, seen);
  }
}

http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/test/java/org/apache/mahout/utils/vectors/io/VectorWriterTest.java ---------------------------------------------------------------------- diff --git a/integration/src/test/java/org/apache/mahout/utils/vectors/io/VectorWriterTest.java b/integration/src/test/java/org/apache/mahout/utils/vectors/io/VectorWriterTest.java deleted file mode 100644 index e2f7032..0000000 --- a/integration/src/test/java/org/apache/mahout/utils/vectors/io/VectorWriterTest.java +++ /dev/null @@ -1,67 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */

package org.apache.mahout.utils.vectors.io;

import java.io.StringWriter;
import java.util.ArrayList;
import java.util.Collection;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.common.MahoutTestCase;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
import org.apache.mahout.utils.vectors.RandomVectorIterable;
import org.junit.Test;

/**
 * Tests the sequence-file and textual {@code VectorWriter} implementations.
 */
public final class VectorWriterTest extends MahoutTestCase {

  /** Writes 50 random vectors to a sequence file and counts the records read back. */
  @Test
  public void testSFVW() throws Exception {
    Path target = getTestTempFilePath("sfvw");
    Configuration conf = getConfiguration();
    FileSystem fileSystem = FileSystem.get(conf);
    SequenceFile.Writer sequenceOut =
        new SequenceFile.Writer(fileSystem, conf, target, LongWritable.class, VectorWritable.class);
    try (SequenceFileVectorWriter vectorOut = new SequenceFileVectorWriter(sequenceOut)) {
      vectorOut.write(new RandomVectorIterable(50));
    }

    long recordCount = HadoopUtil.countRecords(target, conf);
    assertEquals(50, recordCount);
  }

  /** Writing two dense vectors as text must produce some (non-empty) output. */
  @Test
  public void testTextOutputSize() throws Exception {
    StringWriter sink = new StringWriter();
    try (VectorWriter textOut = new TextualVectorWriter(sink)) {
      Collection<Vector> twoVectors = new ArrayList<>();
      twoVectors.add(new DenseVector(new double[]{0.3, 1.5, 4.5}));
      twoVectors.add(new DenseVector(new double[]{1.3, 1.5, 3.5}));
      textOut.write(twoVectors);
    }
    String written = sink.toString();
    assertNotNull(written);
    assertFalse(written.isEmpty());

  }
}

http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfoTest.java ---------------------------------------------------------------------- diff --git a/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfoTest.java b/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfoTest.java deleted file mode 100644 index 890a14b..0000000 --- a/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfoTest.java +++ /dev/null @@ -1,121 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License.
- */ - -package org.apache.mahout.utils.vectors.lucene; - - -import java.io.IOException; - -import com.google.common.io.Closeables; - -import org.apache.lucene.analysis.core.WhitespaceAnalyzer; -import org.apache.lucene.document.Document; -import org.apache.lucene.document.Field; -import org.apache.lucene.document.FieldType; -import org.apache.lucene.document.StringField; -import org.apache.lucene.index.DirectoryReader; -import org.apache.lucene.index.IndexOptions; -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.IndexWriter; -import org.apache.lucene.index.IndexWriterConfig; -import org.apache.lucene.store.RAMDirectory; -import org.apache.mahout.common.MahoutTestCase; -import org.junit.Before; -import org.junit.Test; - -public class CachedTermInfoTest extends MahoutTestCase { - private RAMDirectory directory; - private static final String[] DOCS = { - "a a b b c c", - "a b a b a b a b", - "a b a", - "a", - "b", - "a", - "a" - }; - - private static final String[] DOCS2 = { - "d d d d", - "e e e e", - "d e d e", - "d", - "e", - "d", - "e" - }; - - @Before - public void before() throws IOException { - directory = new RAMDirectory(); - - FieldType fieldType = new FieldType(); - fieldType.setStored(false); - fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); - fieldType.setTokenized(true); - fieldType.setStoreTermVectors(false); - fieldType.setStoreTermVectorPositions(false); - fieldType.setStoreTermVectorOffsets(false); - fieldType.freeze(); - - directory = createTestIndex(fieldType, directory, 0); - } - - @Test - public void test() throws Exception { - IndexReader reader = DirectoryReader.open(directory); - CachedTermInfo cti = new CachedTermInfo(reader, "content", 0, 100); - assertEquals(3, cti.totalTerms("content")); - assertNotNull(cti.getTermEntry("content", "a")); - assertNull(cti.getTermEntry("content", "e")); - //minDf - cti = new CachedTermInfo(reader, "content", 3, 100); - assertEquals(2, 
cti.totalTerms("content")); - assertNotNull(cti.getTermEntry("content", "a")); - assertNull(cti.getTermEntry("content", "c")); - //maxDFPercent, a is in 6 of 7 docs: numDocs * maxDfPercent / 100 < 6 to exclude, 85% should suffice to exclude a - cti = new CachedTermInfo(reader, "content", 0, 85); - assertEquals(2, cti.totalTerms("content")); - assertNotNull(cti.getTermEntry("content", "b")); - assertNotNull(cti.getTermEntry("content", "c")); - assertNull(cti.getTermEntry("content", "a")); - - - } - - static RAMDirectory createTestIndex(FieldType fieldType, - RAMDirectory directory, - int startingId) throws IOException { - IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(new WhitespaceAnalyzer())); - - try { - for (int i = 0; i < DOCS.length; i++) { - Document doc = new Document(); - Field id = new StringField("id", "doc_" + (i + startingId), Field.Store.YES); - doc.add(id); - Field text = new Field("content", DOCS[i], fieldType); - doc.add(text); - Field text2 = new Field("content2", DOCS2[i], fieldType); - doc.add(text2); - writer.addDocument(doc); - } - } finally { - Closeables.close(writer, false); - } - return directory; - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/DriverTest.java ---------------------------------------------------------------------- diff --git a/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/DriverTest.java b/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/DriverTest.java deleted file mode 100644 index 86c8305..0000000 --- a/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/DriverTest.java +++ /dev/null @@ -1,136 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.utils.vectors.lucene; - -import com.google.common.collect.Sets; -import com.google.common.io.Closeables; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.IntWritable; -import org.apache.hadoop.io.SequenceFile; -import org.apache.hadoop.io.Text; -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.standard.StandardAnalyzer; -import org.apache.lucene.document.Document; -import org.apache.lucene.document.Field; -import org.apache.lucene.document.FieldType; -import org.apache.lucene.index.IndexOptions; -import org.apache.lucene.index.IndexWriter; -import org.apache.lucene.index.IndexWriterConfig; -import org.apache.lucene.store.Directory; -import org.apache.lucene.store.SimpleFSDirectory; -import org.apache.mahout.common.MahoutTestCase; -import org.junit.Before; -import org.junit.Test; - -import java.io.File; -import java.io.IOException; -import java.nio.file.Paths; -import java.util.Set; - -public class DriverTest extends MahoutTestCase { - - private File indexDir; - private File outputDir; - private Configuration conf; - - @Before - @Override - public void setUp() throws Exception { - super.setUp(); - indexDir = getTestTempDir("intermediate"); - indexDir.delete(); - outputDir = getTestTempDir("output"); - outputDir.delete(); - - 
conf = getConfiguration(); - } - - private Document asDocument(String line) { - Document doc = new Document(); - doc.add(new TextFieldWithTermVectors("text", line)); - return doc; - } - - static class TextFieldWithTermVectors extends Field { - - public static final FieldType TYPE = new FieldType(); - - static { - TYPE.setOmitNorms(true); - TYPE.setIndexOptions(IndexOptions.DOCS_AND_FREQS); - TYPE.setStored(true); - TYPE.setTokenized(true); - TYPE.setStoreTermVectors(true); - TYPE.freeze(); - } - - public TextFieldWithTermVectors(String name, String value) { - super(name, value, TYPE); - } - } - - @Test - public void sequenceFileDictionary() throws IOException { - - Directory index = new SimpleFSDirectory(Paths.get(indexDir.getAbsolutePath())); - Analyzer analyzer = new StandardAnalyzer(); - IndexWriterConfig config = new IndexWriterConfig(analyzer); - config.setCommitOnClose(true); - final IndexWriter writer = new IndexWriter(index, config); - - try { - writer.addDocument(asDocument("One Ring to rule them all")); - writer.addDocument(asDocument("One Ring to find them,")); - writer.addDocument(asDocument("One Ring to bring them all")); - writer.addDocument(asDocument("and in the darkness bind them")); - } finally { - writer.close(); - } - - File seqDict = new File(outputDir, "dict.seq"); - - Driver.main(new String[] { - "--dir", indexDir.getAbsolutePath(), - "--output", new File(outputDir, "out").getAbsolutePath(), - "--field", "text", - "--dictOut", new File(outputDir, "dict.txt").getAbsolutePath(), - "--seqDictOut", seqDict.getAbsolutePath(), - }); - - SequenceFile.Reader reader = null; - Set<String> indexTerms = Sets.newHashSet(); - try { - reader = new SequenceFile.Reader(FileSystem.getLocal(conf), new Path(seqDict.getAbsolutePath()), conf); - Text term = new Text(); - IntWritable termIndex = new IntWritable(); - - while (reader.next(term, termIndex)) { - indexTerms.add(term.toString()); - } - } finally { - Closeables.close(reader, true); - } - - Set<String> 
expectedIndexTerms = Sets.newHashSet("all", "bind", "bring", "darkness", "find", "one", "ring", "rule"); - - // should contain the same terms as expected - assertEquals(expectedIndexTerms.size(), Sets.union(expectedIndexTerms, indexTerms).size()); - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java ---------------------------------------------------------------------- diff --git a/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java b/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java deleted file mode 100644 index 8d92551..0000000 --- a/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java +++ /dev/null @@ -1,195 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.utils.vectors.lucene; - -import java.io.IOException; -import java.util.Iterator; - -import com.google.common.collect.Iterables; -import com.google.common.collect.Iterators; - -import org.apache.lucene.analysis.standard.StandardAnalyzer; -import org.apache.lucene.document.Document; -import org.apache.lucene.document.Field; -import org.apache.lucene.document.FieldType; -import org.apache.lucene.document.StringField; -import org.apache.lucene.index.DirectoryReader; -import org.apache.lucene.index.IndexOptions; -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.IndexWriter; -import org.apache.lucene.index.IndexWriterConfig; -import org.apache.lucene.store.RAMDirectory; -import org.apache.mahout.common.MahoutTestCase; -import org.apache.mahout.math.NamedVector; -import org.apache.mahout.math.Vector; -import org.apache.mahout.utils.vectors.TermInfo; -import org.apache.mahout.vectorizer.TFIDF; -import org.apache.mahout.vectorizer.Weight; -import org.junit.Before; -import org.junit.Test; - -public final class LuceneIterableTest extends MahoutTestCase { - - private static final String [] DOCS = { - "The quick red fox jumped over the lazy brown dogs.", - "Mary had a little lamb whose fleece was white as snow.", - "Moby Dick is a story of a whale and a man obsessed.", - "The robber wore a black fleece jacket and a baseball cap.", - "The English Springer Spaniel is the best of all dogs." 
- }; - - private RAMDirectory directory; - - private final FieldType TYPE_NO_TERM_VECTORS = new FieldType(); - - private final FieldType TYPE_TERM_VECTORS = new FieldType(); - - @Before - public void before() throws IOException { - - TYPE_NO_TERM_VECTORS.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); - TYPE_NO_TERM_VECTORS.setTokenized(true); - TYPE_NO_TERM_VECTORS.setStoreTermVectors(false); - TYPE_NO_TERM_VECTORS.setStoreTermVectorPositions(false); - TYPE_NO_TERM_VECTORS.setStoreTermVectorOffsets(false); - TYPE_NO_TERM_VECTORS.freeze(); - - TYPE_TERM_VECTORS.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); - TYPE_TERM_VECTORS.setTokenized(true); - TYPE_TERM_VECTORS.setStored(true); - TYPE_TERM_VECTORS.setStoreTermVectors(true); - TYPE_TERM_VECTORS.setStoreTermVectorPositions(true); - TYPE_TERM_VECTORS.setStoreTermVectorOffsets(true); - TYPE_TERM_VECTORS.freeze(); - - directory = createTestIndex(TYPE_TERM_VECTORS); - } - - @Test - public void testIterable() throws Exception { - IndexReader reader = DirectoryReader.open(directory); - Weight weight = new TFIDF(); - TermInfo termInfo = new CachedTermInfo(reader, "content", 1, 100); - LuceneIterable iterable = new LuceneIterable(reader, "id", "content", termInfo,weight); - - //TODO: do something more meaningful here - for (Vector vector : iterable) { - assertNotNull(vector); - assertTrue("vector is not an instanceof " + NamedVector.class, vector instanceof NamedVector); - assertTrue("vector Size: " + vector.size() + " is not greater than: " + 0, vector.size() > 0); - assertTrue(((NamedVector) vector).getName().startsWith("doc_")); - } - - iterable = new LuceneIterable(reader, "id", "content", termInfo,weight, 3); - - //TODO: do something more meaningful here - for (Vector vector : iterable) { - assertNotNull(vector); - assertTrue("vector is not an instanceof " + NamedVector.class, vector instanceof NamedVector); - assertTrue("vector Size: " + vector.size() + " is not 
greater than: " + 0, vector.size() > 0); - assertTrue(((NamedVector) vector).getName().startsWith("doc_")); - } - - } - - @Test(expected = IllegalStateException.class) - public void testIterableNoTermVectors() throws IOException { - RAMDirectory directory = createTestIndex(TYPE_NO_TERM_VECTORS); - IndexReader reader = DirectoryReader.open(directory); - - Weight weight = new TFIDF(); - TermInfo termInfo = new CachedTermInfo(reader, "content", 1, 100); - LuceneIterable iterable = new LuceneIterable(reader, "id", "content", termInfo,weight); - - Iterator<Vector> iterator = iterable.iterator(); - Iterators.advance(iterator, 1); - } - - @Test - public void testIterableSomeNoiseTermVectors() throws IOException { - //get noise vectors - RAMDirectory directory = createTestIndex(TYPE_TERM_VECTORS, new RAMDirectory(), 0); - //get real vectors - createTestIndex(TYPE_NO_TERM_VECTORS, directory, 5); - IndexReader reader = DirectoryReader.open(directory); - - Weight weight = new TFIDF(); - TermInfo termInfo = new CachedTermInfo(reader, "content", 1, 100); - - boolean exceptionThrown; - //0 percent tolerance - LuceneIterable iterable = new LuceneIterable(reader, "id", "content", termInfo, weight); - try { - Iterables.skip(iterable, Iterables.size(iterable)); - exceptionThrown = false; - } - catch(IllegalStateException ise) { - exceptionThrown = true; - } - assertTrue(exceptionThrown); - - //100 percent tolerance - iterable = new LuceneIterable(reader, "id", "content", termInfo,weight, -1, 1.0); - try { - Iterables.skip(iterable, Iterables.size(iterable)); - exceptionThrown = false; - } - catch(IllegalStateException ise) { - exceptionThrown = true; - } - assertFalse(exceptionThrown); - - //50 percent tolerance - iterable = new LuceneIterable(reader, "id", "content", termInfo,weight, -1, 0.5); - Iterator<Vector> iterator = iterable.iterator(); - Iterators.advance(iterator, 5); - - try { - Iterators.advance(iterator, Iterators.size(iterator)); - exceptionThrown = false; - } - 
catch(IllegalStateException ise) { - exceptionThrown = true; - } - assertTrue(exceptionThrown); - } - - static RAMDirectory createTestIndex(FieldType fieldType) throws IOException { - return createTestIndex(fieldType, new RAMDirectory(), 0); - } - - static RAMDirectory createTestIndex(FieldType fieldType, - RAMDirectory directory, - int startingId) throws IOException { - - try (IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(new StandardAnalyzer()))) { - for (int i = 0; i < DOCS.length; i++) { - Document doc = new Document(); - Field id = new StringField("id", "doc_" + (i + startingId), Field.Store.YES); - doc.add(id); - //Store both position and offset information - Field text = new Field("content", DOCS[i], fieldType); - doc.add(text); - Field text2 = new Field("content2", DOCS[i], fieldType); - doc.add(text2); - writer.addDocument(doc); - } - } - return directory; - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/test/resources/date.arff ---------------------------------------------------------------------- diff --git a/integration/src/test/resources/date.arff b/integration/src/test/resources/date.arff deleted file mode 100644 index 9daeb52..0000000 --- a/integration/src/test/resources/date.arff +++ /dev/null @@ -1,18 +0,0 @@ - % Comments - % - % Comments go here % - @RELATION MahoutDateTest - - @ATTRIBUTE junk NUMERIC - @ATTRIBUTE date1 date - @ATTRIBUTE date2 date "yyyy.MM.dd G 'at' HH:mm:ss z" - @ATTRIBUTE date3 date "EEE, MMM d, ''yy" - @ATTRIBUTE date4 date "K:mm a, z" - @ATTRIBUTE date5 date "yyyyy.MMMMM.dd GGG hh:mm aaa" - @ATTRIBUTE date6 date "EEE, d MMM yyyy HH:mm:ss Z" - - - - @DATA - {0 1,1 "2001-07-04T12:08:56",2 "2001.07.04 AD at 12:08:56 PDT",3 "Wed, Jul 4, '01,4 0:08 PM, PDT",4 "0:08 PM, PDT", 5 "02001.July.04 AD 12:08 PM" ,6 "Wed, 4 Jul 2001 12:08:56 -0700" } - {0 2,1 "2001-08-04T12:09:56",2 "2011.07.04 AD at 12:08:56 PDT",3 "Mon, Jul 4, '11,4 0:08 PM, PDT",4 "0:08 PM, PDT", 5 "02001.July.14 
AD 12:08 PM" ,6 "Mon, 4 Jul 2011 12:08:56 -0700" } http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/test/resources/expected-arff-dictionary-2.csv ---------------------------------------------------------------------- diff --git a/integration/src/test/resources/expected-arff-dictionary-2.csv b/integration/src/test/resources/expected-arff-dictionary-2.csv deleted file mode 100644 index acb1c43..0000000 --- a/integration/src/test/resources/expected-arff-dictionary-2.csv +++ /dev/null @@ -1,22 +0,0 @@ -Label bindings for Relation golf -temperature,1 -humidity,2 -outlook,0 -class,4 -windy,3 - -Values for nominal attributes -3 -outlook -3 -rain,3 -overcast,2 -sunny,1 -class -2 -play,2 -dont_play,1 -windy -2 -false,1 -true,2 http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/test/resources/expected-arff-dictionary.csv ---------------------------------------------------------------------- diff --git a/integration/src/test/resources/expected-arff-dictionary.csv b/integration/src/test/resources/expected-arff-dictionary.csv deleted file mode 100644 index f2dac13..0000000 --- a/integration/src/test/resources/expected-arff-dictionary.csv +++ /dev/null @@ -1,22 +0,0 @@ -Label bindings for Relation golf -humidity,2 -windy,3 -outlook,0 -class,4 -temperature,1 - -Values for nominal attributes -3 -windy -2 -true,2 -false,1 -outlook -3 -sunny,1 -overcast,2 -rain,3 -class -2 -play,2 -dont_play,1 http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/test/resources/expected-arff-schema-2.json ---------------------------------------------------------------------- diff --git a/integration/src/test/resources/expected-arff-schema-2.json b/integration/src/test/resources/expected-arff-schema-2.json deleted file mode 100644 index b73f55c..0000000 --- a/integration/src/test/resources/expected-arff-schema-2.json +++ /dev/null @@ -1 +0,0 @@ 
-[{"values":["rain","overcast","sunny"],"label":"false","attribute":"outlook","type":"categorical"},{"label":"false","attribute":"temperature","type":"numerical"},{"label":"false","attribute":"humidity","type":"numerical"},{"values":["false","true"],"label":"false","attribute":"windy","type":"categorical"},{"values":["play","dont_play"],"label":"true","attribute":"class","type":"categorical"}] \ No newline at end of file
